{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2315886984715146, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 175.375, "completions/mean_terminated_length": 175.375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.23963595181703568, "epoch": 4.631773969430292e-05, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 41958.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 1, "step_time": 21.253444358706474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 134.875, "completions/mean_terminated_length": 134.875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.2839031368494034, "epoch": 9.263547938860583e-05, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.99990736452061e-07, "loss": 0.0, "num_tokens": 68964.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2, "step_time": 16.527244716882706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 134.9375, "completions/mean_terminated_length": 134.9375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.27176588773727417, "epoch": 0.00013895321908290875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.999814729041222e-07, "loss": 0.0, "num_tokens": 91523.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3, "step_time": 15.215722694993019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 131.5, "completions/mean_terminated_length": 131.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.19801294803619385, "epoch": 0.00018527095877721167, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.999722093561833e-07, "loss": 0.0, "num_tokens": 113883.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4, "step_time": 14.227380692958832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 174.6875, "completions/mean_terminated_length": 174.6875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.3308049142360687, "epoch": 0.0002315886984715146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 9.999629458082445e-07, "loss": 0.0, "num_tokens": 147846.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 5, "step_time": 20.853050660341978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 182.6875, "completions/mean_terminated_length": 182.6875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.3976907953619957, "epoch": 0.0002779064381658175, "frac_reward_zero_std": 0.0, "grad_norm": 0.06878510117530823, "kl": 0.0, "learning_rate": 9.999536822603056e-07, "loss": 0.0094, "num_tokens": 174929.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.3415650427341461, "step": 6, "step_time": 20.855298921465874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 203.9375, "completions/mean_terminated_length": 203.9375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.4255857914686203, "epoch": 0.0003242241778601204, "frac_reward_zero_std": 1.0, "grad_norm": 0.001779966289177537, "kl": 0.002432417415548116, "learning_rate": 9.999444187123667e-07, "loss": 0.0001, "num_tokens": 202720.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 7, "step_time": 21.188715610653162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 146.125, "completions/mean_terminated_length": 146.125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.34608907252550125, "epoch": 0.00037054191755442334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007893778383731842, "kl": 0.0012565676588565111, "learning_rate": 9.99935155164428e-07, "loss": 0.0001, "num_tokens": 226850.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 8, "step_time": 17.182187285274267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 126.6875, "completions/mean_terminated_length": 126.6875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2665010169148445, "epoch": 0.00041685965724872626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009253919124603271, "kl": 0.0012993732816539705, "learning_rate": 9.999258916164892e-07, "loss": 0.0001, "num_tokens": 248829.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 9, "step_time": 14.068961184471846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 161.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.38808972388505936, "epoch": 0.0004631773969430292, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020301910117268562, "kl": 0.002197462454205379, "learning_rate": 9.999166280685503e-07, "loss": 0.0001, "num_tokens": 297489.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 10, "step_time": 22.830876268446445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 199.0, "completions/mean_terminated_length": 199.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.16765369474887848, "epoch": 0.0005094951366373321, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005631350795738399, "kl": 0.0007857431482989341, "learning_rate": 9.999073645206112e-07, "loss": 0.0, "num_tokens": 320465.0, "reward": 0.9084742069244385, "reward_std": 0.0, "rewards/reward_func/mean": 0.9084742069244385, "rewards/reward_func/std": 0.0, "step": 11, "step_time": 19.6044539809227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 148.5, "completions/mean_terminated_length": 148.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3356877490878105, "epoch": 0.000555812876331635, "frac_reward_zero_std": 1.0, "grad_norm": 0.001956203021109104, "kl": 0.0015761206741444767, "learning_rate": 9.998981009726725e-07, "loss": 0.0001, "num_tokens": 341017.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 12, "step_time": 16.55023478344083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 237.375, "completions/mean_terminated_length": 237.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.2522463947534561, "epoch": 0.0006021306160259379, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005217111902311444, "kl": 0.0008352900767931715, "learning_rate": 9.998888374247337e-07, "loss": 0.0, "num_tokens": 363679.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 13, "step_time": 22.73781155049801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 173.125, "completions/mean_terminated_length": 173.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.35162051767110825, "epoch": 0.0006484483557202408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006630179705098271, "kl": 0.0011639624572126195, "learning_rate": 9.998795738767948e-07, "loss": 0.0001, "num_tokens": 386353.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 14, "step_time": 19.6433484852314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 239.5625, "completions/mean_terminated_length": 239.5625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.16075202450156212, "epoch": 0.0006947660954145438, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010145703563466668, "kl": 0.0008467905863653868, "learning_rate": 9.99870310328856e-07, "loss": 0.0, "num_tokens": 411306.0, "reward": 0.7784501910209656, "reward_std": 0.0, "rewards/reward_func/mean": 0.7784501910209656, "rewards/reward_func/std": 0.0, "step": 15, "step_time": 22.901156540960073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 125.625, "completions/mean_terminated_length": 125.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2599363848567009, "epoch": 0.0007410838351088467, "frac_reward_zero_std": 1.0, "grad_norm": 0.00056539720389992, "kl": 0.0009439134591957554, "learning_rate": 9.99861046780917e-07, "loss": 0.0, "num_tokens": 434612.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 16, "step_time": 14.680199645459652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 130.8125, "completions/mean_terminated_length": 130.8125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2634947672486305, "epoch": 0.0007874015748031496, "frac_reward_zero_std": 1.0, "grad_norm": 0.024002104997634888, "kl": 0.003976634790888056, "learning_rate": 9.998517832329782e-07, "loss": 0.0002, "num_tokens": 456801.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 17, "step_time": 14.279331889003515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 163.1875, "completions/mean_terminated_length": 163.1875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4323969930410385, "epoch": 0.0008337193144974525, "frac_reward_zero_std": 1.0, "grad_norm": 0.002135026967152953, "kl": 0.0015527470386587083, "learning_rate": 9.998425196850393e-07, "loss": 0.0001, "num_tokens": 492628.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 18, "step_time": 21.967232834547758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 211.75, "completions/mean_terminated_length": 211.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.13712666556239128, "epoch": 0.0008800370541917554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003543617494869977, "kl": 0.0005839392761117779, "learning_rate": 9.998332561371004e-07, "loss": 0.0, "num_tokens": 519328.0, "reward": 0.7462587356567383, "reward_std": 0.0, "rewards/reward_func/mean": 0.7462587356567383, "rewards/reward_func/std": 0.0, "step": 19, "step_time": 21.239732574671507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.24722861126065254, "epoch": 0.0009263547938860583, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014625731855630875, "kl": 0.0013496327155735344, "learning_rate": 9.998239925891615e-07, "loss": 0.0001, "num_tokens": 538920.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 20, "step_time": 13.600695561617613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 124.1875, "completions/mean_terminated_length": 124.1875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.27628303319215775, "epoch": 0.0009726725335803613, "frac_reward_zero_std": 1.0, "grad_norm": 0.001170438015833497, "kl": 0.001314763183472678, "learning_rate": 9.998147290412229e-07, "loss": 0.0001, "num_tokens": 561419.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 21, "step_time": 14.672110460698605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 153.1875, "completions/mean_terminated_length": 153.1875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.1855221688747406, "epoch": 0.0010189902732746642, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012163659557700157, "kl": 0.0011993444641120732, "learning_rate": 9.99805465493284e-07, "loss": 0.0001, "num_tokens": 583918.0, "reward": 0.3678794503211975, "reward_std": 0.0, "rewards/reward_func/mean": 0.3678794503211975, "rewards/reward_func/std": 0.0, "step": 22, "step_time": 17.95040387660265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 201.0625, "completions/mean_terminated_length": 201.0625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.23063024878501892, "epoch": 0.001065308012968967, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008595545659773052, "kl": 0.0009587571257725358, "learning_rate": 9.99796201945345e-07, "loss": 0.0, "num_tokens": 621839.0, "reward": 0.11362193524837494, "reward_std": 0.0, "rewards/reward_func/mean": 0.11362193524837494, "rewards/reward_func/std": 0.0, "step": 23, "step_time": 23.448596190661192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 247.9375, "completions/mean_terminated_length": 247.9375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.19920672848820686, "epoch": 0.00111162575266327, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005527918692678213, "kl": 0.0008416420023422688, "learning_rate": 9.99786938397406e-07, "loss": 0.0, "num_tokens": 660942.0, "reward": 0.8787640929222107, "reward_std": 0.0, "rewards/reward_func/mean": 0.8787640929222107, "rewards/reward_func/std": 0.0, "step": 24, "step_time": 28.79375683888793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 216.5, "completions/mean_terminated_length": 216.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.34480341523885727, "epoch": 0.001157943492357573, "frac_reward_zero_std": 0.0, "grad_norm": 0.057788483798503876, "kl": 0.0012676734913839027, "learning_rate": 9.997776748494674e-07, "loss": -0.1737, "num_tokens": 682742.0, "reward": 0.24754419922828674, "reward_std": 0.4429076910018921, "rewards/reward_func/mean": 0.24754419922828674, "rewards/reward_func/std": 0.4429076910018921, "step": 25, "step_time": 24.72364231571555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.38395222276449203, "epoch": 0.0012042612320518759, "frac_reward_zero_std": 0.0, "grad_norm": 0.09764150530099869, "kl": 0.001582463999511674, "learning_rate": 9.997684113015285e-07, "loss": -0.0923, "num_tokens": 711834.0, "reward": 0.01261853240430355, "reward_std": 0.0504741296172142, "rewards/reward_func/mean": 0.01261853240430355, "rewards/reward_func/std": 0.0504741333425045, "step": 26, "step_time": 22.57013550028205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 169.375, "completions/mean_terminated_length": 169.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3620755672454834, "epoch": 0.0012505789717461788, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007554744952358305, "kl": 0.0011183345050085336, "learning_rate": 9.997591477535896e-07, "loss": 0.0001, "num_tokens": 741312.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 27, "step_time": 19.41274269670248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 110.1875, "completions/mean_terminated_length": 110.1875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2566985860466957, "epoch": 0.0012968967114404817, "frac_reward_zero_std": 1.0, "grad_norm": 0.001454993849620223, "kl": 0.0015755675849504769, "learning_rate": 9.997498842056508e-07, "loss": 0.0001, "num_tokens": 761507.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 28, "step_time": 12.01841538771987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 171.0, "completions/mean_terminated_length": 171.0, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.2279587686061859, "epoch": 0.0013432144511347846, "frac_reward_zero_std": 0.0, "grad_norm": 0.07249391824007034, "kl": 0.0012631421268451959, "learning_rate": 9.997406206577119e-07, "loss": -0.0367, "num_tokens": 790355.0, "reward": 0.8854333758354187, "reward_std": 0.20494304597377777, "rewards/reward_func/mean": 0.8854333758354187, "rewards/reward_func/std": 0.20494303107261658, "step": 29, "step_time": 20.076055269688368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 110.0, "completions/mean_terminated_length": 110.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.27383508533239365, "epoch": 0.0013895321908290875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010895292507484555, "kl": 0.0013592140458058566, "learning_rate": 9.99731357109773e-07, "loss": 0.0001, "num_tokens": 810163.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 30, "step_time": 12.570972047746181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 153.3125, "completions/mean_terminated_length": 153.3125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.26810621470212936, "epoch": 0.0014358499305233904, "frac_reward_zero_std": 1.0, "grad_norm": 0.00047116258065216243, "kl": 0.0006794522632844746, "learning_rate": 9.997220935618341e-07, "loss": 0.0, "num_tokens": 832680.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 31, "step_time": 16.420401941984892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 186.5625, "completions/mean_terminated_length": 186.5625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.1992652639746666, "epoch": 0.0014821676702176934, "frac_reward_zero_std": 1.0, "grad_norm": 0.00044166221050545573, "kl": 0.0008906413859222084, "learning_rate": 9.997128300138952e-07, "loss": 0.0, "num_tokens": 855585.0, "reward": 0.795669436454773, "reward_std": 0.0, "rewards/reward_func/mean": 0.795669436454773, "rewards/reward_func/std": 0.0, "step": 32, "step_time": 18.911770571023226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 282.375, "completions/mean_terminated_length": 282.375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.24813947081565857, "epoch": 0.0015284854099119963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003291643224656582, "kl": 0.0007134305778890848, "learning_rate": 9.997035664659564e-07, "loss": 0.0, "num_tokens": 883431.0, "reward": 0.8668729066848755, "reward_std": 0.0, "rewards/reward_func/mean": 0.8668729066848755, "rewards/reward_func/std": 0.0, "step": 33, "step_time": 26.373576171696186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 216.625, "completions/mean_terminated_length": 216.625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.4462438002228737, "epoch": 0.0015748031496062992, "frac_reward_zero_std": 0.0, "grad_norm": 0.05519348755478859, "kl": 0.001218096585944295, "learning_rate": 9.996943029180175e-07, "loss": 0.0004, "num_tokens": 909393.0, "reward": 0.16718508303165436, "reward_std": 0.29906976222991943, "rewards/reward_func/mean": 0.16718508303165436, "rewards/reward_func/std": 0.2990697920322418, "step": 34, "step_time": 26.19732155278325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 168.75, "completions/mean_terminated_length": 168.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.20621728152036667, "epoch": 0.001621120889300602, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017018754733726382, "kl": 0.0012577880843309686, "learning_rate": 9.996850393700786e-07, "loss": 0.0001, "num_tokens": 930877.0, "reward": 0.951229453086853, "reward_std": 0.0, "rewards/reward_func/mean": 0.951229453086853, "rewards/reward_func/std": 0.0, "step": 35, "step_time": 19.29100165143609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 179.875, "completions/mean_terminated_length": 179.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.14232603088021278, "epoch": 0.001667438628994905, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003708863805513829, "kl": 0.0005206533242017031, "learning_rate": 9.996757758221397e-07, "loss": 0.0, "num_tokens": 963595.0, "reward": 0.8702397346496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.8702397346496582, "rewards/reward_func/std": 0.0, "step": 36, "step_time": 21.33883025869727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 237.375, "completions/mean_terminated_length": 237.375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.20478082448244095, "epoch": 0.001713756368689208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004962633247487247, "kl": 0.0007221920968731865, "learning_rate": 9.996665122742009e-07, "loss": 0.0, "num_tokens": 987441.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 37, "step_time": 22.624291632324457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 284.375, "completions/mean_terminated_length": 284.375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.21290477737784386, "epoch": 0.0017600741083835109, "frac_reward_zero_std": 0.0, "grad_norm": 0.04010778293013573, "kl": 0.0008870865567587316, "learning_rate": 9.996572487262622e-07, "loss": -0.0015, "num_tokens": 1020295.0, "reward": 0.9311375617980957, "reward_std": 0.009968340396881104, "rewards/reward_func/mean": 0.9311375617980957, "rewards/reward_func/std": 0.009968344122171402, "step": 38, "step_time": 32.43071475997567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 171.8125, "completions/mean_terminated_length": 171.8125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.4042629078030586, "epoch": 0.0018063918480778138, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006234828033484519, "kl": 0.0012812706409022212, "learning_rate": 9.996479851783233e-07, "loss": 0.0001, "num_tokens": 1061252.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 39, "step_time": 23.445143539458513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 187.625, "completions/mean_terminated_length": 187.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.4109330251812935, "epoch": 0.0018527095877721167, "frac_reward_zero_std": 0.0, "grad_norm": 0.06005103141069412, "kl": 0.0015019657730590552, "learning_rate": 9.996387216303845e-07, "loss": 0.0871, "num_tokens": 1083774.0, "reward": 0.4600222110748291, "reward_std": 0.47510889172554016, "rewards/reward_func/mean": 0.4600222110748291, "rewards/reward_func/std": 0.47510892152786255, "step": 40, "step_time": 21.93445473909378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 254.4375, "completions/mean_terminated_length": 254.4375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.19949688762426376, "epoch": 0.0018990273274664196, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003561973571777344, "kl": 0.0006408548942999914, "learning_rate": 9.996294580824454e-07, "loss": 0.0, "num_tokens": 1118309.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 41, "step_time": 27.413367446511984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 246.625, "completions/mean_terminated_length": 246.625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.3479044884443283, "epoch": 0.0019453450671607225, "frac_reward_zero_std": 0.0, "grad_norm": 0.05840029940009117, "kl": 0.0015336884825956076, "learning_rate": 9.996201945345067e-07, "loss": -0.1468, "num_tokens": 1145231.0, "reward": 0.6128644347190857, "reward_std": 0.4902915358543396, "rewards/reward_func/mean": 0.6128644347190857, "rewards/reward_func/std": 0.490291565656662, "step": 42, "step_time": 28.557862129062414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 173.25, "completions/mean_terminated_length": 173.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.29058726504445076, "epoch": 0.0019916628068550254, "frac_reward_zero_std": 0.0, "grad_norm": 0.07597646862268448, "kl": 0.0010675753292161971, "learning_rate": 9.996109309865678e-07, "loss": 0.0587, "num_tokens": 1165987.0, "reward": 0.2719818353652954, "reward_std": 0.07377496361732483, "rewards/reward_func/mean": 0.2719818353652954, "rewards/reward_func/std": 0.07377497106790543, "step": 43, "step_time": 18.770597979426384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 223.625, "completions/mean_terminated_length": 223.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.25469207763671875, "epoch": 0.0020379805465493284, "frac_reward_zero_std": 1.0, "grad_norm": 0.000462218071334064, "kl": 0.0010333134850952774, "learning_rate": 9.99601667438629e-07, "loss": 0.0001, "num_tokens": 1195453.0, "reward": 0.30568957328796387, "reward_std": 0.0, "rewards/reward_func/mean": 0.30568957328796387, "rewards/reward_func/std": 0.0, "step": 44, "step_time": 23.699180126190186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 128.625, "completions/mean_terminated_length": 128.625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.32476720958948135, "epoch": 0.0020842982862436313, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007233992218971252, "kl": 0.0013380103919189423, "learning_rate": 9.9959240389069e-07, "loss": 0.0001, "num_tokens": 1216535.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 45, "step_time": 15.907741725444794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.46384233236312866, "epoch": 0.002130616025937934, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005010095192119479, "kl": 0.0013451931299641728, "learning_rate": 9.995831403427512e-07, "loss": 0.0001, "num_tokens": 1243331.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 46, "step_time": 23.114768505096436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 229.3125, "completions/mean_terminated_length": 229.3125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.31266774982213974, "epoch": 0.002176933765632237, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006749372696503997, "kl": 0.0010217149829259142, "learning_rate": 9.995738767948123e-07, "loss": 0.0001, "num_tokens": 1271976.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 47, "step_time": 24.77662756666541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 178.1875, "completions/mean_terminated_length": 178.1875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.32404256612062454, "epoch": 0.00222325150532654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008625648333691061, "kl": 0.0012001528666587546, "learning_rate": 9.995646132468735e-07, "loss": 0.0001, "num_tokens": 1303483.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 48, "step_time": 21.40724455565214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 116.3125, "completions/mean_terminated_length": 116.3125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.29034823924303055, "epoch": 0.002269569245020843, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007067320984788239, "kl": 0.0011181181180290878, "learning_rate": 9.995553496989346e-07, "loss": 0.0001, "num_tokens": 1324576.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 49, "step_time": 14.518561020493507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 136.5, "completions/mean_terminated_length": 136.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3400954380631447, "epoch": 0.002315886984715146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008458670345135033, "kl": 0.0010712836519815028, "learning_rate": 9.995460861509957e-07, "loss": 0.0001, "num_tokens": 1360472.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 50, "step_time": 17.930652901530266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 174.125, "completions/mean_terminated_length": 174.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.27026674151420593, "epoch": 0.002362204724409449, "frac_reward_zero_std": 0.0, "grad_norm": 0.06760480254888535, "kl": 0.0009809674374992028, "learning_rate": 9.995368226030568e-07, "loss": -0.0271, "num_tokens": 1391882.0, "reward": 0.9607253074645996, "reward_std": 0.027347400784492493, "rewards/reward_func/mean": 0.9607253074645996, "rewards/reward_func/std": 0.027347411960363388, "step": 51, "step_time": 19.771480850875378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 219.8125, "completions/mean_terminated_length": 219.8125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.4704762101173401, "epoch": 0.0024085224641037517, "frac_reward_zero_std": 1.0, "grad_norm": 0.003276088973507285, "kl": 0.0027438767137937248, "learning_rate": 9.995275590551182e-07, "loss": 0.0001, "num_tokens": 1420887.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 52, "step_time": 31.42541616410017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 193.3125, "completions/mean_terminated_length": 193.3125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.41140784323215485, "epoch": 0.0024548402037980546, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007755675469525158, "kl": 0.0015211344871204346, "learning_rate": 9.995182955071793e-07, "loss": 0.0001, "num_tokens": 1468060.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 53, "step_time": 25.38789566233754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 167.9375, "completions/mean_terminated_length": 167.9375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.27289799973368645, "epoch": 0.0025011579434923575, "frac_reward_zero_std": 0.0, "grad_norm": 0.08337947726249695, "kl": 0.0012199645425425842, "learning_rate": 9.995090319592402e-07, "loss": 0.0885, "num_tokens": 1491323.0, "reward": 0.7094695568084717, "reward_std": 0.42304593324661255, "rewards/reward_func/mean": 0.7094695568084717, "rewards/reward_func/std": 0.42304593324661255, "step": 54, "step_time": 19.423756692558527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 204.5625, "completions/mean_terminated_length": 204.5625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.3171669542789459, "epoch": 0.0025474756831866605, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005079456022940576, "kl": 0.0010534442990319803, "learning_rate": 9.994997684113015e-07, "loss": 0.0001, "num_tokens": 1514996.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 55, "step_time": 21.686194479465485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 175.3125, "completions/mean_terminated_length": 175.3125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.20267057418823242, "epoch": 0.0025937934228809634, "frac_reward_zero_std": 0.0, "grad_norm": 0.09139242023229599, "kl": 0.001020463663735427, "learning_rate": 9.994905048633627e-07, "loss": -0.0276, "num_tokens": 1552345.0, "reward": 0.873737096786499, "reward_std": 0.08094866573810577, "rewards/reward_func/mean": 0.873737096786499, "rewards/reward_func/std": 0.08094867318868637, "step": 56, "step_time": 22.00080531463027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 220.125, "completions/mean_terminated_length": 220.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.30425940454006195, "epoch": 0.0026401111625752663, "frac_reward_zero_std": 0.0, "grad_norm": 0.04377205669879913, "kl": 0.0012595587322721258, "learning_rate": 9.994812413154238e-07, "loss": 0.0086, "num_tokens": 1575275.0, "reward": 0.0071022482588887215, "reward_std": 0.0018939328147098422, "rewards/reward_func/mean": 0.0071022482588887215, "rewards/reward_func/std": 0.001893932931125164, "step": 57, "step_time": 22.13108843192458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 265.375, "completions/mean_terminated_length": 265.375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.2432107925415039, "epoch": 0.002686428902269569, "frac_reward_zero_std": 0.0, "grad_norm": 0.08007922023534775, "kl": 0.001151241856859997, "learning_rate": 9.99471977767485e-07, "loss": -0.0175, "num_tokens": 1600033.0, "reward": 0.9758262634277344, "reward_std": 0.043243277817964554, "rewards/reward_func/mean": 0.9758262634277344, "rewards/reward_func/std": 0.04324327036738396, "step": 58, "step_time": 25.022456251084805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 173.1875, "completions/mean_terminated_length": 173.1875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.25657934695482254, "epoch": 0.002732746641963872, "frac_reward_zero_std": 0.0, "grad_norm": 0.18090035021305084, "kl": 0.0015526409697486088, "learning_rate": 9.99462714219546e-07, "loss": -0.0344, "num_tokens": 1623444.0, "reward": 0.0714418813586235, "reward_std": 0.02954091690480709, "rewards/reward_func/mean": 0.0714418813586235, "rewards/reward_func/std": 0.02954091690480709, "step": 59, "step_time": 19.224395401775837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 197.25, "completions/mean_terminated_length": 197.25, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.2592005953192711, "epoch": 0.002779064381658175, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006600880296900868, "kl": 0.0008924548455979675, "learning_rate": 9.994534506716072e-07, "loss": 0.0, "num_tokens": 1648888.0, "reward": 0.9555630087852478, "reward_std": 0.0, "rewards/reward_func/mean": 0.9555630087852478, "rewards/reward_func/std": 0.0, "step": 60, "step_time": 23.297641325742006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 156.4375, "completions/mean_terminated_length": 156.4375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.41588329523801804, "epoch": 0.002825382121352478, "frac_reward_zero_std": 1.0, "grad_norm": 0.001193221309222281, "kl": 0.0017402568482793868, "learning_rate": 9.994441871236683e-07, "loss": 0.0001, "num_tokens": 1684431.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 61, "step_time": 19.046519339084625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 121.0, "completions/mean_terminated_length": 121.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2954227104783058, "epoch": 0.002871699861046781, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009379137773066759, "kl": 0.001311119005549699, "learning_rate": 9.994349235757294e-07, "loss": 0.0001, "num_tokens": 1705695.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 62, "step_time": 14.727204084396362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 167.25, "completions/mean_terminated_length": 167.25, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.39179327338933945, "epoch": 0.002918017600741084, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009734238847158849, "kl": 0.0014574054221156985, "learning_rate": 9.994256600277905e-07, "loss": 0.0001, "num_tokens": 1739715.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 63, "step_time": 20.273111023008823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 181.8125, "completions/mean_terminated_length": 181.8125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.24816354736685753, "epoch": 0.0029643353404353867, "frac_reward_zero_std": 0.0, "grad_norm": 0.06950819492340088, "kl": 0.0007540240039816126, "learning_rate": 9.994163964798517e-07, "loss": -0.0679, "num_tokens": 1762288.0, "reward": 0.30373460054397583, "reward_std": 0.16260090470314026, "rewards/reward_func/mean": 0.30373460054397583, "rewards/reward_func/std": 0.16260090470314026, "step": 64, "step_time": 18.781805235892534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 118.125, "completions/mean_terminated_length": 118.125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.31709306687116623, "epoch": 0.0030106530801296896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012367722811177373, "kl": 0.0016394878621213138, "learning_rate": 9.99407132931913e-07, "loss": 0.0001, "num_tokens": 1786210.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 65, "step_time": 16.068804062902927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 138.625, "completions/mean_terminated_length": 138.625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3293949067592621, "epoch": 0.0030569708198239925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013366470811888576, "kl": 0.0013818318548146635, "learning_rate": 9.99397869383974e-07, "loss": 0.0001, "num_tokens": 1816860.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 66, "step_time": 17.489098727703094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 172.875, "completions/mean_terminated_length": 172.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.3310137316584587, "epoch": 0.0031032885595182955, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006151251727715135, "kl": 0.0010024872026406229, "learning_rate": 9.99388605836035e-07, "loss": 0.0001, "num_tokens": 1838986.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 67, "step_time": 18.4479684792459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 130.8125, "completions/mean_terminated_length": 130.8125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.29231955111026764, "epoch": 0.0031496062992125984, "frac_reward_zero_std": 1.0, "grad_norm": 0.001049146754667163, "kl": 0.0010573577746981755, "learning_rate": 9.993793422880964e-07, "loss": 0.0001, "num_tokens": 1864583.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 68, "step_time": 15.196908507496119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 152.1875, "completions/mean_terminated_length": 152.1875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.4232371523976326, "epoch": 0.0031959240389069013, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007615491631440818, "kl": 0.001500288664828986, "learning_rate": 9.993700787401575e-07, "loss": 0.0001, "num_tokens": 1917050.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 69, "step_time": 23.43141169473529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 198.5, "completions/mean_terminated_length": 198.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.21531255170702934, "epoch": 0.003242241778601204, "frac_reward_zero_std": 0.0, "grad_norm": 0.08702373504638672, "kl": 0.0010492683068150654, "learning_rate": 9.993608151922186e-07, "loss": 0.0175, "num_tokens": 1954658.0, "reward": 0.9878142476081848, "reward_std": 0.033297814428806305, "rewards/reward_func/mean": 0.9878142476081848, "rewards/reward_func/std": 0.033297814428806305, "step": 70, "step_time": 24.59629587084055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 122.25, "completions/mean_terminated_length": 122.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2578287646174431, "epoch": 0.003288559518295507, "frac_reward_zero_std": 1.0, "grad_norm": 0.000850670738145709, "kl": 0.001109077871660702, "learning_rate": 9.993515516442798e-07, "loss": 0.0001, "num_tokens": 1975414.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 71, "step_time": 13.448259372264147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 146.6875, "completions/mean_terminated_length": 146.6875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3487822636961937, "epoch": 0.00333487725798981, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007427554810419679, "kl": 0.001320757990470156, "learning_rate": 9.993422880963409e-07, "loss": 0.0001, "num_tokens": 1997297.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 72, "step_time": 16.248118489980698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 171.4375, "completions/mean_terminated_length": 171.4375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.2556117959320545, "epoch": 0.003381194997684113, "frac_reward_zero_std": 0.0, "grad_norm": 0.057933662086725235, "kl": 0.001071646562195383, "learning_rate": 9.99333024548402e-07, "loss": -0.077, "num_tokens": 2020696.0, "reward": 0.1669032722711563, "reward_std": 0.07607479393482208, "rewards/reward_func/mean": 0.1669032722711563, "rewards/reward_func/std": 0.07607479393482208, "step": 73, "step_time": 20.15308902412653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 191.3125, "completions/mean_terminated_length": 191.3125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.2702726535499096, "epoch": 0.003427512737378416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004334551631473005, "kl": 0.0007109080324880779, "learning_rate": 9.993237610004631e-07, "loss": 0.0, "num_tokens": 2045085.0, "reward": 0.6227038502693176, "reward_std": 0.0, "rewards/reward_func/mean": 0.6227038502693176, "rewards/reward_func/std": 0.0, "step": 74, "step_time": 20.211932979524136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 212.5625, "completions/mean_terminated_length": 212.5625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.44178541004657745, "epoch": 0.003473830477072719, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010382463224232197, "kl": 0.0013108167913742363, "learning_rate": 9.993144974525243e-07, "loss": -0.0006, "num_tokens": 2082582.0, "reward": 5.98188080402906e-07, "reward_std": 1.0700713346523116e-06, "rewards/reward_func/mean": 5.98188080402906e-07, "rewards/reward_func/std": 1.0700713346523116e-06, "step": 75, "step_time": 28.365940377116203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 151.8125, "completions/mean_terminated_length": 151.8125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.4370253086090088, "epoch": 0.0035201482167670217, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005119937704876065, "kl": 0.0011659206065814942, "learning_rate": 9.993052339045854e-07, "loss": 0.0001, "num_tokens": 2104723.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 76, "step_time": 16.270573265850544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 186.5625, "completions/mean_terminated_length": 186.5625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.14407925307750702, "epoch": 0.0035664659564613246, "frac_reward_zero_std": 0.0, "grad_norm": 0.06740093976259232, "kl": 0.0008118026380543597, "learning_rate": 9.992959703566465e-07, "loss": -0.0353, "num_tokens": 2141004.0, "reward": 0.31709614396095276, "reward_std": 0.3274954855442047, "rewards/reward_func/mean": 0.31709614396095276, "rewards/reward_func/std": 0.3274955153465271, "step": 77, "step_time": 22.43112090975046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 150.5625, "completions/mean_terminated_length": 150.5625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.2840983420610428, "epoch": 0.0036127836961556276, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011623201426118612, "kl": 0.0012132947449572384, "learning_rate": 9.992867068087076e-07, "loss": 0.0001, "num_tokens": 2177349.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 78, "step_time": 18.93928075954318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 176.3125, "completions/mean_terminated_length": 176.3125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.4052634462714195, "epoch": 0.0036591014358499305, "frac_reward_zero_std": 1.0, "grad_norm": 0.00051750527927652, "kl": 0.001296806090977043, "learning_rate": 9.992774432607688e-07, "loss": 0.0001, "num_tokens": 2198186.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 79, "step_time": 19.795941203832626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 123.75, "completions/mean_terminated_length": 123.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.27210986614227295, "epoch": 0.0037054191755442334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011587308254092932, "kl": 0.0010845881624845788, "learning_rate": 9.992681797128299e-07, "loss": 0.0001, "num_tokens": 2220902.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 80, "step_time": 14.425421085208654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 190.8125, "completions/mean_terminated_length": 190.8125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.2371511086821556, "epoch": 0.0037517369152385363, "frac_reward_zero_std": 0.0, "grad_norm": 0.05840691179037094, "kl": 0.0011019718513125554, "learning_rate": 9.99258916164891e-07, "loss": -0.0436, "num_tokens": 2252787.0, "reward": 0.9001584053039551, "reward_std": 0.05953400582075119, "rewards/reward_func/mean": 0.9001584053039551, "rewards/reward_func/std": 0.05953400954604149, "step": 81, "step_time": 21.62469592690468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 185.9375, "completions/mean_terminated_length": 185.9375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.3472830802202225, "epoch": 0.0037980546549328392, "frac_reward_zero_std": 1.0, "grad_norm": 0.001633723615668714, "kl": 0.0014273902197601274, "learning_rate": 9.992496526169523e-07, "loss": 0.0001, "num_tokens": 2283138.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 82, "step_time": 20.354945544153452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 152.9375, "completions/mean_terminated_length": 152.9375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.4043729901313782, "epoch": 0.003844372394627142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005513399373739958, "kl": 0.001414911908796057, "learning_rate": 9.992403890690135e-07, "loss": 0.0001, "num_tokens": 2321425.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 83, "step_time": 20.47370319440961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 158.5, "completions/mean_terminated_length": 158.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.24156905710697174, "epoch": 0.003890690134321445, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008328308467753232, "kl": 0.0012390459014568478, "learning_rate": 9.992311255210746e-07, "loss": 0.0001, "num_tokens": 2342617.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 84, "step_time": 16.063272561877966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 228.125, "completions/mean_terminated_length": 228.125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.28923826664686203, "epoch": 0.003937007874015748, "frac_reward_zero_std": 0.0, "grad_norm": 0.2853640615940094, "kl": 0.002009062358411029, "learning_rate": 9.992218619731357e-07, "loss": -0.0715, "num_tokens": 2376203.0, "reward": 0.5002837181091309, "reward_std": 0.29904577136039734, "rewards/reward_func/mean": 0.5002837181091309, "rewards/reward_func/std": 0.29904577136039734, "step": 85, "step_time": 28.082806132733822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 158.8125, "completions/mean_terminated_length": 158.8125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.1473250687122345, "epoch": 0.003983325613710051, "frac_reward_zero_std": 0.0, "grad_norm": 0.06612526625394821, "kl": 0.0007407844095723704, "learning_rate": 9.992125984251968e-07, "loss": -0.063, "num_tokens": 2398520.0, "reward": 0.9102447628974915, "reward_std": 0.06027979403734207, "rewards/reward_func/mean": 0.9102447628974915, "rewards/reward_func/std": 0.06027979776263237, "step": 86, "step_time": 19.420553267002106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 177.9375, "completions/mean_terminated_length": 177.9375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3603175953030586, "epoch": 0.004029643353404354, "frac_reward_zero_std": 0.0, "grad_norm": 0.08079643547534943, "kl": 0.0015568426460959017, "learning_rate": 9.99203334877258e-07, "loss": -0.0448, "num_tokens": 2427959.0, "reward": 0.007332447916269302, "reward_std": 0.02932979166507721, "rewards/reward_func/mean": 0.007332447916269302, "rewards/reward_func/std": 0.02932979352772236, "step": 87, "step_time": 29.163668405264616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 116.0625, "completions/mean_terminated_length": 116.0625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.28036900609731674, "epoch": 0.004075961093098657, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006897732964716852, "kl": 0.001202350395033136, "learning_rate": 9.99194071329319e-07, "loss": 0.0001, "num_tokens": 2449688.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 88, "step_time": 12.958728298544884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 130.0, "completions/mean_terminated_length": 130.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.31368735432624817, "epoch": 0.00412227883279296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016356243286281824, "kl": 0.0013772174424957484, "learning_rate": 9.991848077813802e-07, "loss": 0.0001, "num_tokens": 2476584.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 89, "step_time": 16.863355983048677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 131.25, "completions/mean_terminated_length": 131.25, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.20497016608715057, "epoch": 0.0041685965724872626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011138824047520757, "kl": 0.0009810531337279826, "learning_rate": 9.991755442334413e-07, "loss": 0.0, "num_tokens": 2496108.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 90, "step_time": 13.149852402508259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 125.75, "completions/mean_terminated_length": 125.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.23887991160154343, "epoch": 0.0042149143121815655, "frac_reward_zero_std": 1.0, "grad_norm": 0.0040979571640491486, "kl": 0.0019737385446205735, "learning_rate": 9.991662806855025e-07, "loss": 0.0001, "num_tokens": 2515560.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 91, "step_time": 13.428842436522245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 292.375, "completions/mean_terminated_length": 292.375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.16698342561721802, "epoch": 0.004261232051875868, "frac_reward_zero_std": 1.0, "grad_norm": 0.000315846991725266, "kl": 0.0005647511134156957, "learning_rate": 9.991570171375636e-07, "loss": 0.0, "num_tokens": 2544798.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 92, "step_time": 28.36004311963916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 172.1875, "completions/mean_terminated_length": 172.1875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.3940913528203964, "epoch": 0.004307549791570171, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008397761266678572, "kl": 0.0015314221964217722, "learning_rate": 9.991477535896247e-07, "loss": 0.0001, "num_tokens": 2595377.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 93, "step_time": 24.672285731881857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 186.3125, "completions/mean_terminated_length": 186.3125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.44038259983062744, "epoch": 0.004353867531264474, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005806896951980889, "kl": 0.0015431945794261992, "learning_rate": 9.991384900416858e-07, "loss": 0.0001, "num_tokens": 2618486.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 94, "step_time": 21.164959002286196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 129.5, "completions/mean_terminated_length": 129.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.331402949988842, "epoch": 0.004400185270958777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008044900605455041, "kl": 0.0013099743518978357, "learning_rate": 9.991292264937472e-07, "loss": 0.0001, "num_tokens": 2651630.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 95, "step_time": 16.487524412572384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 170.0625, "completions/mean_terminated_length": 170.0625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.18016548454761505, "epoch": 0.00444650301065308, "frac_reward_zero_std": 1.0, "grad_norm": 0.00039947155164554715, "kl": 0.0006883219321025535, "learning_rate": 9.991199629458083e-07, "loss": 0.0, "num_tokens": 2672847.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 96, "step_time": 17.446692943572998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 106.6875, "completions/mean_terminated_length": 106.6875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.1923675499856472, "epoch": 0.004492820750347383, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013796341372653842, "kl": 0.001208607602166012, "learning_rate": 9.991106993978692e-07, "loss": 0.0001, "num_tokens": 2691770.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 97, "step_time": 12.108869213610888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 175.375, "completions/mean_terminated_length": 175.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.40798864513635635, "epoch": 0.004539138490041686, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036100484430789948, "kl": 0.0022683410497847944, "learning_rate": 9.991014358499306e-07, "loss": 0.0001, "num_tokens": 2738384.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 98, "step_time": 24.993323389440775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 176.125, "completions/mean_terminated_length": 176.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.33561981469392776, "epoch": 0.004585456229735989, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005170278600417078, "kl": 0.001088321630959399, "learning_rate": 9.990921723019917e-07, "loss": 0.0001, "num_tokens": 2765506.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 99, "step_time": 20.452530715614557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 172.6875, "completions/mean_terminated_length": 172.6875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.3813248723745346, "epoch": 0.004631773969430292, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019012981792911887, "kl": 0.001573115645442158, "learning_rate": 9.990829087540528e-07, "loss": 0.0001, "num_tokens": 2800061.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 100, "step_time": 21.43118468299508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 161.8125, "completions/mean_terminated_length": 161.8125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.25710832327604294, "epoch": 0.004678091709124595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004794577253051102, "kl": 0.0007824498461559415, "learning_rate": 9.99073645206114e-07, "loss": 0.0, "num_tokens": 2830154.0, "reward": 0.9459594488143921, "reward_std": 0.0, "rewards/reward_func/mean": 0.9459594488143921, "rewards/reward_func/std": 0.0, "step": 101, "step_time": 19.95168798044324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 172.6875, "completions/mean_terminated_length": 172.6875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.3496035486459732, "epoch": 0.004724409448818898, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009181686327792704, "kl": 0.0015211670834105462, "learning_rate": 9.99064381658175e-07, "loss": 0.0001, "num_tokens": 2880581.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 102, "step_time": 25.15399621427059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.3125, "completions/mean_terminated_length": 228.3125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.17268018424510956, "epoch": 0.0047707271885132005, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004131859459448606, "kl": 0.0007152085017878562, "learning_rate": 9.990551181102362e-07, "loss": 0.0, "num_tokens": 2910346.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 103, "step_time": 23.317954447120428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 154.25, "completions/mean_terminated_length": 154.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3152758330106735, "epoch": 0.004817044928207503, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005372532177716494, "kl": 0.0011037293006666005, "learning_rate": 9.990458545622973e-07, "loss": 0.0001, "num_tokens": 2932686.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 104, "step_time": 16.74936816468835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 240.9375, "completions/mean_terminated_length": 240.9375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.18453437089920044, "epoch": 0.004863362667901806, "frac_reward_zero_std": 0.0, "grad_norm": 0.04752209037542343, "kl": 0.0006047881179256365, "learning_rate": 9.990365910143584e-07, "loss": 0.0259, "num_tokens": 2956525.0, "reward": 0.9409773349761963, "reward_std": 0.16128070652484894, "rewards/reward_func/mean": 0.9409773349761963, "rewards/reward_func/std": 0.16128070652484894, "step": 105, "step_time": 22.666839264333248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 121.875, "completions/mean_terminated_length": 121.875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2953808978199959, "epoch": 0.004909680407596109, "frac_reward_zero_std": 1.0, "grad_norm": 0.000951802299823612, "kl": 0.0012901290610898286, "learning_rate": 9.990273274664196e-07, "loss": 0.0001, "num_tokens": 2976331.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 106, "step_time": 12.871876332908869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 133.5, "completions/mean_terminated_length": 133.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.30508697777986526, "epoch": 0.004955998147290412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011718884343281388, "kl": 0.0012946214410476387, "learning_rate": 9.990180639184807e-07, "loss": 0.0001, "num_tokens": 3012227.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 107, "step_time": 18.159315083175898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.1483689248561859, "epoch": 0.005002315886984715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027634147554636, "kl": 0.0009971036633942276, "learning_rate": 9.99008800370542e-07, "loss": 0.0, "num_tokens": 3049013.0, "reward": 0.619507372379303, "reward_std": 0.0, "rewards/reward_func/mean": 0.619507372379303, "rewards/reward_func/std": 0.0, "step": 108, "step_time": 18.731859609484673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.2554764822125435, "epoch": 0.005048633626679018, "frac_reward_zero_std": 0.0, "grad_norm": 0.09361075609922409, "kl": 0.0016304054006468505, "learning_rate": 9.98999536822603e-07, "loss": -0.1054, "num_tokens": 3092461.0, "reward": 0.7015107870101929, "reward_std": 0.4194250702857971, "rewards/reward_func/mean": 0.7015107870101929, "rewards/reward_func/std": 0.4194250702857971, "step": 109, "step_time": 27.058271024376154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 201.25, "completions/mean_terminated_length": 201.25, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.24739449098706245, "epoch": 0.005094951366373321, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005664157797582448, "kl": 0.0010661086562322453, "learning_rate": 9.98990273274664e-07, "loss": 0.0001, "num_tokens": 3116113.0, "reward": 0.24110545217990875, "reward_std": 0.0, "rewards/reward_func/mean": 0.24110545217990875, "rewards/reward_func/std": 0.0, "step": 110, "step_time": 22.920831225812435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 140.9375, "completions/mean_terminated_length": 140.9375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.27028920128941536, "epoch": 0.005141269106067624, "frac_reward_zero_std": 1.0, "grad_norm": 0.000512939237523824, "kl": 0.0008654707198729739, "learning_rate": 9.989810097267252e-07, "loss": 0.0, "num_tokens": 3136016.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 111, "step_time": 14.48648601397872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 386.3125, "completions/mean_terminated_length": 386.3125, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "entropy": 0.27152176201343536, "epoch": 0.005187586845761927, "frac_reward_zero_std": 0.0, "grad_norm": 0.04701411351561546, "kl": 0.0008469039894407615, "learning_rate": 9.989717461787865e-07, "loss": -0.0376, "num_tokens": 3178261.0, "reward": 0.7923111915588379, "reward_std": 0.0785004124045372, "rewards/reward_func/mean": 0.7923111915588379, "rewards/reward_func/std": 0.0785004124045372, "step": 112, "step_time": 39.99580450728536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 170.6875, "completions/mean_terminated_length": 170.6875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.14264867082238197, "epoch": 0.00523390458545623, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005562285077758133, "kl": 0.0005924119759583846, "learning_rate": 9.989624826308476e-07, "loss": 0.0, "num_tokens": 3221536.0, "reward": 0.8890097737312317, "reward_std": 0.0, "rewards/reward_func/mean": 0.8890097737312317, "rewards/reward_func/std": 0.0, "step": 113, "step_time": 22.939540166407824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 139.3125, "completions/mean_terminated_length": 139.3125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.2521078996360302, "epoch": 0.005280222325150533, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008059011306613684, "kl": 0.001156355268904008, "learning_rate": 9.989532190829088e-07, "loss": 0.0001, "num_tokens": 3242101.0, "reward": 0.0004407913947943598, "reward_std": 0.0, "rewards/reward_func/mean": 0.0004407913947943598, "rewards/reward_func/std": 0.0, "step": 114, "step_time": 16.212469674646854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 236.9375, "completions/mean_terminated_length": 236.9375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.3463738113641739, "epoch": 0.0053265400648448355, "frac_reward_zero_std": 0.0, "grad_norm": 0.052856896072626114, "kl": 0.001222132268594578, "learning_rate": 9.989439555349699e-07, "loss": -0.1089, "num_tokens": 3271172.0, "reward": 0.4277864992618561, "reward_std": 0.5015459060668945, "rewards/reward_func/mean": 0.4277864992618561, "rewards/reward_func/std": 0.5015459060668945, "step": 115, "step_time": 25.421065870672464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 134.125, "completions/mean_terminated_length": 134.125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.27230648696422577, "epoch": 0.005372857804539138, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010126074776053429, "kl": 0.0011005645501427352, "learning_rate": 9.98934691987031e-07, "loss": 0.0001, "num_tokens": 3291558.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 116, "step_time": 14.782246958464384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 142.5, "completions/mean_terminated_length": 142.5, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.268955335021019, "epoch": 0.005419175544233441, "frac_reward_zero_std": 1.0, "grad_norm": 0.000778664369136095, "kl": 0.001104289636714384, "learning_rate": 9.989254284390921e-07, "loss": 0.0001, "num_tokens": 3318830.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 117, "step_time": 17.054896883666515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 195.875, "completions/mean_terminated_length": 195.875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.27606628835201263, "epoch": 0.005465493283927744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008477095398120582, "kl": 0.0009050240332726389, "learning_rate": 9.989161648911533e-07, "loss": 0.0, "num_tokens": 3359868.0, "reward": 0.9487294554710388, "reward_std": 0.0, "rewards/reward_func/mean": 0.9487294554710388, "rewards/reward_func/std": 0.0, "step": 118, "step_time": 23.64735871180892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 186.875, "completions/mean_terminated_length": 186.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.4049151912331581, "epoch": 0.005511811023622047, "frac_reward_zero_std": 0.0, "grad_norm": 0.06125951185822487, "kl": 0.0013729215424973518, "learning_rate": 9.989069013432144e-07, "loss": -0.053, "num_tokens": 3380586.0, "reward": 0.0020324073266237974, "reward_std": 0.00812962930649519, "rewards/reward_func/mean": 0.0020324073266237974, "rewards/reward_func/std": 0.008129630237817764, "step": 119, "step_time": 19.843410819768906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 139.6875, "completions/mean_terminated_length": 139.6875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.23628076165914536, "epoch": 0.00555812876331635, "frac_reward_zero_std": 1.0, "grad_norm": 0.000650152622256428, "kl": 0.0012098945735488087, "learning_rate": 9.988976377952755e-07, "loss": 0.0001, "num_tokens": 3400261.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 120, "step_time": 14.221643339842558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 114.375, "completions/mean_terminated_length": 114.375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2281496524810791, "epoch": 0.005604446503010653, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007070648716762662, "kl": 0.001058092893799767, "learning_rate": 9.988883742473366e-07, "loss": 0.0001, "num_tokens": 3419659.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 121, "step_time": 13.323605943471193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 117.625, "completions/mean_terminated_length": 117.625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.27677495777606964, "epoch": 0.005650764242704956, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007823980995453894, "kl": 0.0011881452519446611, "learning_rate": 9.988791106993978e-07, "loss": 0.0001, "num_tokens": 3439109.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 122, "step_time": 12.829030204564333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 165.8125, "completions/mean_terminated_length": 165.8125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.26315319538116455, "epoch": 0.005697081982399259, "frac_reward_zero_std": 0.0, "grad_norm": 0.10150697827339172, "kl": 0.001116703759180382, "learning_rate": 9.988698471514589e-07, "loss": 0.0457, "num_tokens": 3464386.0, "reward": 0.7361464500427246, "reward_std": 0.36629024147987366, "rewards/reward_func/mean": 0.7361464500427246, "rewards/reward_func/std": 0.36629024147987366, "step": 123, "step_time": 18.046367309987545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 139.3125, "completions/mean_terminated_length": 139.3125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.19785001128911972, "epoch": 0.005743399722093562, "frac_reward_zero_std": 0.0, "grad_norm": 0.08791724592447281, "kl": 0.0007540077349403873, "learning_rate": 9.9886058360352e-07, "loss": -0.0683, "num_tokens": 3492615.0, "reward": 0.47804224491119385, "reward_std": 0.3269173204898834, "rewards/reward_func/mean": 0.47804224491119385, "rewards/reward_func/std": 0.3269173800945282, "step": 124, "step_time": 17.178758315742016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 188.1875, "completions/mean_terminated_length": 188.1875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.20732641592621803, "epoch": 0.005789717461787865, "frac_reward_zero_std": 0.0, "grad_norm": 0.05716419219970703, "kl": 0.000678992597386241, "learning_rate": 9.988513200555813e-07, "loss": -0.0529, "num_tokens": 3538618.0, "reward": 0.9079843759536743, "reward_std": 0.0359191857278347, "rewards/reward_func/mean": 0.9079843759536743, "rewards/reward_func/std": 0.035919204354286194, "step": 125, "step_time": 25.90315380319953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 163.5, "completions/mean_terminated_length": 163.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.33654970675706863, "epoch": 0.005836035201482168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006308644078671932, "kl": 0.0012278756767045707, "learning_rate": 9.988420565076425e-07, "loss": 0.0001, "num_tokens": 3567314.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 126, "step_time": 18.876841440796852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 178.75, "completions/mean_terminated_length": 178.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.33673005551099777, "epoch": 0.0058823529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 0.09591037034988403, "kl": 0.001308549428358674, "learning_rate": 9.988327929597036e-07, "loss": 0.0127, "num_tokens": 3590446.0, "reward": 0.28263968229293823, "reward_std": 0.19680501520633698, "rewards/reward_func/mean": 0.28263968229293823, "rewards/reward_func/std": 0.19680503010749817, "step": 127, "step_time": 20.69503689929843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 186.1875, "completions/mean_terminated_length": 186.1875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3700679764151573, "epoch": 0.005928670680870773, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010768556967377663, "kl": 0.0012731784372590482, "learning_rate": 9.988235294117647e-07, "loss": 0.0001, "num_tokens": 3618833.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 128, "step_time": 21.974650118499994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 117.0625, "completions/mean_terminated_length": 117.0625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.23749050125479698, "epoch": 0.005974988420565076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010812852997332811, "kl": 0.0011148849880555645, "learning_rate": 9.988142658638258e-07, "loss": 0.0001, "num_tokens": 3641634.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 129, "step_time": 14.077108316123486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 193.25, "completions/mean_terminated_length": 193.25, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.2998213544487953, "epoch": 0.006021306160259379, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006937870057299733, "kl": 0.0009626495302654803, "learning_rate": 9.98805002315887e-07, "loss": 0.0, "num_tokens": 3666214.0, "reward": 0.9487294554710388, "reward_std": 0.0, "rewards/reward_func/mean": 0.9487294554710388, "rewards/reward_func/std": 0.0, "step": 130, "step_time": 21.575381591916084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 157.25, "completions/mean_terminated_length": 157.25, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3175879344344139, "epoch": 0.006067623899953682, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014436630299314857, "kl": 0.001178477396024391, "learning_rate": 9.98795738767948e-07, "loss": 0.0001, "num_tokens": 3686746.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 131, "step_time": 16.170368444174528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 143.75, "completions/mean_terminated_length": 143.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.2595654986798763, "epoch": 0.006113941639647985, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011029423912987113, "kl": 0.0011006823042407632, "learning_rate": 9.987864752200092e-07, "loss": 0.0001, "num_tokens": 3709110.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 132, "step_time": 16.029532201588154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 150.25, "completions/mean_terminated_length": 150.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3049924820661545, "epoch": 0.006160259379342288, "frac_reward_zero_std": 1.0, "grad_norm": 0.002257963875308633, "kl": 0.0015856284298934042, "learning_rate": 9.987772116720703e-07, "loss": 0.0001, "num_tokens": 3738506.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 133, "step_time": 17.504866629838943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 212.5, "completions/mean_terminated_length": 212.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.25285639613866806, "epoch": 0.006206577119036591, "frac_reward_zero_std": 0.0, "grad_norm": 0.04643546789884567, "kl": 0.0009889291250146925, "learning_rate": 9.987679481241315e-07, "loss": -0.0274, "num_tokens": 3760530.0, "reward": 0.9934226274490356, "reward_std": 0.017972838133573532, "rewards/reward_func/mean": 0.9934226274490356, "rewards/reward_func/std": 0.01797284372150898, "step": 134, "step_time": 21.360256396234035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 183.75, "completions/mean_terminated_length": 183.75, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.3675421178340912, "epoch": 0.006252894858730894, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008711877162568271, "kl": 0.001303886208916083, "learning_rate": 9.987586845761926e-07, "loss": 0.0001, "num_tokens": 3788702.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 135, "step_time": 20.955825198441744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 135.0, "completions/mean_terminated_length": 135.0, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.28805623203516006, "epoch": 0.006299212598425197, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011562267318367958, "kl": 0.0012812073400709778, "learning_rate": 9.987494210282537e-07, "loss": 0.0001, "num_tokens": 3809838.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 136, "step_time": 14.344765190035105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 168.9375, "completions/mean_terminated_length": 168.9375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.17872954905033112, "epoch": 0.0063455303381195, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005084734293632209, "kl": 0.0007189370808191597, "learning_rate": 9.987401574803148e-07, "loss": 0.0, "num_tokens": 3831341.0, "reward": 0.904837429523468, "reward_std": 0.0, "rewards/reward_func/mean": 0.904837429523468, "rewards/reward_func/std": 0.0, "step": 137, "step_time": 21.364930722862482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 177.5625, "completions/mean_terminated_length": 177.5625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.24735918268561363, "epoch": 0.006391848077813803, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010594031773507595, "kl": 0.001022157768602483, "learning_rate": 9.987308939323762e-07, "loss": 0.0001, "num_tokens": 3852998.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 138, "step_time": 19.29247647151351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 235.3125, "completions/mean_terminated_length": 235.3125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.24933470785617828, "epoch": 0.0064381658175081055, "frac_reward_zero_std": 0.0, "grad_norm": 0.05908169969916344, "kl": 0.0011135565146105364, "learning_rate": 9.987216303844373e-07, "loss": 0.0113, "num_tokens": 3876779.0, "reward": 0.9620516300201416, "reward_std": 0.08158649504184723, "rewards/reward_func/mean": 0.9620516300201416, "rewards/reward_func/std": 0.08158650249242783, "step": 139, "step_time": 23.81665090844035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 208.625, "completions/mean_terminated_length": 208.625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3989019989967346, "epoch": 0.006484483557202408, "frac_reward_zero_std": 0.0, "grad_norm": 0.05581681802868843, "kl": 0.0011897674994543195, "learning_rate": 9.987123668364982e-07, "loss": -0.0434, "num_tokens": 3904037.0, "reward": 0.012167919427156448, "reward_std": 0.048671673983335495, "rewards/reward_func/mean": 0.012167919427156448, "rewards/reward_func/std": 0.04867167770862579, "step": 140, "step_time": 22.690000787377357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 194.6875, "completions/mean_terminated_length": 194.6875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.3049769699573517, "epoch": 0.006530801296896711, "frac_reward_zero_std": 0.0, "grad_norm": 0.08445534855127335, "kl": 0.0035717372375074774, "learning_rate": 9.987031032885593e-07, "loss": -0.1389, "num_tokens": 3928576.0, "reward": 0.1740274727344513, "reward_std": 0.1714896857738495, "rewards/reward_func/mean": 0.1740274727344513, "rewards/reward_func/std": 0.1714896857738495, "step": 141, "step_time": 28.506950981914997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 150.875, "completions/mean_terminated_length": 150.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.32121188193559647, "epoch": 0.006577119036591014, "frac_reward_zero_std": 1.0, "grad_norm": 0.000673571543302387, "kl": 0.0014779901539441198, "learning_rate": 9.986938397406207e-07, "loss": 0.0001, "num_tokens": 3956654.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 142, "step_time": 18.16906550899148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 193.9375, "completions/mean_terminated_length": 193.9375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.39169544726610184, "epoch": 0.006623436776285317, "frac_reward_zero_std": 0.0, "grad_norm": 0.07600442320108414, "kl": 0.0012212564470246434, "learning_rate": 9.986845761926818e-07, "loss": -0.0145, "num_tokens": 3981229.0, "reward": 0.03668078035116196, "reward_std": 0.06561657041311264, "rewards/reward_func/mean": 0.03668078035116196, "rewards/reward_func/std": 0.06561657786369324, "step": 143, "step_time": 19.928812380880117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 259.0, "completions/mean_terminated_length": 259.0, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.29782533645629883, "epoch": 0.00666975451597962, "frac_reward_zero_std": 0.0, "grad_norm": 0.04033688083291054, "kl": 0.0008177869021892548, "learning_rate": 9.98675312644743e-07, "loss": 0.0433, "num_tokens": 4006077.0, "reward": 0.24205882847309113, "reward_std": 0.42336198687553406, "rewards/reward_func/mean": 0.24205882847309113, "rewards/reward_func/std": 0.42336201667785645, "step": 144, "step_time": 28.188720546662807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 160.4375, "completions/mean_terminated_length": 160.4375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3879060745239258, "epoch": 0.006716072255673923, "frac_reward_zero_std": 1.0, "grad_norm": 0.001458464190363884, "kl": 0.001835305680288002, "learning_rate": 9.98666049096804e-07, "loss": 0.0001, "num_tokens": 4035540.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 145, "step_time": 20.65491282194853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 137.4375, "completions/mean_terminated_length": 137.4375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.23363490775227547, "epoch": 0.006762389995368226, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007848707609809935, "kl": 0.0009721961687318981, "learning_rate": 9.986567855488652e-07, "loss": 0.0, "num_tokens": 4055195.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 146, "step_time": 14.133691564202309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 148.75, "completions/mean_terminated_length": 148.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3750767633318901, "epoch": 0.006808707735062529, "frac_reward_zero_std": 1.0, "grad_norm": 0.001035345601849258, "kl": 0.0017306151567026973, "learning_rate": 9.986475220009263e-07, "loss": 0.0001, "num_tokens": 4108327.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 147, "step_time": 24.681630488485098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 123.0, "completions/mean_terminated_length": 123.0, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.21959315985441208, "epoch": 0.006855025474756832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011265820357948542, "kl": 0.0010525864054216072, "learning_rate": 9.986382584529874e-07, "loss": 0.0001, "num_tokens": 4127991.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 148, "step_time": 13.904216725379229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 132.25, "completions/mean_terminated_length": 132.25, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.37593110650777817, "epoch": 0.006901343214451135, "frac_reward_zero_std": 1.0, "grad_norm": 0.001913378364406526, "kl": 0.0012810016341973096, "learning_rate": 9.986289949050486e-07, "loss": 0.0001, "num_tokens": 4153067.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 149, "step_time": 15.85357840359211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.42250946909189224, "epoch": 0.006947660954145438, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005991277866996825, "kl": 0.0015141093172132969, "learning_rate": 9.986197313571097e-07, "loss": 0.0001, "num_tokens": 4197211.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 150, "step_time": 23.63872228562832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 169.8125, "completions/mean_terminated_length": 169.8125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.2225952185690403, "epoch": 0.0069939786938397405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005857797805219889, "kl": 0.0009670153085608035, "learning_rate": 9.986104678091708e-07, "loss": 0.0, "num_tokens": 4217784.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 151, "step_time": 18.001822039484978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.35866475105285645, "epoch": 0.0070402964335340434, "frac_reward_zero_std": 0.0, "grad_norm": 0.07115766406059265, "kl": 0.0011862863320857286, "learning_rate": 9.98601204261232e-07, "loss": 0.0495, "num_tokens": 4239146.0, "reward": 0.9892370700836182, "reward_std": 0.04305167496204376, "rewards/reward_func/mean": 0.9892370700836182, "rewards/reward_func/std": 0.04305167496204376, "step": 152, "step_time": 22.355235513299704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 162.125, "completions/mean_terminated_length": 162.125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.26412268728017807, "epoch": 0.007086614173228346, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011610282817855477, "kl": 0.001230985508300364, "learning_rate": 9.98591940713293e-07, "loss": 0.0001, "num_tokens": 4266940.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 153, "step_time": 18.30440727248788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 152.75, "completions/mean_terminated_length": 152.75, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.39102543890476227, "epoch": 0.007132931912922649, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010339897125959396, "kl": 0.0018844558508135378, "learning_rate": 9.985826771653542e-07, "loss": 0.0001, "num_tokens": 4319096.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 154, "step_time": 23.435311898589134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 121.375, "completions/mean_terminated_length": 121.375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.29395852982997894, "epoch": 0.007179249652616952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007433706778101623, "kl": 0.0009080634627025574, "learning_rate": 9.985734136174155e-07, "loss": 0.0, "num_tokens": 4341438.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 155, "step_time": 13.602233476936817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 140.5625, "completions/mean_terminated_length": 140.5625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.19000105187296867, "epoch": 0.007225567392311255, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004755662230309099, "kl": 0.0007904800295364112, "learning_rate": 9.985641500694766e-07, "loss": 0.0, "num_tokens": 4364679.0, "reward": 0.25042009353637695, "reward_std": 0.0, "rewards/reward_func/mean": 0.25042009353637695, "rewards/reward_func/std": 0.0, "step": 156, "step_time": 14.93125580623746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 207.75, "completions/mean_terminated_length": 207.75, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.22341416031122208, "epoch": 0.007271885132005558, "frac_reward_zero_std": 0.0, "grad_norm": 0.05051978677511215, "kl": 0.0008343583176610991, "learning_rate": 9.985548865215378e-07, "loss": 0.0193, "num_tokens": 4402195.0, "reward": 0.596699595451355, "reward_std": 0.4261772036552429, "rewards/reward_func/mean": 0.596699595451355, "rewards/reward_func/std": 0.4261772334575653, "step": 157, "step_time": 24.41818241775036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 266.1875, "completions/mean_terminated_length": 266.1875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.36868835985660553, "epoch": 0.007318202871699861, "frac_reward_zero_std": 0.0, "grad_norm": 0.05326659977436066, "kl": 0.0010871543636312708, "learning_rate": 9.985456229735989e-07, "loss": -0.2973, "num_tokens": 4443286.0, "reward": 0.3007088899612427, "reward_std": 0.3702571392059326, "rewards/reward_func/mean": 0.3007088899612427, "rewards/reward_func/std": 0.3702571392059326, "step": 158, "step_time": 36.127769846469164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 167.1875, "completions/mean_terminated_length": 167.1875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.19336718693375587, "epoch": 0.007364520611394164, "frac_reward_zero_std": 1.0, "grad_norm": 0.000494747597258538, "kl": 0.000735267938580364, "learning_rate": 9.9853635942566e-07, "loss": 0.0, "num_tokens": 4476393.0, "reward": 0.9428731203079224, "reward_std": 0.0, "rewards/reward_func/mean": 0.9428731203079224, "rewards/reward_func/std": 0.0, "step": 159, "step_time": 20.922476079314947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 156.8125, "completions/mean_terminated_length": 156.8125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.1699804738163948, "epoch": 0.007410838351088467, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018062412273138762, "kl": 0.001229065004736185, "learning_rate": 9.985270958777211e-07, "loss": 0.0001, "num_tokens": 4510406.0, "reward": 0.9000876545906067, "reward_std": 0.0, "rewards/reward_func/mean": 0.9000876545906067, "rewards/reward_func/std": 0.0, "step": 160, "step_time": 18.807097870856524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 141.375, "completions/mean_terminated_length": 141.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3330295979976654, "epoch": 0.00745715609078277, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005328988772816956, "kl": 0.0011647845094557852, "learning_rate": 9.985178323297823e-07, "loss": 0.0001, "num_tokens": 4546508.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 161, "step_time": 18.652416814118624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 128.9375, "completions/mean_terminated_length": 128.9375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3115380331873894, "epoch": 0.007503473830477073, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018411468481644988, "kl": 0.0017008328577503562, "learning_rate": 9.985085687818434e-07, "loss": 0.0001, "num_tokens": 4575883.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 162, "step_time": 16.12642402946949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 163.25, "completions/mean_terminated_length": 163.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.18948416784405708, "epoch": 0.0075497915701713755, "frac_reward_zero_std": 1.0, "grad_norm": 0.00046944760833866894, "kl": 0.000663085505948402, "learning_rate": 9.984993052339045e-07, "loss": 0.0, "num_tokens": 4618527.0, "reward": 0.8574039340019226, "reward_std": 0.0, "rewards/reward_func/mean": 0.8574039340019226, "rewards/reward_func/std": 0.0, "step": 163, "step_time": 22.54652241244912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 165.625, "completions/mean_terminated_length": 165.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.2251650206744671, "epoch": 0.0075961093098656784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008289703982882202, "kl": 0.0009271571907447651, "learning_rate": 9.984900416859656e-07, "loss": 0.0, "num_tokens": 4640569.0, "reward": 0.894839346408844, "reward_std": 0.0, "rewards/reward_func/mean": 0.894839346408844, "rewards/reward_func/std": 0.0, "step": 164, "step_time": 18.0803255289793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 130.5625, "completions/mean_terminated_length": 130.5625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3675830289721489, "epoch": 0.007642427049559981, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009684573160484433, "kl": 0.0015152069390751421, "learning_rate": 9.984807781380268e-07, "loss": 0.0001, "num_tokens": 4663058.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 165, "step_time": 14.896864034235477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 158.0625, "completions/mean_terminated_length": 158.0625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3842160478234291, "epoch": 0.007688744789254284, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019603169057518244, "kl": 0.0025681756669655442, "learning_rate": 9.984715145900879e-07, "loss": 0.0001, "num_tokens": 4693891.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 166, "step_time": 18.503708496689796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 138.6875, "completions/mean_terminated_length": 138.6875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.29680949449539185, "epoch": 0.007735062528948587, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007003445643931627, "kl": 0.0013395841815508902, "learning_rate": 9.98462251042149e-07, "loss": 0.0001, "num_tokens": 4716878.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 167, "step_time": 15.016962468624115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 178.3125, "completions/mean_terminated_length": 178.3125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.2283756174147129, "epoch": 0.00778138026864289, "frac_reward_zero_std": 0.0, "grad_norm": 0.06226864829659462, "kl": 0.0007998852815944701, "learning_rate": 9.984529874942104e-07, "loss": 0.001, "num_tokens": 4745587.0, "reward": 0.926367998123169, "reward_std": 0.01877889409661293, "rewards/reward_func/mean": 0.926367998123169, "rewards/reward_func/std": 0.018778905272483826, "step": 168, "step_time": 19.92407266050577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 236.5625, "completions/mean_terminated_length": 236.5625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.3746219798922539, "epoch": 0.007827698008337193, "frac_reward_zero_std": 0.0, "grad_norm": 0.05858612433075905, "kl": 0.0010736614640336484, "learning_rate": 9.984437239462715e-07, "loss": -0.1012, "num_tokens": 4772556.0, "reward": 0.3508151173591614, "reward_std": 0.46775349974632263, "rewards/reward_func/mean": 0.3508151173591614, "rewards/reward_func/std": 0.46775349974632263, "step": 169, "step_time": 31.445678532123566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 161.8125, "completions/mean_terminated_length": 161.8125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.12431573867797852, "epoch": 0.007874015748031496, "frac_reward_zero_std": 0.0, "grad_norm": 0.10413499921560287, "kl": 0.0013005670480197296, "learning_rate": 9.984344603983326e-07, "loss": -0.0004, "num_tokens": 4795081.0, "reward": 0.4928065836429596, "reward_std": 0.19798803329467773, "rewards/reward_func/mean": 0.4928065836429596, "rewards/reward_func/std": 0.19798806309700012, "step": 170, "step_time": 17.126375176012516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 152.4375, "completions/mean_terminated_length": 152.4375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.2561582215130329, "epoch": 0.007920333487725799, "frac_reward_zero_std": 0.0, "grad_norm": 0.08171358704566956, "kl": 0.001231180940521881, "learning_rate": 9.984251968503935e-07, "loss": -0.026, "num_tokens": 4816800.0, "reward": 0.9293943643569946, "reward_std": 0.035030219703912735, "rewards/reward_func/mean": 0.9293943643569946, "rewards/reward_func/std": 0.03503022342920303, "step": 171, "step_time": 17.46403419226408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 201.0, "completions/mean_terminated_length": 201.0, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.4090711176395416, "epoch": 0.007966651227420102, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007271178183145821, "kl": 0.0014456688368227333, "learning_rate": 9.984159333024549e-07, "loss": 0.0001, "num_tokens": 4845136.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 172, "step_time": 22.596364434808493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 149.0, "completions/mean_terminated_length": 149.0, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.2239462547004223, "epoch": 0.008012968967114405, "frac_reward_zero_std": 0.0, "grad_norm": 0.07950145751237869, "kl": 0.0008892695477697998, "learning_rate": 9.98406669754516e-07, "loss": -0.0373, "num_tokens": 4868096.0, "reward": 0.9077697992324829, "reward_std": 0.09525498002767563, "rewards/reward_func/mean": 0.9077697992324829, "rewards/reward_func/std": 0.09525497257709503, "step": 173, "step_time": 16.582812402397394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 176.5, "completions/mean_terminated_length": 176.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.2988496795296669, "epoch": 0.008059286706808708, "frac_reward_zero_std": 1.0, "grad_norm": 0.00046105749788694084, "kl": 0.0010157113574678078, "learning_rate": 9.98397406206577e-07, "loss": 0.0001, "num_tokens": 4890696.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 174, "step_time": 18.554279018193483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 170.4375, "completions/mean_terminated_length": 170.4375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.20798297598958015, "epoch": 0.00810560444650301, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003556886513251811, "kl": 0.000614775184658356, "learning_rate": 9.983881426586382e-07, "loss": 0.0, "num_tokens": 4927759.0, "reward": 0.9167169332504272, "reward_std": 0.0, "rewards/reward_func/mean": 0.9167169332504272, "rewards/reward_func/std": 0.0, "step": 175, "step_time": 21.02095464617014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 265.75, "completions/mean_terminated_length": 265.75, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.22096781060099602, "epoch": 0.008151922186197313, "frac_reward_zero_std": 0.0, "grad_norm": 0.052729543298482895, "kl": 0.0010106703994097188, "learning_rate": 9.983788791106994e-07, "loss": -0.0108, "num_tokens": 4967035.0, "reward": 0.9709692001342773, "reward_std": 0.014403305016458035, "rewards/reward_func/mean": 0.9709692001342773, "rewards/reward_func/std": 0.014403297565877438, "step": 176, "step_time": 28.388985190540552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 170.1875, "completions/mean_terminated_length": 170.1875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.17096561938524246, "epoch": 0.008198239925891616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004381695471238345, "kl": 0.000727768667275086, "learning_rate": 9.983696155627605e-07, "loss": 0.0, "num_tokens": 4992398.0, "reward": 0.9574533700942993, "reward_std": 0.0, "rewards/reward_func/mean": 0.9574533700942993, "rewards/reward_func/std": 0.0, "step": 177, "step_time": 18.1888774856925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 130.6875, "completions/mean_terminated_length": 130.6875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.18110191822052002, "epoch": 0.00824455766558592, "frac_reward_zero_std": 0.0, "grad_norm": 0.11362199485301971, "kl": 0.0009543791529722512, "learning_rate": 9.983603520148216e-07, "loss": -0.0196, "num_tokens": 5029049.0, "reward": 0.29507988691329956, "reward_std": 0.032220497727394104, "rewards/reward_func/mean": 0.29507988691329956, "rewards/reward_func/std": 0.032220497727394104, "step": 178, "step_time": 18.264590088278055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 112.0625, "completions/mean_terminated_length": 112.0625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.267382450401783, "epoch": 0.008290875405280222, "frac_reward_zero_std": 1.0, "grad_norm": 0.001289795502088964, "kl": 0.0013163190451450646, "learning_rate": 9.983510884668827e-07, "loss": 0.0001, "num_tokens": 5049274.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 179, "step_time": 13.03090962767601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 141.4375, "completions/mean_terminated_length": 141.4375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.30875954031944275, "epoch": 0.008337193144974525, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009180946508422494, "kl": 0.00131464286823757, "learning_rate": 9.983418249189439e-07, "loss": 0.0001, "num_tokens": 5069905.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 180, "step_time": 14.655602425336838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 128.0625, "completions/mean_terminated_length": 128.0625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3586525395512581, "epoch": 0.008383510884668828, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005821700324304402, "kl": 0.0011948540050070733, "learning_rate": 9.98332561371005e-07, "loss": 0.0001, "num_tokens": 5097506.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 181, "step_time": 15.478844940662384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 121.375, "completions/mean_terminated_length": 121.375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.33501479029655457, "epoch": 0.008429828624363131, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009040228906087577, "kl": 0.0010803402110468596, "learning_rate": 9.983232978230663e-07, "loss": 0.0001, "num_tokens": 5117528.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 182, "step_time": 14.24662160873413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 172.5, "completions/mean_terminated_length": 172.5, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.389257088303566, "epoch": 0.008476146364057434, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005249987007118762, "kl": 0.0011284275096841156, "learning_rate": 9.983140342751272e-07, "loss": 0.0001, "num_tokens": 5152240.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 183, "step_time": 21.00098342075944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 172.3125, "completions/mean_terminated_length": 172.3125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.35628943890333176, "epoch": 0.008522464103751737, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008661292959004641, "kl": 0.001361237169476226, "learning_rate": 9.983047707271884e-07, "loss": 0.0001, "num_tokens": 5177477.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 184, "step_time": 19.700187604874372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 200.0, "completions/mean_terminated_length": 200.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.4047982096672058, "epoch": 0.00856878184344604, "frac_reward_zero_std": 0.0, "grad_norm": 0.19900654256343842, "kl": 0.0020328431273810565, "learning_rate": 9.982955071792497e-07, "loss": -0.0895, "num_tokens": 5203637.0, "reward": 0.08689714223146439, "reward_std": 0.11963946372270584, "rewards/reward_func/mean": 0.08689714223146439, "rewards/reward_func/std": 0.11963947117328644, "step": 185, "step_time": 24.889085162431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 111.4375, "completions/mean_terminated_length": 111.4375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.27358129620552063, "epoch": 0.008615099583140343, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012420903658494353, "kl": 0.0012404267909005284, "learning_rate": 9.982862436313108e-07, "loss": 0.0001, "num_tokens": 5226348.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 186, "step_time": 12.944620177149773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 154.4375, "completions/mean_terminated_length": 154.4375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.17678061872720718, "epoch": 0.008661417322834646, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007334018009714782, "kl": 0.0007218590471893549, "learning_rate": 9.98276980083372e-07, "loss": 0.0, "num_tokens": 5252995.0, "reward": 0.9021315574645996, "reward_std": 0.0, "rewards/reward_func/mean": 0.9021315574645996, "rewards/reward_func/std": 0.0, "step": 187, "step_time": 17.63252827897668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 125.6875, "completions/mean_terminated_length": 125.6875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3704119995236397, "epoch": 0.008707735062528948, "frac_reward_zero_std": 1.0, "grad_norm": 0.001009262283332646, "kl": 0.001398083579260856, "learning_rate": 9.98267716535433e-07, "loss": 0.0001, "num_tokens": 5275918.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 188, "step_time": 15.064791101962328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 212.875, "completions/mean_terminated_length": 212.875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.22819731384515762, "epoch": 0.008754052802223251, "frac_reward_zero_std": 0.0, "grad_norm": 0.06821474432945251, "kl": 0.0008659957093186677, "learning_rate": 9.982584529874942e-07, "loss": 0.0372, "num_tokens": 5301100.0, "reward": 0.6976216435432434, "reward_std": 0.1710186004638672, "rewards/reward_func/mean": 0.6976216435432434, "rewards/reward_func/std": 0.171018585562706, "step": 189, "step_time": 23.242990478873253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 133.5625, "completions/mean_terminated_length": 133.5625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.28962938487529755, "epoch": 0.008800370541917554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0040439129807055, "kl": 0.002571428776718676, "learning_rate": 9.982491894395553e-07, "loss": 0.0001, "num_tokens": 5321557.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 190, "step_time": 13.96716882660985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 128.3125, "completions/mean_terminated_length": 128.3125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.29960859566926956, "epoch": 0.008846688281611857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010171140311285853, "kl": 0.0013723817246500403, "learning_rate": 9.982399258916164e-07, "loss": 0.0001, "num_tokens": 5342570.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 191, "step_time": 15.181181944906712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 166.875, "completions/mean_terminated_length": 166.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.3717504292726517, "epoch": 0.00889300602130616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006592174177058041, "kl": 0.0011195582919754088, "learning_rate": 9.982306623436776e-07, "loss": 0.0001, "num_tokens": 5373624.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 192, "step_time": 20.758096884936094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 138.375, "completions/mean_terminated_length": 138.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.23235268890857697, "epoch": 0.008939323761000463, "frac_reward_zero_std": 0.0, "grad_norm": 0.09054431319236755, "kl": 0.0010597734071779996, "learning_rate": 9.982213987957387e-07, "loss": -0.037, "num_tokens": 5393982.0, "reward": 0.9185318946838379, "reward_std": 0.021724820137023926, "rewards/reward_func/mean": 0.9185318946838379, "rewards/reward_func/std": 0.021724820137023926, "step": 193, "step_time": 16.371366318315268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 217.6875, "completions/mean_terminated_length": 217.6875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.1963135525584221, "epoch": 0.008985641500694766, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013173865154385567, "kl": 0.00090785016072914, "learning_rate": 9.982121352477998e-07, "loss": 0.0, "num_tokens": 5430777.0, "reward": 0.7428231239318848, "reward_std": 0.0, "rewards/reward_func/mean": 0.7428231239318848, "rewards/reward_func/std": 0.0, "step": 194, "step_time": 23.588253416121006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 174.6875, "completions/mean_terminated_length": 174.6875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.22125248238444328, "epoch": 0.009031959240389069, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005526018794625998, "kl": 0.0008960505801951513, "learning_rate": 9.98202871699861e-07, "loss": 0.0, "num_tokens": 5461652.0, "reward": 0.5712090730667114, "reward_std": 0.0, "rewards/reward_func/mean": 0.5712090730667114, "rewards/reward_func/std": 0.0, "step": 195, "step_time": 19.9322307407856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 161.8125, "completions/mean_terminated_length": 161.8125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3360860273241997, "epoch": 0.009078276980083372, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012191702844575047, "kl": 0.0012908496137242764, "learning_rate": 9.98193608151922e-07, "loss": 0.0001, "num_tokens": 5483729.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 196, "step_time": 18.82015247270465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 187.6875, "completions/mean_terminated_length": 187.6875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.1839112527668476, "epoch": 0.009124594719777675, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014125570887699723, "kl": 0.0009949714440153912, "learning_rate": 9.981843446039832e-07, "loss": 0.0001, "num_tokens": 5505436.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 197, "step_time": 18.444256361573935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 156.5625, "completions/mean_terminated_length": 156.5625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3111222982406616, "epoch": 0.009170912459471978, "frac_reward_zero_std": 1.0, "grad_norm": 0.001630751183256507, "kl": 0.0015696244081482291, "learning_rate": 9.981750810560445e-07, "loss": 0.0001, "num_tokens": 5528837.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 198, "step_time": 16.388581547886133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 154.75, "completions/mean_terminated_length": 154.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.14439477026462555, "epoch": 0.00921723019916628, "frac_reward_zero_std": 0.0, "grad_norm": 0.11620976775884628, "kl": 0.0008143085142364725, "learning_rate": 9.981658175081056e-07, "loss": 0.0301, "num_tokens": 5559649.0, "reward": 0.9239631295204163, "reward_std": 0.029681755229830742, "rewards/reward_func/mean": 0.9239631295204163, "rewards/reward_func/std": 0.02968175709247589, "step": 199, "step_time": 17.773583106696606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 165.625, "completions/mean_terminated_length": 165.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.16612479090690613, "epoch": 0.009263547938860583, "frac_reward_zero_std": 0.0, "grad_norm": 0.07665696740150452, "kl": 0.00095115571457427, "learning_rate": 9.981565539601668e-07, "loss": 0.0186, "num_tokens": 5583899.0, "reward": 0.9961940050125122, "reward_std": 0.015223884023725986, "rewards/reward_func/mean": 0.9961940050125122, "rewards/reward_func/std": 0.015223890542984009, "step": 200, "step_time": 18.268201805651188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 117.0, "completions/mean_terminated_length": 117.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.31366194784641266, "epoch": 0.009309865678554886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011350114364176989, "kl": 0.001548691448988393, "learning_rate": 9.981472904122277e-07, "loss": 0.0001, "num_tokens": 5604971.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 201, "step_time": 13.308431796729565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 181.25, "completions/mean_terminated_length": 181.25, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.19328206777572632, "epoch": 0.00935618341824919, "frac_reward_zero_std": 1.0, "grad_norm": 0.00047764976625330746, "kl": 0.0007238638063427061, "learning_rate": 9.98138026864289e-07, "loss": 0.0, "num_tokens": 5657343.0, "reward": 0.7860752940177917, "reward_std": 0.0, "rewards/reward_func/mean": 0.7860752940177917, "rewards/reward_func/std": 0.0, "step": 202, "step_time": 25.874954532831907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 315.0625, "completions/mean_terminated_length": 315.0625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.2514698840677738, "epoch": 0.009402501157943492, "frac_reward_zero_std": 0.0, "grad_norm": 0.04165821149945259, "kl": 0.0008549775666324422, "learning_rate": 9.981287633163501e-07, "loss": -0.1428, "num_tokens": 5696496.0, "reward": 0.6085189580917358, "reward_std": 0.48871079087257385, "rewards/reward_func/mean": 0.6085189580917358, "rewards/reward_func/std": 0.48871076107025146, "step": 203, "step_time": 33.79615079984069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 129.625, "completions/mean_terminated_length": 129.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.27936455607414246, "epoch": 0.009448818897637795, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006951396935619414, "kl": 0.0011286622902844101, "learning_rate": 9.981194997684113e-07, "loss": 0.0001, "num_tokens": 5719594.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 204, "step_time": 14.518286500126123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 150.0, "completions/mean_terminated_length": 150.0, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3197743594646454, "epoch": 0.009495136637332098, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010948445415124297, "kl": 0.001026401572744362, "learning_rate": 9.981102362204724e-07, "loss": 0.0001, "num_tokens": 5743658.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 205, "step_time": 16.007482074201107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.2688465192914009, "epoch": 0.009541454377026401, "frac_reward_zero_std": 0.0, "grad_norm": 0.06209837272763252, "kl": 0.000768767815316096, "learning_rate": 9.981009726725335e-07, "loss": -0.0073, "num_tokens": 5769948.0, "reward": 0.9185318946838379, "reward_std": 0.021724820137023926, "rewards/reward_func/mean": 0.9185318946838379, "rewards/reward_func/std": 0.021724820137023926, "step": 206, "step_time": 18.914034850895405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 185.0625, "completions/mean_terminated_length": 185.0625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.20774800330400467, "epoch": 0.009587772116720704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007211709744296968, "kl": 0.0009327216830570251, "learning_rate": 9.980917091245946e-07, "loss": 0.0, "num_tokens": 5798461.0, "reward": 0.9394130706787109, "reward_std": 0.0, "rewards/reward_func/mean": 0.9394130706787109, "rewards/reward_func/std": 0.0, "step": 207, "step_time": 19.299127969890833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 151.0625, "completions/mean_terminated_length": 151.0625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.25372298434376717, "epoch": 0.009634089856415007, "frac_reward_zero_std": 1.0, "grad_norm": 0.00082414411008358, "kl": 0.0011023855477105826, "learning_rate": 9.980824455766558e-07, "loss": 0.0001, "num_tokens": 5819390.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 208, "step_time": 15.844503808766603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 111.6875, "completions/mean_terminated_length": 111.6875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.28323476761579514, "epoch": 0.00968040759610931, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009210107382386923, "kl": 0.0013738409033976495, "learning_rate": 9.98073182028717e-07, "loss": 0.0001, "num_tokens": 5839865.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 209, "step_time": 12.719179343432188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 199.4375, "completions/mean_terminated_length": 199.4375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.1765754520893097, "epoch": 0.009726725335803613, "frac_reward_zero_std": 0.0, "grad_norm": 0.06527356803417206, "kl": 0.0009415536333108321, "learning_rate": 9.98063918480778e-07, "loss": -0.0021, "num_tokens": 5863616.0, "reward": 0.9642957448959351, "reward_std": 0.028563430532813072, "rewards/reward_func/mean": 0.9642957448959351, "rewards/reward_func/std": 0.028563441708683968, "step": 210, "step_time": 20.758552063256502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 180.5625, "completions/mean_terminated_length": 180.5625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.29664914309978485, "epoch": 0.009773043075497916, "frac_reward_zero_std": 0.0, "grad_norm": 0.0858040302991867, "kl": 0.0014906687720213085, "learning_rate": 9.980546549328391e-07, "loss": 0.0008, "num_tokens": 5891945.0, "reward": 0.14649486541748047, "reward_std": 0.1612107902765274, "rewards/reward_func/mean": 0.14649486541748047, "rewards/reward_func/std": 0.1612107902765274, "step": 211, "step_time": 20.227817099541426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 151.625, "completions/mean_terminated_length": 151.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.13642069324851036, "epoch": 0.009819360815192218, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004876737657468766, "kl": 0.0006350057010422461, "learning_rate": 9.980453913849005e-07, "loss": 0.0, "num_tokens": 5932915.0, "reward": 0.19180183112621307, "reward_std": 0.0, "rewards/reward_func/mean": 0.19180183112621307, "rewards/reward_func/std": 0.0, "step": 212, "step_time": 21.081323496997356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 125.0, "completions/mean_terminated_length": 125.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.1973336823284626, "epoch": 0.009865678554886521, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011800038628280163, "kl": 0.0010994782787747681, "learning_rate": 9.980361278369616e-07, "loss": 0.0001, "num_tokens": 5952339.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 213, "step_time": 15.241963744163513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 203.0, "completions/mean_terminated_length": 203.0, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.22050614282488823, "epoch": 0.009911996294580824, "frac_reward_zero_std": 0.0, "grad_norm": 0.08256851136684418, "kl": 0.0009374968358315527, "learning_rate": 9.980268642890225e-07, "loss": 0.0, "num_tokens": 5991779.0, "reward": 0.42199599742889404, "reward_std": 0.015191474929451942, "rewards/reward_func/mean": 0.42199599742889404, "rewards/reward_func/std": 0.015191479586064816, "step": 214, "step_time": 24.130947835743427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 188.1875, "completions/mean_terminated_length": 188.1875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.33391566574573517, "epoch": 0.009958314034275127, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005107586039230227, "kl": 0.0011112367501482368, "learning_rate": 9.980176007410839e-07, "loss": 0.0001, "num_tokens": 6025622.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 215, "step_time": 24.09608830884099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 126.4375, "completions/mean_terminated_length": 126.4375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.23009155690670013, "epoch": 0.01000463177396943, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006373866344802082, "kl": 0.0009257519559469074, "learning_rate": 9.98008337193145e-07, "loss": 0.0, "num_tokens": 6045117.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 216, "step_time": 14.371771406382322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 154.875, "completions/mean_terminated_length": 154.875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.23760268464684486, "epoch": 0.010050949513663733, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010695838136598468, "kl": 0.0011337489413563162, "learning_rate": 9.979990736452061e-07, "loss": 0.0001, "num_tokens": 6066523.0, "reward": 0.7446697354316711, "reward_std": 0.0, "rewards/reward_func/mean": 0.7446697354316711, "rewards/reward_func/std": 0.0, "step": 217, "step_time": 17.035848531872034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 183.375, "completions/mean_terminated_length": 183.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3174886256456375, "epoch": 0.010097267253358036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005763350054621696, "kl": 0.000995459602563642, "learning_rate": 9.979898100972672e-07, "loss": 0.0, "num_tokens": 6104593.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 218, "step_time": 22.565702576190233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 216.625, "completions/mean_terminated_length": 216.625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.37874598801136017, "epoch": 0.010143584993052339, "frac_reward_zero_std": 0.0, "grad_norm": 0.06810466200113297, "kl": 0.0009915903065120801, "learning_rate": 9.979805465493284e-07, "loss": 0.0573, "num_tokens": 6140043.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 219, "step_time": 26.506840966641903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 221.75, "completions/mean_terminated_length": 221.75, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.34261541068553925, "epoch": 0.010189902732746642, "frac_reward_zero_std": 0.0, "grad_norm": 0.060994748026132584, "kl": 0.0011211848468519747, "learning_rate": 9.979712830013895e-07, "loss": 0.0147, "num_tokens": 6163463.0, "reward": 0.3214479684829712, "reward_std": 0.3117648959159851, "rewards/reward_func/mean": 0.3214479684829712, "rewards/reward_func/std": 0.3117648959159851, "step": 220, "step_time": 26.26476990059018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 142.6875, "completions/mean_terminated_length": 142.6875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.339463971555233, "epoch": 0.010236220472440945, "frac_reward_zero_std": 1.0, "grad_norm": 0.001397659070789814, "kl": 0.0020507503650151193, "learning_rate": 9.979620194534506e-07, "loss": 0.0001, "num_tokens": 6188338.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 221, "step_time": 16.402330487966537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 161.125, "completions/mean_terminated_length": 161.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.20277191698551178, "epoch": 0.010282538212135248, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009595077717676759, "kl": 0.001125899056205526, "learning_rate": 9.979527559055117e-07, "loss": 0.0001, "num_tokens": 6223380.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 222, "step_time": 21.72225385531783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 176.3125, "completions/mean_terminated_length": 176.3125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.43114665895700455, "epoch": 0.01032885595182955, "frac_reward_zero_std": 1.0, "grad_norm": 0.001142045366577804, "kl": 0.0017447507416363806, "learning_rate": 9.979434923575729e-07, "loss": 0.0001, "num_tokens": 6251513.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 223, "step_time": 19.1215389855206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 182.625, "completions/mean_terminated_length": 182.625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3638230934739113, "epoch": 0.010375173691523853, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011755140731111169, "kl": 0.0018604571232572198, "learning_rate": 9.97934228809634e-07, "loss": 0.0001, "num_tokens": 6278355.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 224, "step_time": 19.442218646407127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 248.3125, "completions/mean_terminated_length": 248.3125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.22825674712657928, "epoch": 0.010421491431218156, "frac_reward_zero_std": 0.0, "grad_norm": 0.0447196401655674, "kl": 0.0009762654372025281, "learning_rate": 9.979249652616953e-07, "loss": -0.0198, "num_tokens": 6317448.0, "reward": 0.7119600772857666, "reward_std": 0.026777047663927078, "rewards/reward_func/mean": 0.7119600772857666, "rewards/reward_func/std": 0.026777038350701332, "step": 225, "step_time": 29.166859570890665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 118.6875, "completions/mean_terminated_length": 118.6875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.38523825258016586, "epoch": 0.01046780917091246, "frac_reward_zero_std": 1.0, "grad_norm": 0.000769888749346137, "kl": 0.001298567367484793, "learning_rate": 9.979157017137562e-07, "loss": 0.0001, "num_tokens": 6344451.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 226, "step_time": 15.558392085134983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.22717266157269478, "epoch": 0.010514126910606762, "frac_reward_zero_std": 1.0, "grad_norm": 0.000636727549135685, "kl": 0.0008928571623982862, "learning_rate": 9.979064381658174e-07, "loss": 0.0, "num_tokens": 6364075.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 227, "step_time": 13.442558009177446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 174.5625, "completions/mean_terminated_length": 174.5625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.44085489213466644, "epoch": 0.010560444650301065, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010319107677787542, "kl": 0.0018189146649092436, "learning_rate": 9.978971746178787e-07, "loss": 0.0001, "num_tokens": 6415092.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 228, "step_time": 25.613100692629814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 125.25, "completions/mean_terminated_length": 125.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2653019055724144, "epoch": 0.010606762389995368, "frac_reward_zero_std": 1.0, "grad_norm": 0.001061088521964848, "kl": 0.0015032147348392755, "learning_rate": 9.978879110699398e-07, "loss": 0.0001, "num_tokens": 6450776.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 229, "step_time": 17.40074209868908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 103.3125, "completions/mean_terminated_length": 103.3125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.26146209239959717, "epoch": 0.010653080129689671, "frac_reward_zero_std": 1.0, "grad_norm": 0.001286666956730187, "kl": 0.001344879623502493, "learning_rate": 9.97878647522001e-07, "loss": 0.0001, "num_tokens": 6470685.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 230, "step_time": 12.195948231965303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 188.5625, "completions/mean_terminated_length": 188.5625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.3215632736682892, "epoch": 0.010699397869383974, "frac_reward_zero_std": 0.0, "grad_norm": 0.14584235846996307, "kl": 0.0015411792846862227, "learning_rate": 9.97869383974062e-07, "loss": 0.0219, "num_tokens": 6508198.0, "reward": 0.45201778411865234, "reward_std": 0.12053807079792023, "rewards/reward_func/mean": 0.45201778411865234, "rewards/reward_func/std": 0.12053807824850082, "step": 231, "step_time": 24.014050632715225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 144.75, "completions/mean_terminated_length": 144.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.34048623591661453, "epoch": 0.010745715609078277, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010352181270718575, "kl": 0.001357596556772478, "learning_rate": 9.978601204261232e-07, "loss": 0.0001, "num_tokens": 6534802.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 232, "step_time": 17.13857477903366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 161.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.42318225651979446, "epoch": 0.01079203334877258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006775745423510671, "kl": 0.001376514817820862, "learning_rate": 9.978508568781843e-07, "loss": 0.0001, "num_tokens": 6562758.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 233, "step_time": 19.38410897180438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 181.6875, "completions/mean_terminated_length": 181.6875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.2940324395895004, "epoch": 0.010838351088466883, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004660946724470705, "kl": 0.0008864189876476303, "learning_rate": 9.978415933302454e-07, "loss": 0.0, "num_tokens": 6586465.0, "reward": 0.20764601230621338, "reward_std": 0.0, "rewards/reward_func/mean": 0.20764601230621338, "rewards/reward_func/std": 0.0, "step": 234, "step_time": 22.67731310427189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 133.125, "completions/mean_terminated_length": 133.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2967539206147194, "epoch": 0.010884668828161186, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027921830769628286, "kl": 0.0015929857036098838, "learning_rate": 9.978323297823066e-07, "loss": 0.0001, "num_tokens": 6616963.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 235, "step_time": 16.60131949931383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 122.875, "completions/mean_terminated_length": 122.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2843145579099655, "epoch": 0.010930986567855489, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012012934312224388, "kl": 0.0016003412602003664, "learning_rate": 9.978230662343677e-07, "loss": 0.0001, "num_tokens": 6636545.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 236, "step_time": 14.213252019137144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 263.125, "completions/mean_terminated_length": 263.125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.4105694890022278, "epoch": 0.010977304307549791, "frac_reward_zero_std": 0.0, "grad_norm": 0.05327938124537468, "kl": 0.0012937210267409682, "learning_rate": 9.978138026864288e-07, "loss": 0.5985, "num_tokens": 6670723.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 237, "step_time": 75.6249905526638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 117.0, "completions/mean_terminated_length": 117.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2223428264260292, "epoch": 0.011023622047244094, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013035659212619066, "kl": 0.0011440668313298374, "learning_rate": 9.9780453913849e-07, "loss": 0.0001, "num_tokens": 6690563.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 238, "step_time": 12.680224448442459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 186.125, "completions/mean_terminated_length": 186.125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.3829054981470108, "epoch": 0.011069939786938397, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005088613252155483, "kl": 0.001057111716363579, "learning_rate": 9.97795275590551e-07, "loss": 0.0001, "num_tokens": 6712389.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 239, "step_time": 19.046410162001848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 140.25, "completions/mean_terminated_length": 140.25, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.34127364307641983, "epoch": 0.0111162575266327, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007454964215867221, "kl": 0.0012433588854037225, "learning_rate": 9.977860120426122e-07, "loss": 0.0001, "num_tokens": 6732457.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 240, "step_time": 14.262619711458683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 168.1875, "completions/mean_terminated_length": 168.1875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.18717513233423233, "epoch": 0.011162575266327003, "frac_reward_zero_std": 0.0, "grad_norm": 0.08767064660787582, "kl": 0.0008982947038020939, "learning_rate": 9.977767484946733e-07, "loss": -0.07, "num_tokens": 6760348.0, "reward": 0.28112655878067017, "reward_std": 0.015021427534520626, "rewards/reward_func/mean": 0.28112655878067017, "rewards/reward_func/std": 0.015021426603198051, "step": 241, "step_time": 19.440937858074903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 217.875, "completions/mean_terminated_length": 217.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.33736418187618256, "epoch": 0.011208893006021306, "frac_reward_zero_std": 0.0, "grad_norm": 0.16001756489276886, "kl": 0.001747905946103856, "learning_rate": 9.977674849467347e-07, "loss": -0.0886, "num_tokens": 6793562.0, "reward": 0.37241876125335693, "reward_std": 0.4516417384147644, "rewards/reward_func/mean": 0.37241876125335693, "rewards/reward_func/std": 0.4516417384147644, "step": 242, "step_time": 24.757107455283403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 178.3125, "completions/mean_terminated_length": 178.3125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.3585934340953827, "epoch": 0.011255210745715609, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005876480718143284, "kl": 0.0012372637866064906, "learning_rate": 9.977582213987958e-07, "loss": 0.0001, "num_tokens": 6823503.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 243, "step_time": 19.782935816794634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 130.625, "completions/mean_terminated_length": 130.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.24060847610235214, "epoch": 0.011301528485409912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005929553299210966, "kl": 0.0009411770151928067, "learning_rate": 9.97748957850857e-07, "loss": 0.0, "num_tokens": 6844905.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 244, "step_time": 14.155857503414154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 169.6875, "completions/mean_terminated_length": 169.6875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3826494663953781, "epoch": 0.011347846225104215, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006471690139733255, "kl": 0.0013720959541387856, "learning_rate": 9.97739694302918e-07, "loss": 0.0001, "num_tokens": 6870548.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 245, "step_time": 18.97964360564947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2920665517449379, "epoch": 0.011394163964798518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008388461428694427, "kl": 0.0009667183912824839, "learning_rate": 9.977304307549792e-07, "loss": 0.0, "num_tokens": 6902932.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 246, "step_time": 17.692711248993874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 169.9375, "completions/mean_terminated_length": 169.9375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.2462501898407936, "epoch": 0.01144048170449282, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006977981538511813, "kl": 0.0010372063552495092, "learning_rate": 9.977211672070403e-07, "loss": 0.0001, "num_tokens": 6926275.0, "reward": 0.8781879544258118, "reward_std": 0.0, "rewards/reward_func/mean": 0.8781879544258118, "rewards/reward_func/std": 0.0, "step": 247, "step_time": 17.286343712359667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 132.25, "completions/mean_terminated_length": 132.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2867204025387764, "epoch": 0.011486799444187124, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008923127315938473, "kl": 0.0012497162679210305, "learning_rate": 9.977119036591014e-07, "loss": 0.0001, "num_tokens": 6947207.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 248, "step_time": 14.922904722392559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 134.5625, "completions/mean_terminated_length": 134.5625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3154388293623924, "epoch": 0.011533117183881426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008344214293174446, "kl": 0.0013363700127229095, "learning_rate": 9.977026401111625e-07, "loss": 0.0001, "num_tokens": 6983040.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 249, "step_time": 18.25295503437519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 125.1875, "completions/mean_terminated_length": 125.1875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2925817146897316, "epoch": 0.01157943492357573, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024473611265420914, "kl": 0.001579622970893979, "learning_rate": 9.976933765632237e-07, "loss": 0.0001, "num_tokens": 7004899.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 250, "step_time": 14.662612289190292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 203.8125, "completions/mean_terminated_length": 203.8125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.3871491327881813, "epoch": 0.011625752663270032, "frac_reward_zero_std": 0.0, "grad_norm": 0.06019943207502365, "kl": 0.0012392482603900135, "learning_rate": 9.976841130152848e-07, "loss": 0.0422, "num_tokens": 7029920.0, "reward": 0.17361781001091003, "reward_std": 0.3732667565345764, "rewards/reward_func/mean": 0.17361781001091003, "rewards/reward_func/std": 0.3732668161392212, "step": 251, "step_time": 24.29412142932415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 119.625, "completions/mean_terminated_length": 119.625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2839593477547169, "epoch": 0.011672070402964335, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010288519551977515, "kl": 0.001156790676759556, "learning_rate": 9.97674849467346e-07, "loss": 0.0001, "num_tokens": 7050618.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 252, "step_time": 13.062179304659367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 118.9375, "completions/mean_terminated_length": 118.9375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.24643199145793915, "epoch": 0.011718388142658638, "frac_reward_zero_std": 1.0, "grad_norm": 0.000641032587736845, "kl": 0.0011265594221185893, "learning_rate": 9.97665585919407e-07, "loss": 0.0001, "num_tokens": 7073193.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 253, "step_time": 14.38772228360176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 187.9375, "completions/mean_terminated_length": 187.9375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3889130726456642, "epoch": 0.011764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.07069052755832672, "kl": 0.0013481812202371657, "learning_rate": 9.976563223714682e-07, "loss": -0.0847, "num_tokens": 7096248.0, "reward": 0.11693838238716125, "reward_std": 0.3195364773273468, "rewards/reward_func/mean": 0.11693838238716125, "rewards/reward_func/std": 0.3195364773273468, "step": 254, "step_time": 20.836565881967545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 121.9375, "completions/mean_terminated_length": 121.9375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2889237403869629, "epoch": 0.011811023622047244, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007356893620453775, "kl": 0.0010288926860084757, "learning_rate": 9.976470588235295e-07, "loss": 0.0001, "num_tokens": 7117655.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 255, "step_time": 14.183229140937328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 146.75, "completions/mean_terminated_length": 146.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.4106031656265259, "epoch": 0.011857341361741547, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008677243022248149, "kl": 0.0016055423475336283, "learning_rate": 9.976377952755906e-07, "loss": 0.0001, "num_tokens": 7151011.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 256, "step_time": 19.09181548282504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 110.125, "completions/mean_terminated_length": 110.125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2853569909930229, "epoch": 0.01190365910143585, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014247329672798514, "kl": 0.0015982910699676722, "learning_rate": 9.976285317276515e-07, "loss": 0.0001, "num_tokens": 7170469.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 257, "step_time": 12.658961690962315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 159.875, "completions/mean_terminated_length": 159.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.3763571232557297, "epoch": 0.011949976841130153, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006330300820991397, "kl": 0.0013355592382140458, "learning_rate": 9.976192681797129e-07, "loss": 0.0001, "num_tokens": 7202131.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 258, "step_time": 18.526979483664036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 162.1875, "completions/mean_terminated_length": 162.1875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4172009825706482, "epoch": 0.011996294580824456, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007413196726702154, "kl": 0.0014627919590566307, "learning_rate": 9.97610004631774e-07, "loss": 0.0001, "num_tokens": 7236982.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 259, "step_time": 19.692689403891563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 195.375, "completions/mean_terminated_length": 195.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.3919227793812752, "epoch": 0.012042612320518759, "frac_reward_zero_std": 0.0, "grad_norm": 0.08192694187164307, "kl": 0.0018703019595704973, "learning_rate": 9.976007410838351e-07, "loss": 0.006, "num_tokens": 7258924.0, "reward": 0.3125, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.3125, "rewards/reward_func/std": 0.4787135720252991, "step": 260, "step_time": 19.39155102148652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 204.5625, "completions/mean_terminated_length": 204.5625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.43711964786052704, "epoch": 0.012088930060213061, "frac_reward_zero_std": 0.0, "grad_norm": 0.0016617421060800552, "kl": 0.0020323360804468393, "learning_rate": 9.975914775358962e-07, "loss": -0.0003, "num_tokens": 7283941.0, "reward": 1.378031839749383e-07, "reward_std": 5.512127358997532e-07, "rewards/reward_func/mean": 1.378031839749383e-07, "rewards/reward_func/std": 5.512127358997532e-07, "step": 261, "step_time": 24.59879645705223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 179.0625, "completions/mean_terminated_length": 179.0625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.14695752039551735, "epoch": 0.012135247799907364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004047244437970221, "kl": 0.0005786035471828654, "learning_rate": 9.975822139879574e-07, "loss": 0.0, "num_tokens": 7307798.0, "reward": 0.8172460198402405, "reward_std": 0.0, "rewards/reward_func/mean": 0.8172460198402405, "rewards/reward_func/std": 0.0, "step": 262, "step_time": 19.629223205149174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 130.625, "completions/mean_terminated_length": 130.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3128410279750824, "epoch": 0.012181565539601667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019142779055982828, "kl": 0.0016148071445059031, "learning_rate": 9.975729504400185e-07, "loss": 0.0001, "num_tokens": 7331632.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 263, "step_time": 14.436136823147535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 192.8125, "completions/mean_terminated_length": 192.8125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.18640683591365814, "epoch": 0.01222788327929597, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004958834615536034, "kl": 0.0007326693012146279, "learning_rate": 9.975636868920796e-07, "loss": 0.0, "num_tokens": 7385101.0, "reward": 0.11362193524837494, "reward_std": 0.0, "rewards/reward_func/mean": 0.11362193524837494, "rewards/reward_func/std": 0.0, "step": 264, "step_time": 27.704222440719604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 107.3125, "completions/mean_terminated_length": 107.3125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.24617928266525269, "epoch": 0.012274201018990273, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006484591867774725, "kl": 0.0009265294938813895, "learning_rate": 9.975544233441407e-07, "loss": 0.0, "num_tokens": 7404402.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 265, "step_time": 12.239355891942978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 131.8125, "completions/mean_terminated_length": 131.8125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.26922178268432617, "epoch": 0.012320518758684576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007232644129544497, "kl": 0.0010748645290732384, "learning_rate": 9.975451597962019e-07, "loss": 0.0001, "num_tokens": 7425023.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 266, "step_time": 13.867826867848635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 146.75, "completions/mean_terminated_length": 146.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3577575385570526, "epoch": 0.012366836498378879, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008738681790418923, "kl": 0.0012663750094361603, "learning_rate": 9.97535896248263e-07, "loss": 0.0001, "num_tokens": 7448555.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 267, "step_time": 17.042743027210236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 133.375, "completions/mean_terminated_length": 133.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.21464881673455238, "epoch": 0.012413154238073182, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010989999864250422, "kl": 0.0009238123311661184, "learning_rate": 9.975266327003243e-07, "loss": 0.0, "num_tokens": 7468689.0, "reward": 0.9131007194519043, "reward_std": 0.0, "rewards/reward_func/mean": 0.9131007194519043, "rewards/reward_func/std": 0.0, "step": 268, "step_time": 15.706818025559187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.332606703042984, "epoch": 0.012459471977767485, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013649666216224432, "kl": 0.0015467848279513419, "learning_rate": 9.975173691523852e-07, "loss": 0.0001, "num_tokens": 7505561.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 269, "step_time": 22.800201173871756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 150.8125, "completions/mean_terminated_length": 150.8125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.15520111098885536, "epoch": 0.012505789717461788, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007402778137475252, "kl": 0.00071650069730822, "learning_rate": 9.975081056044464e-07, "loss": 0.0, "num_tokens": 7530086.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 270, "step_time": 16.660070817917585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 187.125, "completions/mean_terminated_length": 187.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.38500913232564926, "epoch": 0.01255210745715609, "frac_reward_zero_std": 1.0, "grad_norm": 0.001494114869274199, "kl": 0.0017256204737350345, "learning_rate": 9.974988420565075e-07, "loss": 0.0001, "num_tokens": 7556344.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 271, "step_time": 24.9211747944355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 200.375, "completions/mean_terminated_length": 200.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.3571956604719162, "epoch": 0.012598425196850394, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005984654417261481, "kl": 0.0011825784167740494, "learning_rate": 9.974895785085688e-07, "loss": 0.0001, "num_tokens": 7578334.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 272, "step_time": 22.358461305499077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.24652643501758575, "epoch": 0.012644742936544696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016675933729857206, "kl": 0.00128700424102135, "learning_rate": 9.9748031496063e-07, "loss": 0.0001, "num_tokens": 7599490.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 273, "step_time": 14.426620122045279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 176.5, "completions/mean_terminated_length": 176.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.1960732415318489, "epoch": 0.012691060676239, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004313480167184025, "kl": 0.0006078024889575318, "learning_rate": 9.97471051412691e-07, "loss": 0.0, "num_tokens": 7627594.0, "reward": 0.9131007194519043, "reward_std": 0.0, "rewards/reward_func/mean": 0.9131007194519043, "rewards/reward_func/std": 0.0, "step": 274, "step_time": 19.362862575799227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 125.8125, "completions/mean_terminated_length": 125.8125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2697870396077633, "epoch": 0.012737378415933302, "frac_reward_zero_std": 1.0, "grad_norm": 0.00127905432600528, "kl": 0.0012629003031179309, "learning_rate": 9.974617878647522e-07, "loss": 0.0001, "num_tokens": 7651095.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 275, "step_time": 15.04151302203536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 126.375, "completions/mean_terminated_length": 126.375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.29414065927267075, "epoch": 0.012783696155627605, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010536487679928541, "kl": 0.0011426027631387115, "learning_rate": 9.974525243168133e-07, "loss": 0.0001, "num_tokens": 7670573.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 276, "step_time": 13.268250782042742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 136.5, "completions/mean_terminated_length": 136.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.29729852825403214, "epoch": 0.012830013895321908, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012455489486455917, "kl": 0.0014235415437724441, "learning_rate": 9.974432607688744e-07, "loss": 0.0001, "num_tokens": 7692069.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 277, "step_time": 14.937441002577543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 151.125, "completions/mean_terminated_length": 151.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.38496117293834686, "epoch": 0.012876331635016211, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006876222905702889, "kl": 0.001357803848804906, "learning_rate": 9.974339972209356e-07, "loss": 0.0001, "num_tokens": 7746199.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 278, "step_time": 24.39612015336752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 153.8125, "completions/mean_terminated_length": 153.8125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.4385102689266205, "epoch": 0.012922649374710514, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004172384215053171, "kl": 0.0011424672266002744, "learning_rate": 9.974247336729967e-07, "loss": 0.0001, "num_tokens": 7774548.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 279, "step_time": 18.329782836139202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 205.3125, "completions/mean_terminated_length": 205.3125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3062855675816536, "epoch": 0.012968967114404817, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020328606478869915, "kl": 0.0012164927611593157, "learning_rate": 9.974154701250578e-07, "loss": 0.0001, "num_tokens": 7799625.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 280, "step_time": 22.674007039517164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 199.4375, "completions/mean_terminated_length": 199.4375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.14915741235017776, "epoch": 0.01301528485409912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004247923498041928, "kl": 0.000717098475433886, "learning_rate": 9.97406206577119e-07, "loss": 0.0, "num_tokens": 7833552.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 281, "step_time": 22.713849186897278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 160.75, "completions/mean_terminated_length": 160.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3701997995376587, "epoch": 0.013061602593793423, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006304908310994506, "kl": 0.0010913874139077961, "learning_rate": 9.9739694302918e-07, "loss": 0.0001, "num_tokens": 7854460.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 282, "step_time": 16.44683262333274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 154.9375, "completions/mean_terminated_length": 154.9375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3792154788970947, "epoch": 0.013107920333487726, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010001423070207238, "kl": 0.001630262704566121, "learning_rate": 9.973876794812412e-07, "loss": 0.0001, "num_tokens": 7878971.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 283, "step_time": 17.895399875938892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 202.875, "completions/mean_terminated_length": 202.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.208527822047472, "epoch": 0.013154238073182029, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006816262030042708, "kl": 0.0008669499948155135, "learning_rate": 9.973784159333023e-07, "loss": 0.0, "num_tokens": 7908409.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 284, "step_time": 23.0435762219131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 171.3125, "completions/mean_terminated_length": 171.3125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.39356208592653275, "epoch": 0.013200555812876331, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012899527791887522, "kl": 0.0014151684008538723, "learning_rate": 9.973691523853637e-07, "loss": 0.0001, "num_tokens": 7941534.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 285, "step_time": 19.861068926751614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 162.0, "completions/mean_terminated_length": 162.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.24289053678512573, "epoch": 0.013246873552570634, "frac_reward_zero_std": 0.0, "grad_norm": 0.07930362224578857, "kl": 0.000889660048414953, "learning_rate": 9.973598888374248e-07, "loss": -0.0088, "num_tokens": 7963902.0, "reward": 0.8952482342720032, "reward_std": 0.03075244091451168, "rewards/reward_func/mean": 0.8952482342720032, "rewards/reward_func/std": 0.030752435326576233, "step": 286, "step_time": 17.126583348959684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 210.6875, "completions/mean_terminated_length": 210.6875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.39860232919454575, "epoch": 0.013293191292264937, "frac_reward_zero_std": 0.0, "grad_norm": 0.08467237651348114, "kl": 0.0015759188390802592, "learning_rate": 9.97350625289486e-07, "loss": -0.0881, "num_tokens": 7999417.0, "reward": 0.0625, "reward_std": 0.25, "rewards/reward_func/mean": 0.0625, "rewards/reward_func/std": 0.25, "step": 287, "step_time": 26.797319907695055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 153.125, "completions/mean_terminated_length": 153.125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.25086531043052673, "epoch": 0.01333950903195924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007963802199810743, "kl": 0.001119767752243206, "learning_rate": 9.973413617415468e-07, "loss": 0.0001, "num_tokens": 8019803.0, "reward": 0.780767560005188, "reward_std": 0.0, "rewards/reward_func/mean": 0.780767560005188, "rewards/reward_func/std": 0.0, "step": 288, "step_time": 15.925499644130468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 120.1875, "completions/mean_terminated_length": 120.1875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.27041003108024597, "epoch": 0.013385826771653543, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007142223767004907, "kl": 0.001016193040413782, "learning_rate": 9.973320981936082e-07, "loss": 0.0001, "num_tokens": 8042414.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 289, "step_time": 14.222120333462954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 236.625, "completions/mean_terminated_length": 236.625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.3460284397006035, "epoch": 0.013432144511347846, "frac_reward_zero_std": 0.0, "grad_norm": 0.057437408715486526, "kl": 0.0011360525531927124, "learning_rate": 9.973228346456693e-07, "loss": -0.0572, "num_tokens": 8079592.0, "reward": 0.20799149572849274, "reward_std": 0.15038849413394928, "rewards/reward_func/mean": 0.20799149572849274, "rewards/reward_func/std": 0.15038849413394928, "step": 290, "step_time": 26.500631351023912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 175.625, "completions/mean_terminated_length": 175.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.20297640562057495, "epoch": 0.013478462251042149, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009243394597433507, "kl": 0.0009111710969591513, "learning_rate": 9.973135710977304e-07, "loss": 0.0, "num_tokens": 8103234.0, "reward": 0.6347364187240601, "reward_std": 0.0, "rewards/reward_func/mean": 0.6347364187240601, "rewards/reward_func/std": 0.0, "step": 291, "step_time": 18.534159436821938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 179.0, "completions/mean_terminated_length": 179.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.39861272275447845, "epoch": 0.013524779990736452, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004690671630669385, "kl": 0.0011920387914869934, "learning_rate": 9.973043075497915e-07, "loss": 0.0001, "num_tokens": 8130866.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 292, "step_time": 20.164982356131077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 149.625, "completions/mean_terminated_length": 149.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.36604560166597366, "epoch": 0.013571097730430755, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008623342146165669, "kl": 0.0014578664267901331, "learning_rate": 9.972950440018527e-07, "loss": 0.0001, "num_tokens": 8153020.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 293, "step_time": 17.66485656797886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 189.125, "completions/mean_terminated_length": 189.125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.41096896678209305, "epoch": 0.013617415470125058, "frac_reward_zero_std": 1.0, "grad_norm": 0.001160994521342218, "kl": 0.002139343094313517, "learning_rate": 9.972857804539138e-07, "loss": 0.0001, "num_tokens": 8185294.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 294, "step_time": 22.892805226147175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 134.5625, "completions/mean_terminated_length": 134.5625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.30392035841941833, "epoch": 0.01366373320981936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015385417500510812, "kl": 0.0014921021938789636, "learning_rate": 9.97276516905975e-07, "loss": 0.0001, "num_tokens": 8205271.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 295, "step_time": 14.012457262724638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 217.125, "completions/mean_terminated_length": 217.125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.24468612298369408, "epoch": 0.013710050949513664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004593682533595711, "kl": 0.0008445656712865457, "learning_rate": 9.97267253358036e-07, "loss": 0.0, "num_tokens": 8232601.0, "reward": 0.2555498778820038, "reward_std": 0.0, "rewards/reward_func/mean": 0.2555498778820038, "rewards/reward_func/std": 0.0, "step": 296, "step_time": 23.20956961810589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 192.3125, "completions/mean_terminated_length": 192.3125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.13571469858288765, "epoch": 0.013756368689207966, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005670030950568616, "kl": 0.0007186480070231482, "learning_rate": 9.972579898100972e-07, "loss": 0.0, "num_tokens": 8257934.0, "reward": 0.9375209808349609, "reward_std": 0.0, "rewards/reward_func/mean": 0.9375209808349609, "rewards/reward_func/std": 0.0, "step": 297, "step_time": 19.63731164112687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 134.1875, "completions/mean_terminated_length": 134.1875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.25146742910146713, "epoch": 0.01380268642890227, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010619927197694778, "kl": 0.0012564520293381065, "learning_rate": 9.972487262621585e-07, "loss": 0.0001, "num_tokens": 8277809.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 298, "step_time": 14.803503945469856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 207.875, "completions/mean_terminated_length": 207.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.2383860982954502, "epoch": 0.013849004168596572, "frac_reward_zero_std": 0.0, "grad_norm": 0.07985582202672958, "kl": 0.0009646453545428813, "learning_rate": 9.972394627142196e-07, "loss": 0.054, "num_tokens": 8303631.0, "reward": 0.8738229274749756, "reward_std": 0.24180229008197784, "rewards/reward_func/mean": 0.8738229274749756, "rewards/reward_func/std": 0.24180230498313904, "step": 299, "step_time": 25.31433679163456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 126.1875, "completions/mean_terminated_length": 126.1875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2671867311000824, "epoch": 0.013895321908290875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011980956187471747, "kl": 0.00120316629181616, "learning_rate": 9.972301991662805e-07, "loss": 0.0001, "num_tokens": 8332450.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 300, "step_time": 15.832930944859982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 131.75, "completions/mean_terminated_length": 131.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2855543717741966, "epoch": 0.013941639647985178, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007711020880378783, "kl": 0.0012660915090236813, "learning_rate": 9.972209356183417e-07, "loss": 0.0001, "num_tokens": 8353486.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 301, "step_time": 14.268948875367641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 151.5625, "completions/mean_terminated_length": 151.5625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.4135436415672302, "epoch": 0.013987957387679481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005681651527993381, "kl": 0.0010966947593260556, "learning_rate": 9.97211672070403e-07, "loss": 0.0001, "num_tokens": 8385031.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 302, "step_time": 18.032295767217875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 187.625, "completions/mean_terminated_length": 187.625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.40466737747192383, "epoch": 0.014034275127373784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006597894825972617, "kl": 0.0015541419852524996, "learning_rate": 9.972024085224641e-07, "loss": 0.0001, "num_tokens": 8421425.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 303, "step_time": 23.78118661046028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 153.125, "completions/mean_terminated_length": 153.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.33878885209560394, "epoch": 0.014080592867068087, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011138013796880841, "kl": 0.001382181013468653, "learning_rate": 9.971931449745252e-07, "loss": 0.0001, "num_tokens": 8443795.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 304, "step_time": 16.13924302533269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 161.0625, "completions/mean_terminated_length": 161.0625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3237125128507614, "epoch": 0.01412691060676239, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006071277312003076, "kl": 0.0012195921153761446, "learning_rate": 9.971838814265864e-07, "loss": 0.0001, "num_tokens": 8470948.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 305, "step_time": 19.79815885797143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.47993937134742737, "epoch": 0.014173228346456693, "frac_reward_zero_std": 0.0, "grad_norm": 0.07259546965360641, "kl": 0.0017538362299092114, "learning_rate": 9.971746178786475e-07, "loss": 0.1489, "num_tokens": 8493404.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 306, "step_time": 29.653807297348976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 200.5625, "completions/mean_terminated_length": 200.5625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.2899518199265003, "epoch": 0.014219546086150996, "frac_reward_zero_std": 0.0, "grad_norm": 0.0738273486495018, "kl": 0.0010956964979413897, "learning_rate": 9.971653543307086e-07, "loss": -0.037, "num_tokens": 8519717.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.44721361994743347, "step": 307, "step_time": 22.862802632153034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 166.0, "completions/mean_terminated_length": 166.0, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.38063275814056396, "epoch": 0.014265863825845299, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008101991261355579, "kl": 0.0019126106635667384, "learning_rate": 9.971560907827697e-07, "loss": 0.0001, "num_tokens": 8549221.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 308, "step_time": 18.756974667310715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 149.4375, "completions/mean_terminated_length": 149.4375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.19209444895386696, "epoch": 0.014312181565539601, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005579253192991018, "kl": 0.000857373554026708, "learning_rate": 9.971468272348309e-07, "loss": 0.0, "num_tokens": 8570252.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 309, "step_time": 16.16516475379467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 236.1875, "completions/mean_terminated_length": 236.1875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.2595669776201248, "epoch": 0.014358499305233904, "frac_reward_zero_std": 0.0, "grad_norm": 0.05039333924651146, "kl": 0.0008414801704930142, "learning_rate": 9.97137563686892e-07, "loss": -0.0361, "num_tokens": 8592431.0, "reward": 0.6965094804763794, "reward_std": 0.18601341545581818, "rewards/reward_func/mean": 0.6965094804763794, "rewards/reward_func/std": 0.18601341545581818, "step": 310, "step_time": 22.22887173295021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 291.0625, "completions/mean_terminated_length": 291.0625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.2998119741678238, "epoch": 0.014404817044928207, "frac_reward_zero_std": 0.0, "grad_norm": 0.06676234304904938, "kl": 0.0010350022348575294, "learning_rate": 9.971283001389531e-07, "loss": -0.0599, "num_tokens": 8632096.0, "reward": 0.8322875499725342, "reward_std": 0.22784748673439026, "rewards/reward_func/mean": 0.8322875499725342, "rewards/reward_func/std": 0.22784748673439026, "step": 311, "step_time": 31.391690842807293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 138.625, "completions/mean_terminated_length": 138.625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2928355559706688, "epoch": 0.01445113478462251, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009526091744191945, "kl": 0.0013767742784693837, "learning_rate": 9.971190365910142e-07, "loss": 0.0001, "num_tokens": 8655738.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 312, "step_time": 15.893251542001963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 180.1875, "completions/mean_terminated_length": 180.1875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3426474630832672, "epoch": 0.014497452524316813, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005128518678247929, "kl": 0.0011518497776705772, "learning_rate": 9.971097730430754e-07, "loss": 0.0001, "num_tokens": 8686589.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 313, "step_time": 20.093024745583534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 171.125, "completions/mean_terminated_length": 171.125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.268556572496891, "epoch": 0.014543770264011116, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005823679966852069, "kl": 0.0009972721163649112, "learning_rate": 9.971005094951365e-07, "loss": 0.0001, "num_tokens": 8710559.0, "reward": 0.9394130706787109, "reward_std": 0.0, "rewards/reward_func/mean": 0.9394130706787109, "rewards/reward_func/std": 0.0, "step": 314, "step_time": 18.503258530050516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 145.6875, "completions/mean_terminated_length": 145.6875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.37556004524230957, "epoch": 0.014590088003705419, "frac_reward_zero_std": 1.0, "grad_norm": 0.001303147291764617, "kl": 0.0015021058497950435, "learning_rate": 9.970912459471978e-07, "loss": 0.0001, "num_tokens": 8741338.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 315, "step_time": 20.34307497739792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 127.6875, "completions/mean_terminated_length": 127.6875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2628978341817856, "epoch": 0.014636405743399722, "frac_reward_zero_std": 1.0, "grad_norm": 0.000976155512034893, "kl": 0.0014791136200074106, "learning_rate": 9.97081982399259e-07, "loss": 0.0001, "num_tokens": 8761637.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 316, "step_time": 14.552318941801786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 209.8125, "completions/mean_terminated_length": 209.8125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.2628732994198799, "epoch": 0.014682723483094025, "frac_reward_zero_std": 0.0, "grad_norm": 0.09145770221948624, "kl": 0.0013813844125252217, "learning_rate": 9.9707271885132e-07, "loss": -0.036, "num_tokens": 8790562.0, "reward": 0.8237026929855347, "reward_std": 0.18207906186580658, "rewards/reward_func/mean": 0.8237026929855347, "rewards/reward_func/std": 0.18207907676696777, "step": 317, "step_time": 22.769946806132793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 138.6875, "completions/mean_terminated_length": 138.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3076990395784378, "epoch": 0.014729041222788328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011036385549232364, "kl": 0.0017139567353297025, "learning_rate": 9.97063455303381e-07, "loss": 0.0001, "num_tokens": 8811389.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 318, "step_time": 15.330391250550747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 139.25, "completions/mean_terminated_length": 139.25, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2163013182580471, "epoch": 0.01477535896248263, "frac_reward_zero_std": 0.0, "grad_norm": 0.07880755513906479, "kl": 0.0011645381164271384, "learning_rate": 9.970541917554423e-07, "loss": 0.0183, "num_tokens": 8832513.0, "reward": 0.8862214088439941, "reward_std": 0.04104293882846832, "rewards/reward_func/mean": 0.8862214088439941, "rewards/reward_func/std": 0.04104295372962952, "step": 319, "step_time": 15.956313017755747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 159.75, "completions/mean_terminated_length": 159.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.38351473957300186, "epoch": 0.014821676702176934, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007289415807463229, "kl": 0.0014589433558285236, "learning_rate": 9.970449282075035e-07, "loss": 0.0001, "num_tokens": 8855965.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 320, "step_time": 18.83362502232194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 187.5625, "completions/mean_terminated_length": 187.5625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.33954180777072906, "epoch": 0.014867994441871236, "frac_reward_zero_std": 0.0, "grad_norm": 0.08285839110612869, "kl": 0.0012769268942065537, "learning_rate": 9.970356646595646e-07, "loss": 0.0345, "num_tokens": 8879094.0, "reward": 0.8875277638435364, "reward_std": 0.23965653777122498, "rewards/reward_func/mean": 0.8875277638435364, "rewards/reward_func/std": 0.23965655267238617, "step": 321, "step_time": 21.82441758364439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 133.9375, "completions/mean_terminated_length": 133.9375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3537861630320549, "epoch": 0.01491431218156554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008833041065372527, "kl": 0.0018543840560596436, "learning_rate": 9.970264011116257e-07, "loss": 0.0001, "num_tokens": 8926021.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 322, "step_time": 21.40423509478569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 192.375, "completions/mean_terminated_length": 192.375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.25622427463531494, "epoch": 0.014960629921259842, "frac_reward_zero_std": 0.0, "grad_norm": 0.05758245661854744, "kl": 0.0009644064848544076, "learning_rate": 9.970171375636868e-07, "loss": 0.0055, "num_tokens": 8948891.0, "reward": 0.9782751798629761, "reward_std": 0.03886253759264946, "rewards/reward_func/mean": 0.9782751798629761, "rewards/reward_func/std": 0.03886254131793976, "step": 323, "step_time": 21.875840231776237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 206.3125, "completions/mean_terminated_length": 206.3125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.27018605917692184, "epoch": 0.015006947660954145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006027469644322991, "kl": 0.001118411382776685, "learning_rate": 9.97007874015748e-07, "loss": 0.0001, "num_tokens": 8979616.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 324, "step_time": 23.039261762052774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 187.125, "completions/mean_terminated_length": 187.125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.32924505323171616, "epoch": 0.015053265400648448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006592704448848963, "kl": 0.001322623051237315, "learning_rate": 9.96998610467809e-07, "loss": 0.0001, "num_tokens": 9036930.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 325, "step_time": 28.929474364966154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 177.6875, "completions/mean_terminated_length": 177.6875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.2153313159942627, "epoch": 0.015099583140342751, "frac_reward_zero_std": 1.0, "grad_norm": 0.000623970408923924, "kl": 0.0009110145183512941, "learning_rate": 9.969893469198702e-07, "loss": 0.0, "num_tokens": 9057949.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 326, "step_time": 17.84788030385971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 134.375, "completions/mean_terminated_length": 134.375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.221485685557127, "epoch": 0.015145900880037054, "frac_reward_zero_std": 1.0, "grad_norm": 0.003161739557981491, "kl": 0.0013649599568452686, "learning_rate": 9.969800833719313e-07, "loss": 0.0001, "num_tokens": 9077795.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 327, "step_time": 14.65321834385395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 167.5, "completions/mean_terminated_length": 167.5, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.22197071090340614, "epoch": 0.015192218619731357, "frac_reward_zero_std": 0.0, "grad_norm": 0.08835534751415253, "kl": 0.0013470058620441705, "learning_rate": 9.969708198239927e-07, "loss": -0.0439, "num_tokens": 9099611.0, "reward": 0.5268779397010803, "reward_std": 0.06872842460870743, "rewards/reward_func/mean": 0.5268779397010803, "rewards/reward_func/std": 0.06872842460870743, "step": 328, "step_time": 17.72628043591976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 189.375, "completions/mean_terminated_length": 189.375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.14309130609035492, "epoch": 0.01523853635942566, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003652969899121672, "kl": 0.0005090439590276219, "learning_rate": 9.969615562760538e-07, "loss": 0.0, "num_tokens": 9134081.0, "reward": 0.9607894420623779, "reward_std": 0.0, "rewards/reward_func/mean": 0.9607894420623779, "rewards/reward_func/std": 0.0, "step": 329, "step_time": 21.34814863279462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 218.0, "completions/mean_terminated_length": 218.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.4148356765508652, "epoch": 0.015284854099119963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006424064631573856, "kl": 0.001229463203344494, "learning_rate": 9.96952292728115e-07, "loss": 0.0001, "num_tokens": 9155745.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 330, "step_time": 23.497181992977858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 139.0, "completions/mean_terminated_length": 139.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.32033444195985794, "epoch": 0.015331171838814266, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008130758069455624, "kl": 0.0012869780184701085, "learning_rate": 9.969430291801758e-07, "loss": 0.0001, "num_tokens": 9175809.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 331, "step_time": 15.519235752522945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 166.25, "completions/mean_terminated_length": 166.25, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.2561548873782158, "epoch": 0.015377489578508569, "frac_reward_zero_std": 0.0, "grad_norm": 0.09667813777923584, "kl": 0.0011559998383745551, "learning_rate": 9.969337656322372e-07, "loss": 0.0057, "num_tokens": 9196565.0, "reward": 0.8072022795677185, "reward_std": 0.17560791969299316, "rewards/reward_func/mean": 0.8072022795677185, "rewards/reward_func/std": 0.17560791969299316, "step": 332, "step_time": 17.828052032738924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 243.125, "completions/mean_terminated_length": 243.125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.3886435329914093, "epoch": 0.015423807318202871, "frac_reward_zero_std": 0.0, "grad_norm": 0.0727555900812149, "kl": 0.0022208106820471585, "learning_rate": 9.969245020842983e-07, "loss": -0.15, "num_tokens": 9230295.0, "reward": 0.1661306470632553, "reward_std": 0.35719001293182373, "rewards/reward_func/mean": 0.1661306470632553, "rewards/reward_func/std": 0.35719001293182373, "step": 333, "step_time": 29.55166383087635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 172.1875, "completions/mean_terminated_length": 172.1875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.38166338950395584, "epoch": 0.015470125057897174, "frac_reward_zero_std": 1.0, "grad_norm": 0.001050234423018992, "kl": 0.0015453605155926198, "learning_rate": 9.969152385363594e-07, "loss": 0.0001, "num_tokens": 9269242.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 334, "step_time": 22.607544537633657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 155.75, "completions/mean_terminated_length": 155.75, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.36232052743434906, "epoch": 0.015516442797591477, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012394188670441508, "kl": 0.0012941665190737695, "learning_rate": 9.969059749884205e-07, "loss": 0.0001, "num_tokens": 9295750.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 335, "step_time": 18.219283301383257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 174.1875, "completions/mean_terminated_length": 174.1875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3761717975139618, "epoch": 0.01556276053728578, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016768844798207283, "kl": 0.0014611249644076452, "learning_rate": 9.968967114404817e-07, "loss": 0.0001, "num_tokens": 9317465.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 336, "step_time": 18.309650901705027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 155.6875, "completions/mean_terminated_length": 155.6875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.21403182297945023, "epoch": 0.015609078276980083, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008231138344854116, "kl": 0.0009362594864796847, "learning_rate": 9.968874478925428e-07, "loss": 0.0, "num_tokens": 9346772.0, "reward": 0.8824968934059143, "reward_std": 0.0, "rewards/reward_func/mean": 0.8824968934059143, "rewards/reward_func/std": 0.0, "step": 337, "step_time": 19.34696962684393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 171.125, "completions/mean_terminated_length": 171.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.33988913148641586, "epoch": 0.015655396016674386, "frac_reward_zero_std": 1.0, "grad_norm": 0.004363682121038437, "kl": 0.002511500206310302, "learning_rate": 9.96878184344604e-07, "loss": 0.0001, "num_tokens": 9372294.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 338, "step_time": 19.320641227066517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 164.875, "completions/mean_terminated_length": 164.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3785661906003952, "epoch": 0.01570171375636869, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015458170091733336, "kl": 0.0016905050433706492, "learning_rate": 9.96868920796665e-07, "loss": 0.0001, "num_tokens": 9393844.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 339, "step_time": 20.65398971363902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 165.1875, "completions/mean_terminated_length": 165.1875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4234740734100342, "epoch": 0.015748031496062992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007248983602039516, "kl": 0.0012707824644166976, "learning_rate": 9.968596572487262e-07, "loss": 0.0001, "num_tokens": 9417047.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 340, "step_time": 17.50099764764309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 181.6875, "completions/mean_terminated_length": 181.6875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3823828175663948, "epoch": 0.015794349235757295, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007550466689281166, "kl": 0.0013036046584602445, "learning_rate": 9.968503937007873e-07, "loss": 0.0001, "num_tokens": 9440706.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 341, "step_time": 19.994986213743687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 183.6875, "completions/mean_terminated_length": 183.6875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.42326007038354874, "epoch": 0.015840666975451598, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004449597036000341, "kl": 0.0012013615923933685, "learning_rate": 9.968411301528486e-07, "loss": 0.0001, "num_tokens": 9461901.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 342, "step_time": 19.592860255390406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 117.6875, "completions/mean_terminated_length": 117.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.25822774320840836, "epoch": 0.0158869847151459, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022317171096801758, "kl": 0.001495075732236728, "learning_rate": 9.968318666049095e-07, "loss": 0.0001, "num_tokens": 9482216.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 343, "step_time": 13.19613303616643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 107.6875, "completions/mean_terminated_length": 107.6875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.30405353754758835, "epoch": 0.015933302454840204, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007039046613499522, "kl": 0.0011159517744090408, "learning_rate": 9.968226030569707e-07, "loss": 0.0001, "num_tokens": 9503075.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 344, "step_time": 12.520101103931665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 159.875, "completions/mean_terminated_length": 159.875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.32665305584669113, "epoch": 0.015979620194534506, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006344412686303258, "kl": 0.0013704613083973527, "learning_rate": 9.96813339509032e-07, "loss": 0.0001, "num_tokens": 9538001.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 345, "step_time": 20.899267457425594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 159.0625, "completions/mean_terminated_length": 159.0625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.21141353622078896, "epoch": 0.01602593793422881, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004040278145112097, "kl": 0.0006534120911965147, "learning_rate": 9.968040759610931e-07, "loss": 0.0, "num_tokens": 9567522.0, "reward": 0.8751733303070068, "reward_std": 0.0, "rewards/reward_func/mean": 0.8751733303070068, "rewards/reward_func/std": 0.0, "step": 346, "step_time": 17.915325086563826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 237.5, "completions/mean_terminated_length": 237.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.3383919596672058, "epoch": 0.016072255673923112, "frac_reward_zero_std": 0.0, "grad_norm": 0.059448838233947754, "kl": 0.0011732338170986623, "learning_rate": 9.967948124131542e-07, "loss": -0.0441, "num_tokens": 9605194.0, "reward": 0.3904230296611786, "reward_std": 0.3123384416103363, "rewards/reward_func/mean": 0.3904230296611786, "rewards/reward_func/std": 0.3123384416103363, "step": 347, "step_time": 32.66446267068386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 148.5625, "completions/mean_terminated_length": 148.5625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3652502968907356, "epoch": 0.016118573413617415, "frac_reward_zero_std": 1.0, "grad_norm": 0.001007839571684599, "kl": 0.001655446772929281, "learning_rate": 9.967855488652154e-07, "loss": 0.0001, "num_tokens": 9628371.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 348, "step_time": 16.39089348167181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 161.5625, "completions/mean_terminated_length": 161.5625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.3960127979516983, "epoch": 0.016164891153311718, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006725586717948318, "kl": 0.001324401848251, "learning_rate": 9.967762853172765e-07, "loss": 0.0001, "num_tokens": 9653612.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 349, "step_time": 17.55132332444191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 172.0625, "completions/mean_terminated_length": 172.0625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.20959357172250748, "epoch": 0.01621120889300602, "frac_reward_zero_std": 0.0, "grad_norm": 0.08087478578090668, "kl": 0.0009161113994196057, "learning_rate": 9.967670217693376e-07, "loss": 0.0043, "num_tokens": 9683773.0, "reward": 0.34488698840141296, "reward_std": 0.09196986258029938, "rewards/reward_func/mean": 0.34488698840141296, "rewards/reward_func/std": 0.09196987003087997, "step": 350, "step_time": 19.12022588402033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 188.9375, "completions/mean_terminated_length": 188.9375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.3701925128698349, "epoch": 0.016257526632700324, "frac_reward_zero_std": 1.0, "grad_norm": 0.000581101980060339, "kl": 0.0014219658623915166, "learning_rate": 9.967577582213987e-07, "loss": 0.0001, "num_tokens": 9711388.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 351, "step_time": 20.990219868719578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 200.4375, "completions/mean_terminated_length": 200.4375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.36787600815296173, "epoch": 0.016303844372394627, "frac_reward_zero_std": 0.0, "grad_norm": 0.07708732038736343, "kl": 0.0013787887146463618, "learning_rate": 9.967484946734599e-07, "loss": -0.1613, "num_tokens": 9733347.0, "reward": 0.21756526827812195, "reward_std": 0.39111003279685974, "rewards/reward_func/mean": 0.21756526827812195, "rewards/reward_func/std": 0.39111006259918213, "step": 352, "step_time": 25.04696473479271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 192.8125, "completions/mean_terminated_length": 192.8125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.18251312896609306, "epoch": 0.01635016211208893, "frac_reward_zero_std": 0.0, "grad_norm": 0.06357643008232117, "kl": 0.001038837610394694, "learning_rate": 9.96739231125521e-07, "loss": -0.0092, "num_tokens": 9771056.0, "reward": 0.7122365236282349, "reward_std": 0.23176871240139008, "rewards/reward_func/mean": 0.7122365236282349, "rewards/reward_func/std": 0.23176871240139008, "step": 353, "step_time": 22.896654035896063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 233.625, "completions/mean_terminated_length": 233.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.16217869520187378, "epoch": 0.016396479851783233, "frac_reward_zero_std": 1.0, "grad_norm": 0.00035185046726837754, "kl": 0.0006140474142739549, "learning_rate": 9.967299675775821e-07, "loss": 0.0, "num_tokens": 9797290.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 354, "step_time": 22.49565551057458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 171.0625, "completions/mean_terminated_length": 171.0625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.39254141598939896, "epoch": 0.016442797591477536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011137102264910936, "kl": 0.00146244463394396, "learning_rate": 9.967207040296432e-07, "loss": 0.0001, "num_tokens": 9845755.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 355, "step_time": 24.57612419500947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 128.5, "completions/mean_terminated_length": 128.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.19935456290841103, "epoch": 0.01648911533117184, "frac_reward_zero_std": 1.0, "grad_norm": 0.000986173516139388, "kl": 0.0009092786349356174, "learning_rate": 9.967114404817044e-07, "loss": 0.0, "num_tokens": 9865203.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 356, "step_time": 13.655712105333805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 118.875, "completions/mean_terminated_length": 118.875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.21422869712114334, "epoch": 0.01653543307086614, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006323560373857617, "kl": 0.0007846915978007019, "learning_rate": 9.967021769337655e-07, "loss": 0.0, "num_tokens": 9886929.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 357, "step_time": 14.202369064092636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 170.0, "completions/mean_terminated_length": 170.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.17644475027918816, "epoch": 0.016581750810560444, "frac_reward_zero_std": 0.0, "grad_norm": 0.07472806423902512, "kl": 0.0007407554658129811, "learning_rate": 9.966929133858266e-07, "loss": -0.0007, "num_tokens": 9909873.0, "reward": 0.9164585471153259, "reward_std": 0.03093360736966133, "rewards/reward_func/mean": 0.9164585471153259, "rewards/reward_func/std": 0.03093361109495163, "step": 358, "step_time": 17.908420998603106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 135.0625, "completions/mean_terminated_length": 135.0625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.36569082736968994, "epoch": 0.016628068550254747, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007995835039764643, "kl": 0.0013750218786299229, "learning_rate": 9.96683649837888e-07, "loss": 0.0001, "num_tokens": 9945794.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 359, "step_time": 18.089062709361315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 110.8125, "completions/mean_terminated_length": 110.8125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.26153818517923355, "epoch": 0.01667438628994905, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011588912457227707, "kl": 0.0012160380429122597, "learning_rate": 9.96674386289949e-07, "loss": 0.0001, "num_tokens": 9965647.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 360, "step_time": 12.353236109018326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 196.0, "completions/mean_terminated_length": 196.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.2633475586771965, "epoch": 0.016720704029643353, "frac_reward_zero_std": 0.0, "grad_norm": 0.07660730928182602, "kl": 0.0011788938863901421, "learning_rate": 9.9666512274201e-07, "loss": 0.0141, "num_tokens": 9995535.0, "reward": 0.9712770581245422, "reward_std": 0.11489176005125046, "rewards/reward_func/mean": 0.9712770581245422, "rewards/reward_func/std": 0.11489175260066986, "step": 361, "step_time": 24.44135208800435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 120.75, "completions/mean_terminated_length": 120.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2666449323296547, "epoch": 0.016767021769337656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006468050414696336, "kl": 0.0010750905494205654, "learning_rate": 9.966558591940713e-07, "loss": 0.0001, "num_tokens": 10016587.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 362, "step_time": 14.386831555515528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 152.0, "completions/mean_terminated_length": 152.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.32977041602134705, "epoch": 0.01681333950903196, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009706729324534535, "kl": 0.0012442492734408006, "learning_rate": 9.966465956461325e-07, "loss": 0.0001, "num_tokens": 10043947.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 363, "step_time": 17.256726995110512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 139.5625, "completions/mean_terminated_length": 139.5625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3330072611570358, "epoch": 0.016859657248726262, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009713650797493756, "kl": 0.0013287764450069517, "learning_rate": 9.966373320981936e-07, "loss": 0.0001, "num_tokens": 10079988.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 364, "step_time": 18.95854353159666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 164.875, "completions/mean_terminated_length": 164.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3273973688483238, "epoch": 0.016905974988420565, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006421417347155511, "kl": 0.0012357144441921264, "learning_rate": 9.966280685502547e-07, "loss": 0.0001, "num_tokens": 10100770.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 365, "step_time": 17.175654880702496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 202.8125, "completions/mean_terminated_length": 202.8125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.30211323499679565, "epoch": 0.016952292728114868, "frac_reward_zero_std": 1.0, "grad_norm": 0.007135492283850908, "kl": 0.008094704942777753, "learning_rate": 9.966188050023158e-07, "loss": 0.0004, "num_tokens": 10126799.0, "reward": 0.8668779134750366, "reward_std": 0.0, "rewards/reward_func/mean": 0.8668779134750366, "rewards/reward_func/std": 0.0, "step": 366, "step_time": 24.911034680902958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 157.8125, "completions/mean_terminated_length": 157.8125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3083705008029938, "epoch": 0.01699861046780917, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006156533490866423, "kl": 0.0010671147028915584, "learning_rate": 9.96609541454377e-07, "loss": 0.0001, "num_tokens": 10152428.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 367, "step_time": 17.69282505661249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 184.9375, "completions/mean_terminated_length": 184.9375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.3511301353573799, "epoch": 0.017044928207503474, "frac_reward_zero_std": 0.0, "grad_norm": 0.08407903462648392, "kl": 0.0012649931013584137, "learning_rate": 9.96600277906438e-07, "loss": -0.007, "num_tokens": 10175419.0, "reward": 0.4160774350166321, "reward_std": 0.4874512851238251, "rewards/reward_func/mean": 0.4160774350166321, "rewards/reward_func/std": 0.48745131492614746, "step": 368, "step_time": 19.776455257087946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 125.6875, "completions/mean_terminated_length": 125.6875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2581064775586128, "epoch": 0.017091245947197777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013785755727440119, "kl": 0.00167951884213835, "learning_rate": 9.965910143584992e-07, "loss": 0.0001, "num_tokens": 10195782.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 369, "step_time": 13.841001875698566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 181.3125, "completions/mean_terminated_length": 181.3125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.22950803488492966, "epoch": 0.01713756368689208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007128478027880192, "kl": 0.0009088489605346695, "learning_rate": 9.965817508105603e-07, "loss": 0.0, "num_tokens": 10221691.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 370, "step_time": 19.305286843329668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 214.0, "completions/mean_terminated_length": 214.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.31679457426071167, "epoch": 0.017183881426586382, "frac_reward_zero_std": 0.0, "grad_norm": 0.048416707664728165, "kl": 0.0011358647898305207, "learning_rate": 9.965724872626215e-07, "loss": -0.065, "num_tokens": 10251579.0, "reward": 0.9204668998718262, "reward_std": 0.2546977400779724, "rewards/reward_func/mean": 0.9204668998718262, "rewards/reward_func/std": 0.2546977400779724, "step": 371, "step_time": 24.533843584358692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 184.0625, "completions/mean_terminated_length": 184.0625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.1564195305109024, "epoch": 0.017230199166280685, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004370823735371232, "kl": 0.0006024371250532568, "learning_rate": 9.965632237146828e-07, "loss": 0.0, "num_tokens": 10286716.0, "reward": 0.8385766744613647, "reward_std": 0.0, "rewards/reward_func/mean": 0.8385766744613647, "rewards/reward_func/std": 0.0, "step": 372, "step_time": 21.68499232083559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 113.9375, "completions/mean_terminated_length": 113.9375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.282099224627018, "epoch": 0.017276516905974988, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009552930714562535, "kl": 0.0012493654940044507, "learning_rate": 9.96553960166744e-07, "loss": 0.0001, "num_tokens": 10307515.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 373, "step_time": 15.568065013736486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 183.75, "completions/mean_terminated_length": 183.75, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.4235863834619522, "epoch": 0.01732283464566929, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008119754493236542, "kl": 0.0013793996185995638, "learning_rate": 9.965446966188048e-07, "loss": 0.0001, "num_tokens": 10337463.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 374, "step_time": 21.269214287400246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 125.375, "completions/mean_terminated_length": 125.375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2954377830028534, "epoch": 0.017369152385363594, "frac_reward_zero_std": 1.0, "grad_norm": 0.00681885052472353, "kl": 0.003083104733377695, "learning_rate": 9.965354330708662e-07, "loss": 0.0002, "num_tokens": 10373789.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 375, "step_time": 17.8924512937665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 208.1875, "completions/mean_terminated_length": 208.1875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.22619051113724709, "epoch": 0.017415470125057897, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009632823057472706, "kl": 0.001133952711825259, "learning_rate": 9.965261695229273e-07, "loss": 0.0001, "num_tokens": 10404960.0, "reward": 0.894839346408844, "reward_std": 0.0, "rewards/reward_func/mean": 0.894839346408844, "rewards/reward_func/std": 0.0, "step": 376, "step_time": 22.028299398720264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.37804020941257477, "epoch": 0.0174617878647522, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007527231355197728, "kl": 0.0013605688291136175, "learning_rate": 9.965169059749884e-07, "loss": 0.0001, "num_tokens": 10432332.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 377, "step_time": 18.87598704174161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 152.5625, "completions/mean_terminated_length": 152.5625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.42359697818756104, "epoch": 0.017508105604446503, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008097761310636997, "kl": 0.001501502498285845, "learning_rate": 9.965076424270495e-07, "loss": 0.0001, "num_tokens": 10483685.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 378, "step_time": 22.994311198592186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 134.0, "completions/mean_terminated_length": 134.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.31661297380924225, "epoch": 0.017554423344140806, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008577098487876356, "kl": 0.0014004443655721843, "learning_rate": 9.964983788791107e-07, "loss": 0.0001, "num_tokens": 10505093.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 379, "step_time": 15.779139500111341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 179.0625, "completions/mean_terminated_length": 179.0625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.387944720685482, "epoch": 0.01760074108383511, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007746884948574007, "kl": 0.0016006549121811986, "learning_rate": 9.964891153311718e-07, "loss": 0.0001, "num_tokens": 10564198.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 380, "step_time": 29.4970294944942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 185.9375, "completions/mean_terminated_length": 185.9375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.42155060917139053, "epoch": 0.01764705882352941, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006630704738199711, "kl": 0.0011422887037042528, "learning_rate": 9.96479851783233e-07, "loss": 0.0001, "num_tokens": 10590917.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 381, "step_time": 19.99205644056201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 183.375, "completions/mean_terminated_length": 183.375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.33681362122297287, "epoch": 0.017693376563223714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008519992115907371, "kl": 0.001492333714850247, "learning_rate": 9.96470588235294e-07, "loss": 0.0001, "num_tokens": 10618075.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 382, "step_time": 20.17256711423397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 154.9375, "completions/mean_terminated_length": 154.9375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.322447806596756, "epoch": 0.017739694302918017, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006841674912720919, "kl": 0.0012063782196491957, "learning_rate": 9.964613246873552e-07, "loss": 0.0001, "num_tokens": 10639658.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 383, "step_time": 16.244937404990196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 190.0625, "completions/mean_terminated_length": 190.0625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.1852053627371788, "epoch": 0.01778601204261232, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005796203622594476, "kl": 0.0007577495707664639, "learning_rate": 9.964520611394163e-07, "loss": 0.0, "num_tokens": 10678235.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 384, "step_time": 26.06255165860057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 199.25, "completions/mean_terminated_length": 199.25, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.3715161606669426, "epoch": 0.017832329782306623, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007285014144144952, "kl": 0.0013633314811158925, "learning_rate": 9.964427975914776e-07, "loss": 0.0001, "num_tokens": 10699711.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 385, "step_time": 19.731825433671474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 210.0, "completions/mean_terminated_length": 210.0, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.26311708986759186, "epoch": 0.017878647522000926, "frac_reward_zero_std": 0.0, "grad_norm": 0.07385022193193436, "kl": 0.001333654799964279, "learning_rate": 9.964335340435385e-07, "loss": -0.0118, "num_tokens": 10734239.0, "reward": 0.9915130138397217, "reward_std": 0.015181982889771461, "rewards/reward_func/mean": 0.9915130138397217, "rewards/reward_func/std": 0.015181982889771461, "step": 386, "step_time": 23.218905702233315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 180.75, "completions/mean_terminated_length": 180.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.24060431122779846, "epoch": 0.01792496526169523, "frac_reward_zero_std": 0.0, "grad_norm": 0.12353584170341492, "kl": 0.0016247373714577407, "learning_rate": 9.964242704955997e-07, "loss": -0.0333, "num_tokens": 10759387.0, "reward": 0.13478919863700867, "reward_std": 0.11306636780500412, "rewards/reward_func/mean": 0.13478919863700867, "rewards/reward_func/std": 0.11306636780500412, "step": 387, "step_time": 20.02836049720645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 135.0, "completions/mean_terminated_length": 135.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.29737626016139984, "epoch": 0.017971283001389532, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006051292875781655, "kl": 0.0011766636307584122, "learning_rate": 9.964150069476608e-07, "loss": 0.0001, "num_tokens": 10780139.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 388, "step_time": 13.97381442412734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 130.0, "completions/mean_terminated_length": 130.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.24731147661805153, "epoch": 0.018017600741083835, "frac_reward_zero_std": 1.0, "grad_norm": 0.00118044123519212, "kl": 0.0010196207003900781, "learning_rate": 9.964057433997221e-07, "loss": 0.0001, "num_tokens": 10799963.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 389, "step_time": 13.613224472850561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4084602892398834, "epoch": 0.018063918480778138, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006236043991521001, "kl": 0.0011252271651756018, "learning_rate": 9.963964798517833e-07, "loss": 0.0001, "num_tokens": 10833107.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 390, "step_time": 19.000489212572575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 133.3125, "completions/mean_terminated_length": 133.3125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2630554661154747, "epoch": 0.01811023622047244, "frac_reward_zero_std": 1.0, "grad_norm": 0.000608460686635226, "kl": 0.0009470109653193504, "learning_rate": 9.963872163038444e-07, "loss": 0.0, "num_tokens": 10859032.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 391, "step_time": 16.39188402891159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 181.9375, "completions/mean_terminated_length": 181.9375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3491543009877205, "epoch": 0.018156553960166744, "frac_reward_zero_std": 0.0, "grad_norm": 0.0723331868648529, "kl": 0.0010232469794573262, "learning_rate": 9.963779527559055e-07, "loss": 0.0248, "num_tokens": 10879879.0, "reward": 0.6944708228111267, "reward_std": 0.4141024053096771, "rewards/reward_func/mean": 0.6944708228111267, "rewards/reward_func/std": 0.4141024053096771, "step": 392, "step_time": 21.867709532380104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 118.4375, "completions/mean_terminated_length": 118.4375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.27682629972696304, "epoch": 0.018202871699861047, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010108175920322537, "kl": 0.0014717382146045566, "learning_rate": 9.963686892079666e-07, "loss": 0.0001, "num_tokens": 10900654.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 393, "step_time": 13.679325930774212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 174.875, "completions/mean_terminated_length": 174.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.42017994076013565, "epoch": 0.01824918943955535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009347493760287762, "kl": 0.0015324850683100522, "learning_rate": 9.963594256600278e-07, "loss": 0.0001, "num_tokens": 10924332.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 394, "step_time": 19.324608132243156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 121.875, "completions/mean_terminated_length": 121.875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.23197819292545319, "epoch": 0.018295507179249652, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008783585508354008, "kl": 0.0011164712195750326, "learning_rate": 9.963501621120889e-07, "loss": 0.0001, "num_tokens": 10943642.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 395, "step_time": 14.114130672067404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 194.5625, "completions/mean_terminated_length": 194.5625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.241884246468544, "epoch": 0.018341824918943955, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007886892999522388, "kl": 0.0010119648650288582, "learning_rate": 9.9634089856415e-07, "loss": 0.0001, "num_tokens": 10967651.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 396, "step_time": 20.774743512272835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 182.375, "completions/mean_terminated_length": 182.375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.1793980784714222, "epoch": 0.018388142658638258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007735713152214885, "kl": 0.0008657000289531425, "learning_rate": 9.963316350162111e-07, "loss": 0.0, "num_tokens": 10991209.0, "reward": 0.9574533700942993, "reward_std": 0.0, "rewards/reward_func/mean": 0.9574533700942993, "rewards/reward_func/std": 0.0, "step": 397, "step_time": 19.160449791699648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.4707822874188423, "epoch": 0.01843446039833256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004372514085844159, "kl": 0.0012005920871160924, "learning_rate": 9.963223714682723e-07, "loss": 0.0001, "num_tokens": 11026235.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 398, "step_time": 20.6530371196568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 114.6875, "completions/mean_terminated_length": 114.6875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2661431133747101, "epoch": 0.018480778138026864, "frac_reward_zero_std": 1.0, "grad_norm": 0.000946751213632524, "kl": 0.0011715970758814365, "learning_rate": 9.963131079203334e-07, "loss": 0.0001, "num_tokens": 11046150.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 399, "step_time": 13.03931139409542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 141.875, "completions/mean_terminated_length": 141.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3072554022073746, "epoch": 0.018527095877721167, "frac_reward_zero_std": 1.0, "grad_norm": 0.043054718524217606, "kl": 0.007840050500817597, "learning_rate": 9.963038443723945e-07, "loss": 0.0004, "num_tokens": 11067684.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 400, "step_time": 16.174638397991657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 176.25, "completions/mean_terminated_length": 176.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.24510730803012848, "epoch": 0.01857341361741547, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007515113684348762, "kl": 0.0011971300991717726, "learning_rate": 9.962945808244556e-07, "loss": 0.0001, "num_tokens": 11099912.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 401, "step_time": 19.840820968151093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 193.0625, "completions/mean_terminated_length": 193.0625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.18754839524626732, "epoch": 0.018619731357109773, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005197523278184235, "kl": 0.000750317718484439, "learning_rate": 9.96285317276517e-07, "loss": 0.0, "num_tokens": 11125481.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 402, "step_time": 21.889157086610794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 144.75, "completions/mean_terminated_length": 144.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.20500480011105537, "epoch": 0.018666049096804076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009929519146680832, "kl": 0.0012351693003438413, "learning_rate": 9.96276053728578e-07, "loss": 0.0001, "num_tokens": 11146165.0, "reward": 0.904837429523468, "reward_std": 0.0, "rewards/reward_func/mean": 0.904837429523468, "rewards/reward_func/std": 0.0, "step": 403, "step_time": 15.625893365591764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 124.25, "completions/mean_terminated_length": 124.25, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2573729380965233, "epoch": 0.01871236683649838, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011042029364034534, "kl": 0.001303912722505629, "learning_rate": 9.962667901806392e-07, "loss": 0.0001, "num_tokens": 11165529.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 404, "step_time": 13.343416448682547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3041428029537201, "epoch": 0.01875868457619268, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005377588095143437, "kl": 0.0009079412993742153, "learning_rate": 9.962575266327003e-07, "loss": 0.0, "num_tokens": 11187141.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 405, "step_time": 14.533459562808275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 172.25, "completions/mean_terminated_length": 172.25, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.16587964445352554, "epoch": 0.018805002315886984, "frac_reward_zero_std": 1.0, "grad_norm": 0.001329808379523456, "kl": 0.0007990232334122993, "learning_rate": 9.962482630847615e-07, "loss": 0.0, "num_tokens": 11222169.0, "reward": 0.9091564416885376, "reward_std": 0.0, "rewards/reward_func/mean": 0.9091564416885376, "rewards/reward_func/std": 0.0, "step": 406, "step_time": 21.331826210021973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 134.375, "completions/mean_terminated_length": 134.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3556923270225525, "epoch": 0.018851320055581287, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015332532348111272, "kl": 0.002596778911538422, "learning_rate": 9.962389995368226e-07, "loss": 0.0001, "num_tokens": 11279503.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 407, "step_time": 24.147029418498278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3321279138326645, "epoch": 0.01889763779527559, "frac_reward_zero_std": 1.0, "grad_norm": 0.00116622110363096, "kl": 0.0012340332905296236, "learning_rate": 9.962297359888837e-07, "loss": 0.0001, "num_tokens": 11300841.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 408, "step_time": 18.532040812075138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 197.5, "completions/mean_terminated_length": 197.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.2635272741317749, "epoch": 0.018943955534969893, "frac_reward_zero_std": 0.0, "grad_norm": 0.08912134915590286, "kl": 0.0022788423666497692, "learning_rate": 9.962204724409448e-07, "loss": -0.0253, "num_tokens": 11327393.0, "reward": 0.8944562077522278, "reward_std": 0.07349126785993576, "rewards/reward_func/mean": 0.8944562077522278, "rewards/reward_func/std": 0.07349127531051636, "step": 409, "step_time": 22.85629679635167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 183.6875, "completions/mean_terminated_length": 183.6875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.19783100858330727, "epoch": 0.018990273274664196, "frac_reward_zero_std": 0.0, "grad_norm": 0.07475640624761581, "kl": 0.0014248900697566569, "learning_rate": 9.96211208893006e-07, "loss": 0.0045, "num_tokens": 11364684.0, "reward": 0.9977275133132935, "reward_std": 0.009089890867471695, "rewards/reward_func/mean": 0.9977275133132935, "rewards/reward_func/std": 0.009089887142181396, "step": 410, "step_time": 22.609322797507048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 238.5625, "completions/mean_terminated_length": 238.5625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.28156252205371857, "epoch": 0.0190365910143585, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005377965862862766, "kl": 0.0008603379392297938, "learning_rate": 9.96201945345067e-07, "loss": 0.0, "num_tokens": 11387413.0, "reward": 0.9017226696014404, "reward_std": 0.0, "rewards/reward_func/mean": 0.9017226696014404, "rewards/reward_func/std": 0.0, "step": 411, "step_time": 23.784574691206217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 177.25, "completions/mean_terminated_length": 177.25, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.17503974959254265, "epoch": 0.019082908754052802, "frac_reward_zero_std": 0.0, "grad_norm": 0.06746372580528259, "kl": 0.0008583853050367907, "learning_rate": 9.961926817971282e-07, "loss": -0.0901, "num_tokens": 11419353.0, "reward": 0.8225798606872559, "reward_std": 0.23737215995788574, "rewards/reward_func/mean": 0.8225798606872559, "rewards/reward_func/std": 0.23737215995788574, "step": 412, "step_time": 20.757130481302738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 231.1875, "completions/mean_terminated_length": 231.1875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.3314223960042, "epoch": 0.019129226493747105, "frac_reward_zero_std": 0.0, "grad_norm": 0.06072376295924187, "kl": 0.0010566960263531655, "learning_rate": 9.961834182491893e-07, "loss": -0.0666, "num_tokens": 11457596.0, "reward": 0.5625, "reward_std": 0.5123475193977356, "rewards/reward_func/mean": 0.5625, "rewards/reward_func/std": 0.5123475790023804, "step": 413, "step_time": 27.22598084807396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 124.6875, "completions/mean_terminated_length": 124.6875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.26997319608926773, "epoch": 0.019175544233441408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008293652208521962, "kl": 0.0010149930603802204, "learning_rate": 9.961741547012505e-07, "loss": 0.0001, "num_tokens": 11480199.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 414, "step_time": 14.77862561494112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 173.0625, "completions/mean_terminated_length": 173.0625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.4134935438632965, "epoch": 0.01922186197313571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008249669335782528, "kl": 0.001957462925929576, "learning_rate": 9.961648911533118e-07, "loss": 0.0001, "num_tokens": 11509320.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 415, "step_time": 20.258842054754496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 203.8125, "completions/mean_terminated_length": 203.8125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.39648277312517166, "epoch": 0.019268179712830014, "frac_reward_zero_std": 1.0, "grad_norm": 0.002531531034037471, "kl": 0.001678121363511309, "learning_rate": 9.96155627605373e-07, "loss": 0.0001, "num_tokens": 11535669.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 416, "step_time": 21.772209532558918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 203.375, "completions/mean_terminated_length": 203.375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.45256636291742325, "epoch": 0.019314497452524317, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009294411283917725, "kl": 0.0014201795274857432, "learning_rate": 9.961463640574338e-07, "loss": 0.0001, "num_tokens": 11561291.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 417, "step_time": 22.37670413777232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 120.875, "completions/mean_terminated_length": 120.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2436264269053936, "epoch": 0.01936081519221862, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012272412423044443, "kl": 0.001068125871825032, "learning_rate": 9.96137100509495e-07, "loss": 0.0001, "num_tokens": 11581721.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 418, "step_time": 13.968612048774958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 192.6875, "completions/mean_terminated_length": 192.6875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.26581909507513046, "epoch": 0.019407132931912922, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004420572950039059, "kl": 0.0008505451696692035, "learning_rate": 9.961278369615563e-07, "loss": 0.0, "num_tokens": 11606228.0, "reward": 0.9146912097930908, "reward_std": 0.0, "rewards/reward_func/mean": 0.9146912097930908, "rewards/reward_func/std": 0.0, "step": 419, "step_time": 19.52475217729807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 177.25, "completions/mean_terminated_length": 177.25, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.23862052708864212, "epoch": 0.019453450671607225, "frac_reward_zero_std": 0.0, "grad_norm": 0.089634008705616, "kl": 0.000901429884834215, "learning_rate": 9.961185734136174e-07, "loss": 0.0194, "num_tokens": 11627816.0, "reward": 0.8680884838104248, "reward_std": 0.23149026930332184, "rewards/reward_func/mean": 0.8680884838104248, "rewards/reward_func/std": 0.23149026930332184, "step": 420, "step_time": 22.91817284375429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 173.25, "completions/mean_terminated_length": 173.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.41315415501594543, "epoch": 0.019499768411301528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009287703433074057, "kl": 0.0015140810573939234, "learning_rate": 9.961093098656785e-07, "loss": 0.0001, "num_tokens": 11649564.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 421, "step_time": 18.555513385683298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 163.5625, "completions/mean_terminated_length": 163.5625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3890605941414833, "epoch": 0.01954608615099583, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006287552532739937, "kl": 0.0012807418243028224, "learning_rate": 9.961000463177397e-07, "loss": 0.0001, "num_tokens": 11697301.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 422, "step_time": 22.50880254805088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 176.0, "completions/mean_terminated_length": 176.0, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.40944691747426987, "epoch": 0.019592403890690134, "frac_reward_zero_std": 1.0, "grad_norm": 0.01755422353744507, "kl": 0.004535476444289088, "learning_rate": 9.960907827698008e-07, "loss": 0.0002, "num_tokens": 11732965.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 423, "step_time": 21.59409398585558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 139.4375, "completions/mean_terminated_length": 139.4375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.31128521263599396, "epoch": 0.019638721630384437, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010005016811192036, "kl": 0.0014900580572430044, "learning_rate": 9.96081519221862e-07, "loss": 0.0001, "num_tokens": 11766268.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 424, "step_time": 18.541524816304445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.1634841077029705, "epoch": 0.01968503937007874, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008766019018366933, "kl": 0.0009638566698413342, "learning_rate": 9.96072255673923e-07, "loss": 0.0, "num_tokens": 11790020.0, "reward": 0.7958667874336243, "reward_std": 0.0, "rewards/reward_func/mean": 0.7958667874336243, "rewards/reward_func/std": 0.0, "step": 425, "step_time": 15.64836959168315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 118.375, "completions/mean_terminated_length": 118.375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.27606356143951416, "epoch": 0.019731357109773043, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008227747166529298, "kl": 0.0011175721156178042, "learning_rate": 9.960629921259842e-07, "loss": 0.0001, "num_tokens": 11813402.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 426, "step_time": 14.213473957031965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.1671118028461933, "epoch": 0.019777674849467346, "frac_reward_zero_std": 0.0, "grad_norm": 0.059474650770425797, "kl": 0.0008242270705522969, "learning_rate": 9.960537285780453e-07, "loss": 0.011, "num_tokens": 11847298.0, "reward": 0.9714365601539612, "reward_std": 0.029500193893909454, "rewards/reward_func/mean": 0.9714365601539612, "rewards/reward_func/std": 0.029500195756554604, "step": 427, "step_time": 20.827589195221663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 173.5625, "completions/mean_terminated_length": 173.5625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.2779013514518738, "epoch": 0.01982399258916165, "frac_reward_zero_std": 0.0, "grad_norm": 0.06270330399274826, "kl": 0.001159066567197442, "learning_rate": 9.960444650301064e-07, "loss": -0.0458, "num_tokens": 11870059.0, "reward": 0.4364950656890869, "reward_std": 0.04501661658287048, "rewards/reward_func/mean": 0.4364950656890869, "rewards/reward_func/std": 0.04501662030816078, "step": 428, "step_time": 19.117902901023626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 134.3125, "completions/mean_terminated_length": 134.3125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.324234202504158, "epoch": 0.01987031032885595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010779553558677435, "kl": 0.0014260683383326977, "learning_rate": 9.960352014821675e-07, "loss": 0.0001, "num_tokens": 11898240.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 429, "step_time": 16.36806231737137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 184.4375, "completions/mean_terminated_length": 184.4375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.21080321818590164, "epoch": 0.019916628068550254, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005658076261170208, "kl": 0.0007707481126999483, "learning_rate": 9.960259379342287e-07, "loss": 0.0, "num_tokens": 11939351.0, "reward": 0.8611735105514526, "reward_std": 0.0, "rewards/reward_func/mean": 0.8611735105514526, "rewards/reward_func/std": 0.0, "step": 430, "step_time": 24.574472688138485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 190.4375, "completions/mean_terminated_length": 190.4375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.2915353327989578, "epoch": 0.019962945808244557, "frac_reward_zero_std": 0.0, "grad_norm": 0.05731703341007233, "kl": 0.0008475943322991952, "learning_rate": 9.960166743862898e-07, "loss": -0.0397, "num_tokens": 11960782.0, "reward": 0.5870636701583862, "reward_std": 0.47294896841049194, "rewards/reward_func/mean": 0.5870636701583862, "rewards/reward_func/std": 0.47294896841049194, "step": 431, "step_time": 19.433797158300877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 176.4375, "completions/mean_terminated_length": 176.4375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.18618595600128174, "epoch": 0.02000926354793886, "frac_reward_zero_std": 1.0, "grad_norm": 0.00048418284859508276, "kl": 0.000596129655605182, "learning_rate": 9.960074108383511e-07, "loss": 0.0, "num_tokens": 12015493.0, "reward": 0.11362193524837494, "reward_std": 0.0, "rewards/reward_func/mean": 0.11362193524837494, "rewards/reward_func/std": 0.0, "step": 432, "step_time": 26.345379684120417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 182.8125, "completions/mean_terminated_length": 182.8125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.35556047409772873, "epoch": 0.020055581287633163, "frac_reward_zero_std": 1.0, "grad_norm": 0.000724254350643605, "kl": 0.0011773772130254656, "learning_rate": 9.959981472904123e-07, "loss": 0.0001, "num_tokens": 12068578.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 433, "step_time": 27.155123606324196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 151.375, "completions/mean_terminated_length": 151.375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.1845146119594574, "epoch": 0.020101899027327466, "frac_reward_zero_std": 1.0, "grad_norm": 0.000514318875502795, "kl": 0.000833735044579953, "learning_rate": 9.959888837424734e-07, "loss": 0.0, "num_tokens": 12098552.0, "reward": 0.3916056156158447, "reward_std": 0.0, "rewards/reward_func/mean": 0.3916056156158447, "rewards/reward_func/std": 0.0, "step": 434, "step_time": 17.356296803802252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 110.0, "completions/mean_terminated_length": 110.0, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.27629970014095306, "epoch": 0.02014821676702177, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009520486346445978, "kl": 0.0012683477252721786, "learning_rate": 9.959796201945345e-07, "loss": 0.0001, "num_tokens": 12118168.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 435, "step_time": 12.731515988707542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 181.9375, "completions/mean_terminated_length": 181.9375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.1562896929681301, "epoch": 0.020194534506716072, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003385805175639689, "kl": 0.0005081234630779363, "learning_rate": 9.959703566465956e-07, "loss": 0.0, "num_tokens": 12151799.0, "reward": 0.9000876545906067, "reward_std": 0.0, "rewards/reward_func/mean": 0.9000876545906067, "rewards/reward_func/std": 0.0, "step": 436, "step_time": 20.731846310198307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 111.5625, "completions/mean_terminated_length": 111.5625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.27672744169831276, "epoch": 0.020240852246410375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011817258782684803, "kl": 0.0015236828476190567, "learning_rate": 9.959610930986568e-07, "loss": 0.0001, "num_tokens": 12173632.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 437, "step_time": 13.193745326250792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 162.0625, "completions/mean_terminated_length": 162.0625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.16578193753957748, "epoch": 0.020287169986104678, "frac_reward_zero_std": 0.0, "grad_norm": 0.10385134816169739, "kl": 0.0011733440915122628, "learning_rate": 9.959518295507179e-07, "loss": 0.0098, "num_tokens": 12218161.0, "reward": 0.8985783457756042, "reward_std": 0.05031924694776535, "rewards/reward_func/mean": 0.8985783457756042, "rewards/reward_func/std": 0.05031923949718475, "step": 438, "step_time": 22.68815726041794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 203.5, "completions/mean_terminated_length": 203.5, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.18357586860656738, "epoch": 0.02033348772579898, "frac_reward_zero_std": 0.0, "grad_norm": 0.045642703771591187, "kl": 0.0006995180883677676, "learning_rate": 9.95942566002779e-07, "loss": -0.0237, "num_tokens": 12258713.0, "reward": 0.04365791007876396, "reward_std": 0.010465163737535477, "rewards/reward_func/mean": 0.04365791007876396, "rewards/reward_func/std": 0.010465164668858051, "step": 439, "step_time": 23.417887415736914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 175.875, "completions/mean_terminated_length": 175.875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3608877509832382, "epoch": 0.020379805465493284, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020068450830876827, "kl": 0.002231194492196664, "learning_rate": 9.959333024548401e-07, "loss": 0.0001, "num_tokens": 12289719.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 440, "step_time": 20.223703049123287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 139.1875, "completions/mean_terminated_length": 139.1875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.21962684765458107, "epoch": 0.020426123205187587, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006896913982927799, "kl": 0.0009931822860380635, "learning_rate": 9.959240389069013e-07, "loss": 0.0, "num_tokens": 12320538.0, "reward": 0.09207873791456223, "reward_std": 0.0, "rewards/reward_func/mean": 0.09207873791456223, "rewards/reward_func/std": 0.0, "step": 441, "step_time": 16.85022407770157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 135.0625, "completions/mean_terminated_length": 135.0625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.32932811975479126, "epoch": 0.02047244094488189, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010856821900233626, "kl": 0.001221887971041724, "learning_rate": 9.959147753589624e-07, "loss": 0.0001, "num_tokens": 12343419.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 442, "step_time": 16.05213586986065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 128.0, "completions/mean_terminated_length": 128.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.29498112946748734, "epoch": 0.020518758684576192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008743335492908955, "kl": 0.0013112361484672874, "learning_rate": 9.959055118110235e-07, "loss": 0.0001, "num_tokens": 12365243.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 443, "step_time": 15.129100944846869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 127.0625, "completions/mean_terminated_length": 127.0625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3172316774725914, "epoch": 0.020565076424270495, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007861247868277133, "kl": 0.001013173081446439, "learning_rate": 9.958962482630846e-07, "loss": 0.0001, "num_tokens": 12401052.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 444, "step_time": 18.39372304826975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 179.0625, "completions/mean_terminated_length": 179.0625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.2586454674601555, "epoch": 0.020611394163964798, "frac_reward_zero_std": 0.0, "grad_norm": 0.07765398174524307, "kl": 0.0010438812023494393, "learning_rate": 9.95886984715146e-07, "loss": -0.0045, "num_tokens": 12425165.0, "reward": 0.9533541202545166, "reward_std": 0.07145605236291885, "rewards/reward_func/mean": 0.9533541202545166, "rewards/reward_func/std": 0.07145605981349945, "step": 445, "step_time": 19.860888108611107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 151.3125, "completions/mean_terminated_length": 151.3125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.27377403527498245, "epoch": 0.0206577119036591, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006911946693435311, "kl": 0.000904399566934444, "learning_rate": 9.95877721167207e-07, "loss": 0.0, "num_tokens": 12446450.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 446, "step_time": 17.764220606535673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 156.625, "completions/mean_terminated_length": 156.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.4193510413169861, "epoch": 0.020704029643353404, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007914117304608226, "kl": 0.00129216568893753, "learning_rate": 9.958684576192682e-07, "loss": 0.0001, "num_tokens": 12470876.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 447, "step_time": 17.249667938798666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 206.0625, "completions/mean_terminated_length": 206.0625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.21897048875689507, "epoch": 0.020750347383047707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007031591376289725, "kl": 0.0008966055902419612, "learning_rate": 9.958591940713291e-07, "loss": 0.0, "num_tokens": 12508829.0, "reward": 0.7165313363075256, "reward_std": 0.0, "rewards/reward_func/mean": 0.7165313363075256, "rewards/reward_func/std": 0.0, "step": 448, "step_time": 23.306960482150316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 157.75, "completions/mean_terminated_length": 157.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.37646690756082535, "epoch": 0.02079666512274201, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007410895777866244, "kl": 0.0014292299165390432, "learning_rate": 9.958499305233905e-07, "loss": 0.0001, "num_tokens": 12538409.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 449, "step_time": 18.985281493514776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 149.0625, "completions/mean_terminated_length": 149.0625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.2667747214436531, "epoch": 0.020842982862436313, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006505842902697623, "kl": 0.0009782446722965688, "learning_rate": 9.958406669754516e-07, "loss": 0.0, "num_tokens": 12559786.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 450, "step_time": 15.077436048537493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 172.875, "completions/mean_terminated_length": 172.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.39849378913640976, "epoch": 0.020889300602130616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008802087977528572, "kl": 0.0013437908492051065, "learning_rate": 9.958314034275127e-07, "loss": 0.0001, "num_tokens": 12583704.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 451, "step_time": 18.90094792470336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 118.625, "completions/mean_terminated_length": 118.625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2805650979280472, "epoch": 0.02093561834182492, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007345624617300928, "kl": 0.001147609727922827, "learning_rate": 9.958221398795738e-07, "loss": 0.0001, "num_tokens": 12605346.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 452, "step_time": 13.880728926509619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 206.5625, "completions/mean_terminated_length": 206.5625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.26133378595113754, "epoch": 0.02098193608151922, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008001459063962102, "kl": 0.0011924623395316303, "learning_rate": 9.95812876331635e-07, "loss": 0.0001, "num_tokens": 12642955.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 453, "step_time": 24.59642492234707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 229.1875, "completions/mean_terminated_length": 229.1875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.37597081810235977, "epoch": 0.021028253821213524, "frac_reward_zero_std": 0.0, "grad_norm": 0.07688658684492111, "kl": 0.0026821557548828423, "learning_rate": 9.95803612783696e-07, "loss": -0.2149, "num_tokens": 12676414.0, "reward": 0.3576716184616089, "reward_std": 0.47980526089668274, "rewards/reward_func/mean": 0.3576716184616089, "rewards/reward_func/std": 0.47980526089668274, "step": 454, "step_time": 30.136327359825373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 182.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.39297641068696976, "epoch": 0.021074571560907827, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009784556459635496, "kl": 0.0014437072968576103, "learning_rate": 9.957943492357572e-07, "loss": 0.0001, "num_tokens": 12708710.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 455, "step_time": 22.129728976637125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 172.6875, "completions/mean_terminated_length": 172.6875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.11051014810800552, "epoch": 0.02112088930060213, "frac_reward_zero_std": 1.0, "grad_norm": 0.00026434153551235795, "kl": 0.00039409926830558106, "learning_rate": 9.957850856878183e-07, "loss": 0.0, "num_tokens": 12743441.0, "reward": 0.5044883489608765, "reward_std": 0.0, "rewards/reward_func/mean": 0.5044883489608765, "rewards/reward_func/std": 0.0, "step": 456, "step_time": 19.395647291094065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.4058089703321457, "epoch": 0.021167207040296433, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007130375597625971, "kl": 0.0013419112074188888, "learning_rate": 9.957758221398795e-07, "loss": 0.0001, "num_tokens": 12785155.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 457, "step_time": 20.202812299132347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 163.625, "completions/mean_terminated_length": 163.625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.40577686578035355, "epoch": 0.021213524779990736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009446601616218686, "kl": 0.0013670599437318742, "learning_rate": 9.957665585919406e-07, "loss": 0.0001, "num_tokens": 12811517.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 458, "step_time": 19.652188416570425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 161.875, "completions/mean_terminated_length": 161.875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.40780486911535263, "epoch": 0.02125984251968504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006535582942888141, "kl": 0.0013698584225494415, "learning_rate": 9.95757295044002e-07, "loss": 0.0001, "num_tokens": 12844875.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 459, "step_time": 20.958265770226717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 127.8125, "completions/mean_terminated_length": 127.8125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2913421094417572, "epoch": 0.021306160259379342, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007478291518054903, "kl": 0.0011158910201629624, "learning_rate": 9.957480314960628e-07, "loss": 0.0001, "num_tokens": 12868040.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 460, "step_time": 16.351258099079132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 128.5625, "completions/mean_terminated_length": 128.5625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2619505301117897, "epoch": 0.021352477999073645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005713357822969556, "kl": 0.0010241223353659734, "learning_rate": 9.95738767948124e-07, "loss": 0.0001, "num_tokens": 12889377.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 461, "step_time": 13.749565534293652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 120.9375, "completions/mean_terminated_length": 120.9375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3522017151117325, "epoch": 0.021398795738767948, "frac_reward_zero_std": 1.0, "grad_norm": 0.001095265382900834, "kl": 0.0015934919065330178, "learning_rate": 9.957295044001853e-07, "loss": 0.0001, "num_tokens": 12911456.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 462, "step_time": 14.89537600800395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 198.5625, "completions/mean_terminated_length": 198.5625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.39398887008428574, "epoch": 0.02144511347846225, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012047621421515942, "kl": 0.0014141463616397232, "learning_rate": 9.957202408522464e-07, "loss": 0.0001, "num_tokens": 12940873.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 463, "step_time": 22.350401777774096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 146.3125, "completions/mean_terminated_length": 146.3125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3431501239538193, "epoch": 0.021491431218156554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0072228494100272655, "kl": 0.004282706417143345, "learning_rate": 9.957109773043076e-07, "loss": 0.0002, "num_tokens": 12969886.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 464, "step_time": 17.743618417531252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 144.125, "completions/mean_terminated_length": 144.125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.16300039738416672, "epoch": 0.021537748957850857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005739906919188797, "kl": 0.0008377209451282397, "learning_rate": 9.957017137563687e-07, "loss": 0.0, "num_tokens": 12991456.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 465, "step_time": 16.602536369115114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 207.0, "completions/mean_terminated_length": 207.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.3950920030474663, "epoch": 0.02158406669754516, "frac_reward_zero_std": 0.0, "grad_norm": 0.07530498504638672, "kl": 0.001105832023313269, "learning_rate": 9.956924502084298e-07, "loss": -0.0647, "num_tokens": 13014336.0, "reward": 0.21374839544296265, "reward_std": 0.2851552963256836, "rewards/reward_func/mean": 0.21374839544296265, "rewards/reward_func/std": 0.2851552963256836, "step": 466, "step_time": 24.43825474753976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 182.125, "completions/mean_terminated_length": 182.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.32652004808187485, "epoch": 0.021630384437239462, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004810943210031837, "kl": 0.0010445124644320458, "learning_rate": 9.95683186660491e-07, "loss": 0.0001, "num_tokens": 13039746.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 467, "step_time": 18.553403332829475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 152.6875, "completions/mean_terminated_length": 152.6875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.2733049914240837, "epoch": 0.021676702176933765, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011022585676982999, "kl": 0.0011483504786156118, "learning_rate": 9.95673923112552e-07, "loss": 0.0001, "num_tokens": 13066621.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 468, "step_time": 16.856421019881964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 249.375, "completions/mean_terminated_length": 249.375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.2896248959004879, "epoch": 0.021723019916628068, "frac_reward_zero_std": 0.0, "grad_norm": 0.05923737585544586, "kl": 0.0010491950379218906, "learning_rate": 9.956646595646132e-07, "loss": -0.0386, "num_tokens": 13096899.0, "reward": 0.6875, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.6875, "rewards/reward_func/std": 0.4787135720252991, "step": 469, "step_time": 29.257147755473852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 123.0625, "completions/mean_terminated_length": 123.0625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.287365585565567, "epoch": 0.02176933765632237, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010540344519540668, "kl": 0.0010919710621237755, "learning_rate": 9.956553960166743e-07, "loss": 0.0001, "num_tokens": 13116612.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 470, "step_time": 14.792682588100433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 130.0625, "completions/mean_terminated_length": 130.0625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.27139056473970413, "epoch": 0.021815655396016674, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026716021820902824, "kl": 0.001344893971690908, "learning_rate": 9.956461324687354e-07, "loss": 0.0001, "num_tokens": 13140085.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 471, "step_time": 15.204796615988016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 231.8125, "completions/mean_terminated_length": 231.8125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.30421893298625946, "epoch": 0.021861973135710977, "frac_reward_zero_std": 0.0, "grad_norm": 0.06558000296354294, "kl": 0.000971016037510708, "learning_rate": 9.956368689207966e-07, "loss": -0.0746, "num_tokens": 13171570.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 472, "step_time": 25.866084907203913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 283.3125, "completions/mean_terminated_length": 283.3125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.471021831035614, "epoch": 0.02190829087540528, "frac_reward_zero_std": 0.0, "grad_norm": 0.05412129685282707, "kl": 0.0016661942936480045, "learning_rate": 9.956276053728577e-07, "loss": 0.0941, "num_tokens": 13200359.0, "reward": 0.625, "reward_std": 0.5, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5, "step": 473, "step_time": 33.09069042280316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 129.6875, "completions/mean_terminated_length": 129.6875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.24906113743782043, "epoch": 0.021954608615099583, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006597902975045145, "kl": 0.0008146143518388271, "learning_rate": 9.956183418249188e-07, "loss": 0.0, "num_tokens": 13225858.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 474, "step_time": 15.069956101477146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 115.75, "completions/mean_terminated_length": 115.75, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.24458341300487518, "epoch": 0.022000926354793886, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016917382599785924, "kl": 0.0013197254738770425, "learning_rate": 9.956090782769801e-07, "loss": 0.0001, "num_tokens": 13245422.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 475, "step_time": 13.75100864097476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 184.125, "completions/mean_terminated_length": 184.125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.2562222480773926, "epoch": 0.02204724409448819, "frac_reward_zero_std": 0.0, "grad_norm": 0.09810595959424973, "kl": 0.0012838072143495083, "learning_rate": 9.955998147290413e-07, "loss": -0.0063, "num_tokens": 13277136.0, "reward": 0.5895459055900574, "reward_std": 0.0756286084651947, "rewards/reward_func/mean": 0.5895459055900574, "rewards/reward_func/std": 0.0756286159157753, "step": 476, "step_time": 20.08931627869606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 157.4375, "completions/mean_terminated_length": 157.4375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.17066820710897446, "epoch": 0.02209356183418249, "frac_reward_zero_std": 0.0, "grad_norm": 0.06698978692293167, "kl": 0.0012453247618395835, "learning_rate": 9.955905511811024e-07, "loss": -0.0196, "num_tokens": 13298839.0, "reward": 0.8975909948348999, "reward_std": 0.027001073583960533, "rewards/reward_func/mean": 0.8975909948348999, "rewards/reward_func/std": 0.027001069858670235, "step": 477, "step_time": 15.904055442661047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 244.75, "completions/mean_terminated_length": 244.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.37555670738220215, "epoch": 0.022139879573876795, "frac_reward_zero_std": 0.0, "grad_norm": 0.058952413499355316, "kl": 0.0011130543425679207, "learning_rate": 9.955812876331633e-07, "loss": -0.2187, "num_tokens": 13326419.0, "reward": 0.35912901163101196, "reward_std": 0.46907129883766174, "rewards/reward_func/mean": 0.35912901163101196, "rewards/reward_func/std": 0.46907132863998413, "step": 478, "step_time": 30.928753718733788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 116.125, "completions/mean_terminated_length": 116.125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3450111076235771, "epoch": 0.022186197313571097, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016331137157976627, "kl": 0.001576541137183085, "learning_rate": 9.955720240852246e-07, "loss": 0.0001, "num_tokens": 13362325.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 479, "step_time": 16.370044983923435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 233.4375, "completions/mean_terminated_length": 233.4375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.25906096026301384, "epoch": 0.0222325150532654, "frac_reward_zero_std": 0.0, "grad_norm": 0.07396862655878067, "kl": 0.0010420548933325335, "learning_rate": 9.955627605372858e-07, "loss": -0.1119, "num_tokens": 13402044.0, "reward": 0.1792462170124054, "reward_std": 0.33070844411849976, "rewards/reward_func/mean": 0.1792462170124054, "rewards/reward_func/std": 0.33070847392082214, "step": 480, "step_time": 29.113693229854107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 139.8125, "completions/mean_terminated_length": 139.8125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.34055541455745697, "epoch": 0.022278832792959703, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013534241588786244, "kl": 0.001348479330772534, "learning_rate": 9.955534969893469e-07, "loss": 0.0001, "num_tokens": 13437481.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 481, "step_time": 19.76912584528327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 177.25, "completions/mean_terminated_length": 177.25, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.42396819591522217, "epoch": 0.022325150532654006, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005905788857489824, "kl": 0.0013149116421118379, "learning_rate": 9.95544233441408e-07, "loss": 0.0001, "num_tokens": 13459581.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 482, "step_time": 19.408626589924097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 182.625, "completions/mean_terminated_length": 182.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4352477863430977, "epoch": 0.02237146827234831, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007822702755220234, "kl": 0.001167302776593715, "learning_rate": 9.955349698934691e-07, "loss": 0.0001, "num_tokens": 13482039.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 483, "step_time": 22.192662086337805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 184.75, "completions/mean_terminated_length": 184.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.2297377660870552, "epoch": 0.022417786012042612, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005211597308516502, "kl": 0.0008147654589265585, "learning_rate": 9.955257063455303e-07, "loss": 0.0, "num_tokens": 13510611.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 484, "step_time": 20.270441822707653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 176.25, "completions/mean_terminated_length": 176.25, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.3842536062002182, "epoch": 0.022464103751736915, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008047992596402764, "kl": 0.0013092006847728044, "learning_rate": 9.955164427975914e-07, "loss": 0.0001, "num_tokens": 13536119.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 485, "step_time": 20.586454547941685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 133.3125, "completions/mean_terminated_length": 133.3125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.27248460054397583, "epoch": 0.022510421491431218, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005648453952744603, "kl": 0.0011865806591231376, "learning_rate": 9.955071792496525e-07, "loss": 0.0001, "num_tokens": 13555804.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 486, "step_time": 14.476582117378712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 125.1875, "completions/mean_terminated_length": 125.1875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.27011676877737045, "epoch": 0.02255673923112552, "frac_reward_zero_std": 1.0, "grad_norm": 0.001830546883866191, "kl": 0.0018114371341653168, "learning_rate": 9.954979157017136e-07, "loss": 0.0001, "num_tokens": 13575711.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 487, "step_time": 13.421096365898848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 197.75, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.37179628759622574, "epoch": 0.022603056970819824, "frac_reward_zero_std": 1.0, "grad_norm": 0.001344718155451119, "kl": 0.001363539631711319, "learning_rate": 9.954886521537748e-07, "loss": 0.0001, "num_tokens": 13612651.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 488, "step_time": 23.876486036926508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 126.9375, "completions/mean_terminated_length": 126.9375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3013797104358673, "epoch": 0.022649374710514127, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032518936786800623, "kl": 0.0018434434314258397, "learning_rate": 9.95479388605836e-07, "loss": 0.0001, "num_tokens": 13632490.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 489, "step_time": 15.357919406145811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 149.6875, "completions/mean_terminated_length": 149.6875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.32749275863170624, "epoch": 0.02269569245020843, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018983208574354649, "kl": 0.0014090609329286963, "learning_rate": 9.954701250578972e-07, "loss": 0.0001, "num_tokens": 13673333.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 490, "step_time": 21.186318166553974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 132.1875, "completions/mean_terminated_length": 132.1875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2657178193330765, "epoch": 0.022742010189902732, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010834588902071118, "kl": 0.001249116670805961, "learning_rate": 9.954608615099581e-07, "loss": 0.0001, "num_tokens": 13694792.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 491, "step_time": 14.536582689732313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 146.875, "completions/mean_terminated_length": 146.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.33631060272455215, "epoch": 0.022788327929597035, "frac_reward_zero_std": 1.0, "grad_norm": 0.001670774887315929, "kl": 0.0016863863274920732, "learning_rate": 9.954515979620195e-07, "loss": 0.0001, "num_tokens": 13724054.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 492, "step_time": 17.81245766952634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 263.5, "completions/mean_terminated_length": 263.5, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.4623207077383995, "epoch": 0.02283464566929134, "frac_reward_zero_std": 0.0, "grad_norm": 0.06237829104065895, "kl": 0.001233977556694299, "learning_rate": 9.954423344140806e-07, "loss": 0.0939, "num_tokens": 13751518.0, "reward": 0.6875, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.6875, "rewards/reward_func/std": 0.4787135720252991, "step": 493, "step_time": 27.27083507925272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 193.6875, "completions/mean_terminated_length": 193.6875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.2014639787375927, "epoch": 0.02288096340898564, "frac_reward_zero_std": 0.0, "grad_norm": 0.05373659357428551, "kl": 0.000706080420059152, "learning_rate": 9.954330708661417e-07, "loss": 0.0154, "num_tokens": 13774441.0, "reward": 0.9466155767440796, "reward_std": 0.02992909401655197, "rewards/reward_func/mean": 0.9466155767440796, "rewards/reward_func/std": 0.029929067939519882, "step": 494, "step_time": 19.53807992488146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 166.0625, "completions/mean_terminated_length": 166.0625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.407911978662014, "epoch": 0.022927281148679944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012372437631711364, "kl": 0.0017013985197991133, "learning_rate": 9.954238073182029e-07, "loss": 0.0001, "num_tokens": 13804762.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 495, "step_time": 19.689786564558744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 132.3125, "completions/mean_terminated_length": 132.3125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3699936494231224, "epoch": 0.022973598888374247, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007731465739198029, "kl": 0.0011656057758955285, "learning_rate": 9.95414543770264e-07, "loss": 0.0001, "num_tokens": 13827263.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 496, "step_time": 16.126573752611876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 194.6875, "completions/mean_terminated_length": 194.6875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3789226859807968, "epoch": 0.02301991662806855, "frac_reward_zero_std": 0.0, "grad_norm": 0.07127277553081512, "kl": 0.008387453213799745, "learning_rate": 9.95405280222325e-07, "loss": -0.0577, "num_tokens": 13856154.0, "reward": 0.03359834849834442, "reward_std": 0.1343933790922165, "rewards/reward_func/mean": 0.03359834849834442, "rewards/reward_func/std": 0.13439339399337769, "step": 497, "step_time": 22.471235185861588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 125.4375, "completions/mean_terminated_length": 125.4375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3219013437628746, "epoch": 0.023066234367762853, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009430369827896357, "kl": 0.0014280521718319505, "learning_rate": 9.953960166743862e-07, "loss": 0.0001, "num_tokens": 13879857.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 498, "step_time": 14.805292218923569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 124.75, "completions/mean_terminated_length": 124.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3636002764105797, "epoch": 0.023112552107457156, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008227949729189277, "kl": 0.0012659696803893894, "learning_rate": 9.953867531264474e-07, "loss": 0.0001, "num_tokens": 13916253.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 499, "step_time": 18.45577147603035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 232.3125, "completions/mean_terminated_length": 232.3125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.38027258962392807, "epoch": 0.02315886984715146, "frac_reward_zero_std": 0.0, "grad_norm": 0.0514298640191555, "kl": 0.0013120945077389479, "learning_rate": 9.953774895785085e-07, "loss": -0.1189, "num_tokens": 13950914.0, "reward": 0.1100812703371048, "reward_std": 0.30393990874290466, "rewards/reward_func/mean": 0.1100812703371048, "rewards/reward_func/std": 0.30393993854522705, "step": 500, "step_time": 28.57885792478919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 192.5625, "completions/mean_terminated_length": 192.5625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.20426442846655846, "epoch": 0.02320518758684576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004283256712369621, "kl": 0.0006900559674249962, "learning_rate": 9.953682260305696e-07, "loss": 0.0, "num_tokens": 13978923.0, "reward": 0.9555630087852478, "reward_std": 0.0, "rewards/reward_func/mean": 0.9555630087852478, "rewards/reward_func/std": 0.0, "step": 501, "step_time": 20.524554256349802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 143.375, "completions/mean_terminated_length": 143.375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.1600031666457653, "epoch": 0.023251505326540065, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004713285597972572, "kl": 0.0007226398593047634, "learning_rate": 9.95358962482631e-07, "loss": 0.0, "num_tokens": 13999361.0, "reward": 0.5697828531265259, "reward_std": 0.0, "rewards/reward_func/mean": 0.5697828531265259, "rewards/reward_func/std": 0.0, "step": 502, "step_time": 14.741988241672516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 242.875, "completions/mean_terminated_length": 242.875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.2466079480946064, "epoch": 0.023297823066234367, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005793596501462162, "kl": 0.0008950445044320077, "learning_rate": 9.953496989346918e-07, "loss": 0.0, "num_tokens": 14030511.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 503, "step_time": 24.83066874742508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 128.5625, "completions/mean_terminated_length": 128.5625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3423362225294113, "epoch": 0.02334414080592867, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007454920560121536, "kl": 0.001252439513336867, "learning_rate": 9.95340435386753e-07, "loss": 0.0001, "num_tokens": 14050760.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 504, "step_time": 15.452152878046036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.37029866874217987, "epoch": 0.023390458545622973, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006513205007649958, "kl": 0.0011104259465355426, "learning_rate": 9.953311718388143e-07, "loss": 0.0001, "num_tokens": 14072466.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 505, "step_time": 17.8039546944201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 140.6875, "completions/mean_terminated_length": 140.6875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.19547295570373535, "epoch": 0.023436776285317276, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007712005754001439, "kl": 0.0010060306667583063, "learning_rate": 9.953219082908754e-07, "loss": 0.0001, "num_tokens": 14097037.0, "reward": 0.4203503727912903, "reward_std": 0.0, "rewards/reward_func/mean": 0.4203503727912903, "rewards/reward_func/std": 0.0, "step": 506, "step_time": 15.219443429261446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 146.75, "completions/mean_terminated_length": 146.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3754529803991318, "epoch": 0.02348309402501158, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011129234917461872, "kl": 0.0015130334359128028, "learning_rate": 9.953126447429366e-07, "loss": 0.0001, "num_tokens": 14141049.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 507, "step_time": 22.853277012705803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 181.9375, "completions/mean_terminated_length": 181.9375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.4032375067472458, "epoch": 0.023529411764705882, "frac_reward_zero_std": 1.0, "grad_norm": 0.001507847453467548, "kl": 0.0017335480370093137, "learning_rate": 9.953033811949977e-07, "loss": 0.0001, "num_tokens": 14173304.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 508, "step_time": 22.368615679442883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 302.6875, "completions/mean_terminated_length": 302.6875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "entropy": 0.25965646654367447, "epoch": 0.023575729504400185, "frac_reward_zero_std": 0.0, "grad_norm": 0.05408618599176407, "kl": 0.0009873932285699993, "learning_rate": 9.952941176470588e-07, "loss": 0.0077, "num_tokens": 14204499.0, "reward": 0.923896312713623, "reward_std": 0.015388688072562218, "rewards/reward_func/mean": 0.923896312713623, "rewards/reward_func/std": 0.015388698317110538, "step": 509, "step_time": 32.79489414393902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 107.8125, "completions/mean_terminated_length": 107.8125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.27770448476076126, "epoch": 0.023622047244094488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007120308000594378, "kl": 0.001090057980036363, "learning_rate": 9.9528485409912e-07, "loss": 0.0001, "num_tokens": 14224224.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 510, "step_time": 12.823708292096853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 133.9375, "completions/mean_terminated_length": 133.9375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3049366846680641, "epoch": 0.02366836498378879, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012418744154274464, "kl": 0.0012951338139828295, "learning_rate": 9.95275590551181e-07, "loss": 0.0001, "num_tokens": 14260191.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 511, "step_time": 17.79165342450142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 122.0, "completions/mean_terminated_length": 122.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.29965297132730484, "epoch": 0.023714682723483094, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014361172216013074, "kl": 0.0014124661247478798, "learning_rate": 9.952663270032422e-07, "loss": 0.0001, "num_tokens": 14280591.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 512, "step_time": 14.293915253132582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 157.0, "completions/mean_terminated_length": 157.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.16042153909802437, "epoch": 0.023761000463177397, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005578518030233681, "kl": 0.0007172806363087147, "learning_rate": 9.952570634553033e-07, "loss": 0.0, "num_tokens": 14301999.0, "reward": 0.9131007194519043, "reward_std": 0.0, "rewards/reward_func/mean": 0.9131007194519043, "rewards/reward_func/std": 0.0, "step": 513, "step_time": 16.87550877034664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 139.5, "completions/mean_terminated_length": 139.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.27408745139837265, "epoch": 0.0238073182028717, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013380669988691807, "kl": 0.001637225184822455, "learning_rate": 9.952477999073644e-07, "loss": 0.0001, "num_tokens": 14323767.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 514, "step_time": 15.414053175598383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 201.125, "completions/mean_terminated_length": 201.125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.37802645564079285, "epoch": 0.023853635942566002, "frac_reward_zero_std": 0.0, "grad_norm": 0.06617993861436844, "kl": 0.000993274530628696, "learning_rate": 9.952385363594256e-07, "loss": -0.0897, "num_tokens": 14345673.0, "reward": 0.2039203941822052, "reward_std": 0.3123822808265686, "rewards/reward_func/mean": 0.2039203941822052, "rewards/reward_func/std": 0.3123822510242462, "step": 515, "step_time": 23.67078560963273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 156.875, "completions/mean_terminated_length": 156.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.19579781964421272, "epoch": 0.023899953682260305, "frac_reward_zero_std": 0.0, "grad_norm": 0.0758872702717781, "kl": 0.0009364501020172611, "learning_rate": 9.952292728114867e-07, "loss": -0.0277, "num_tokens": 14369527.0, "reward": 0.8958791494369507, "reward_std": 0.04135281220078468, "rewards/reward_func/mean": 0.8958791494369507, "rewards/reward_func/std": 0.041352808475494385, "step": 516, "step_time": 16.928487829864025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.29442620277404785, "epoch": 0.02394627142195461, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006312267505563796, "kl": 0.0012187010725028813, "learning_rate": 9.952200092635478e-07, "loss": 0.0001, "num_tokens": 14391319.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 517, "step_time": 15.052225556224585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 138.8125, "completions/mean_terminated_length": 138.8125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.24572154507040977, "epoch": 0.02399258916164891, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008521306444890797, "kl": 0.0010103998938575387, "learning_rate": 9.95210745715609e-07, "loss": 0.0001, "num_tokens": 14412100.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 518, "step_time": 14.790097005665302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 163.6875, "completions/mean_terminated_length": 163.6875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.3711855635046959, "epoch": 0.024038906901343214, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004503102391026914, "kl": 0.001057341054547578, "learning_rate": 9.952014821676703e-07, "loss": 0.0001, "num_tokens": 14436751.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 519, "step_time": 17.75486048310995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 144.9375, "completions/mean_terminated_length": 144.9375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.2414422668516636, "epoch": 0.024085224641037517, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006374597433023155, "kl": 0.0008082981657935306, "learning_rate": 9.951922186197314e-07, "loss": 0.0, "num_tokens": 14456766.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 520, "step_time": 15.141897857189178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 193.5625, "completions/mean_terminated_length": 193.5625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.41844432801008224, "epoch": 0.02413154238073182, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013825239147990942, "kl": 0.0015546581416856498, "learning_rate": 9.951829550717923e-07, "loss": 0.0001, "num_tokens": 14478535.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 521, "step_time": 25.464532054960728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 115.25, "completions/mean_terminated_length": 115.25, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.26581552624702454, "epoch": 0.024177860120426123, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017415828770026565, "kl": 0.0015278060163836926, "learning_rate": 9.951736915238536e-07, "loss": 0.0001, "num_tokens": 14497899.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 522, "step_time": 13.22960414364934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 176.5625, "completions/mean_terminated_length": 176.5625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.34090740233659744, "epoch": 0.024224177860120426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005656389403156936, "kl": 0.0011169943463755772, "learning_rate": 9.951644279759148e-07, "loss": 0.0001, "num_tokens": 14524932.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 523, "step_time": 21.30846729502082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 146.125, "completions/mean_terminated_length": 146.125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.18956785276532173, "epoch": 0.02427049559981473, "frac_reward_zero_std": 1.0, "grad_norm": 0.000552937388420105, "kl": 0.0009237581252818927, "learning_rate": 9.95155164427976e-07, "loss": 0.0, "num_tokens": 14545590.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 524, "step_time": 15.692125141620636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 280.8125, "completions/mean_terminated_length": 280.8125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.1719597429037094, "epoch": 0.02431681333950903, "frac_reward_zero_std": 0.0, "grad_norm": 0.04176125302910805, "kl": 0.0007282002043211833, "learning_rate": 9.95145900880037e-07, "loss": -0.1117, "num_tokens": 14578867.0, "reward": 0.6490556001663208, "reward_std": 0.3030133843421936, "rewards/reward_func/mean": 0.6490556001663208, "rewards/reward_func/std": 0.3030133843421936, "step": 525, "step_time": 30.006157591938972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 195.9375, "completions/mean_terminated_length": 195.9375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.2237008437514305, "epoch": 0.024363131079203335, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009089101804420352, "kl": 0.0010247548634652048, "learning_rate": 9.951366373320981e-07, "loss": 0.0001, "num_tokens": 14609650.0, "reward": 0.9459594488143921, "reward_std": 0.0, "rewards/reward_func/mean": 0.9459594488143921, "rewards/reward_func/std": 0.0, "step": 526, "step_time": 21.09304867312312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 138.375, "completions/mean_terminated_length": 138.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.1688799485564232, "epoch": 0.024409448818897637, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007246082532219589, "kl": 0.0008973616058938205, "learning_rate": 9.951273737841593e-07, "loss": 0.0, "num_tokens": 14642344.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 527, "step_time": 17.77213069051504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 205.4375, "completions/mean_terminated_length": 205.4375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.41943180561065674, "epoch": 0.02445576655859194, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006336580845527351, "kl": 0.0012316417414695024, "learning_rate": 9.951181102362204e-07, "loss": 0.0001, "num_tokens": 14667151.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 528, "step_time": 22.779205039143562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 132.125, "completions/mean_terminated_length": 132.125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2875938192009926, "epoch": 0.024502084298286243, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009506710339337587, "kl": 0.0011937796953134239, "learning_rate": 9.951088466882815e-07, "loss": 0.0001, "num_tokens": 14689745.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 529, "step_time": 15.668328743427992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 112.6875, "completions/mean_terminated_length": 112.6875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2612050920724869, "epoch": 0.024548402037980546, "frac_reward_zero_std": 1.0, "grad_norm": 0.003256069030612707, "kl": 0.0018359064706601202, "learning_rate": 9.950995831403426e-07, "loss": 0.0001, "num_tokens": 14709964.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 530, "step_time": 12.744639925658703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 171.5625, "completions/mean_terminated_length": 171.5625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3694435805082321, "epoch": 0.02459471977767485, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018149253446608782, "kl": 0.0023198953131213784, "learning_rate": 9.950903195924038e-07, "loss": 0.0001, "num_tokens": 14738821.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 531, "step_time": 20.87821977958083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 147.0625, "completions/mean_terminated_length": 147.0625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.2528420351445675, "epoch": 0.024641037517369152, "frac_reward_zero_std": 0.0, "grad_norm": 0.07727481424808502, "kl": 0.0011586096952669322, "learning_rate": 9.950810560444651e-07, "loss": 0.0991, "num_tokens": 14761974.0, "reward": 0.9899153709411621, "reward_std": 0.027556534856557846, "rewards/reward_func/mean": 0.9899153709411621, "rewards/reward_func/std": 0.027556534856557846, "step": 532, "step_time": 17.73023172095418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.36899401247501373, "epoch": 0.024687355257063455, "frac_reward_zero_std": 0.0, "grad_norm": 0.08863791823387146, "kl": 0.0012757852673530579, "learning_rate": 9.950717924965262e-07, "loss": -0.0945, "num_tokens": 14787492.0, "reward": 0.4648560881614685, "reward_std": 0.4801013171672821, "rewards/reward_func/mean": 0.4648560881614685, "rewards/reward_func/std": 0.4801013171672821, "step": 533, "step_time": 24.023204747587442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 222.75, "completions/mean_terminated_length": 222.75, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.2912716940045357, "epoch": 0.024733672996757758, "frac_reward_zero_std": 0.0, "grad_norm": 0.06058639660477638, "kl": 0.0009023299644468352, "learning_rate": 9.950625289485871e-07, "loss": -0.0476, "num_tokens": 14820624.0, "reward": 0.8208504915237427, "reward_std": 0.24021044373512268, "rewards/reward_func/mean": 0.8208504915237427, "rewards/reward_func/std": 0.24021045863628387, "step": 534, "step_time": 25.64229280874133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.4075919911265373, "epoch": 0.02477999073645206, "frac_reward_zero_std": 0.0, "grad_norm": 0.06277602910995483, "kl": 0.001133722806116566, "learning_rate": 9.950532654006485e-07, "loss": -0.0851, "num_tokens": 14848068.0, "reward": 0.3356561064720154, "reward_std": 0.4581764340400696, "rewards/reward_func/mean": 0.3356561064720154, "rewards/reward_func/std": 0.45817646384239197, "step": 535, "step_time": 23.9300565905869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 126.375, "completions/mean_terminated_length": 126.375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.24887803941965103, "epoch": 0.024826308476146364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006139138713479042, "kl": 0.0011030811874661595, "learning_rate": 9.950440018527096e-07, "loss": 0.0001, "num_tokens": 14867770.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 536, "step_time": 13.15186096355319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.24105515331029892, "epoch": 0.024872626215840667, "frac_reward_zero_std": 1.0, "grad_norm": 0.001104863709770143, "kl": 0.0011861571983899921, "learning_rate": 9.950347383047707e-07, "loss": 0.0001, "num_tokens": 14887486.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 537, "step_time": 13.874327003955841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 149.625, "completions/mean_terminated_length": 149.625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.17102652788162231, "epoch": 0.02491894395553497, "frac_reward_zero_std": 0.0, "grad_norm": 0.06881299614906311, "kl": 0.0006279955705394968, "learning_rate": 9.950254747568319e-07, "loss": 0.0741, "num_tokens": 14908040.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 538, "step_time": 16.973441254347563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 142.625, "completions/mean_terminated_length": 142.625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.2013726904988289, "epoch": 0.024965261695229272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014457677025347948, "kl": 0.001240854966454208, "learning_rate": 9.95016211208893e-07, "loss": 0.0001, "num_tokens": 14929746.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 539, "step_time": 15.499807421118021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 137.25, "completions/mean_terminated_length": 137.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.22709523141384125, "epoch": 0.025011579434923575, "frac_reward_zero_std": 1.0, "grad_norm": 0.004049050621688366, "kl": 0.0022631227038800716, "learning_rate": 9.950069476609541e-07, "loss": 0.0001, "num_tokens": 14949382.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 540, "step_time": 14.647448178380728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 149.25, "completions/mean_terminated_length": 149.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.41529327630996704, "epoch": 0.02505789717461788, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012068174546584487, "kl": 0.0019127448613289744, "learning_rate": 9.949976841130152e-07, "loss": 0.0001, "num_tokens": 14996570.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 541, "step_time": 21.505676943808794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 125.0, "completions/mean_terminated_length": 125.0, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.27272979170084, "epoch": 0.02510421491431218, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013653446221724153, "kl": 0.0012410607014317065, "learning_rate": 9.949884205650764e-07, "loss": 0.0001, "num_tokens": 15018394.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 542, "step_time": 14.302406802773476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 179.5625, "completions/mean_terminated_length": 179.5625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.3766372799873352, "epoch": 0.025150532654006484, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007904847734607756, "kl": 0.0015201152418740094, "learning_rate": 9.949791570171375e-07, "loss": 0.0001, "num_tokens": 15041555.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 543, "step_time": 19.756406288594007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 134.375, "completions/mean_terminated_length": 134.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3292296230792999, "epoch": 0.025196850393700787, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007243999862112105, "kl": 0.0012820618867408484, "learning_rate": 9.949698934691986e-07, "loss": 0.0001, "num_tokens": 15066489.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 544, "step_time": 15.174732618033886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 131.4375, "completions/mean_terminated_length": 131.4375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3490144908428192, "epoch": 0.02524316813339509, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008763514342717826, "kl": 0.0013646596344187856, "learning_rate": 9.9496062992126e-07, "loss": 0.0001, "num_tokens": 15088240.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 545, "step_time": 15.217640075832605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 175.1875, "completions/mean_terminated_length": 175.1875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.12979639321565628, "epoch": 0.025289485873089393, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023179855197668076, "kl": 0.002074635813187342, "learning_rate": 9.949513663733209e-07, "loss": 0.0001, "num_tokens": 15112067.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 546, "step_time": 17.45606468990445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 209.0, "completions/mean_terminated_length": 209.0, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.4022025465965271, "epoch": 0.025335803612783696, "frac_reward_zero_std": 0.0, "grad_norm": 0.06028041988611221, "kl": 0.0012326524301897734, "learning_rate": 9.94942102825382e-07, "loss": 0.0407, "num_tokens": 15138451.0, "reward": 0.625, "reward_std": 0.5, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5, "step": 547, "step_time": 26.755181174725294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 259.125, "completions/mean_terminated_length": 259.125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.2725084200501442, "epoch": 0.025382121352478, "frac_reward_zero_std": 0.0, "grad_norm": 0.0626678466796875, "kl": 0.0014349300763569772, "learning_rate": 9.94932839277443e-07, "loss": -0.0022, "num_tokens": 15175061.0, "reward": 0.9788992404937744, "reward_std": 0.005628441926091909, "rewards/reward_func/mean": 0.9788992404937744, "rewards/reward_func/std": 0.00562844006344676, "step": 548, "step_time": 28.6032482534647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 164.5625, "completions/mean_terminated_length": 164.5625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.39379655569791794, "epoch": 0.0254284390921723, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008524972945451736, "kl": 0.001492806535679847, "learning_rate": 9.949235757295044e-07, "loss": 0.0001, "num_tokens": 15228350.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 549, "step_time": 25.498115357011557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 186.25, "completions/mean_terminated_length": 186.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.4791732355952263, "epoch": 0.025474756831866605, "frac_reward_zero_std": 0.0, "grad_norm": 0.06944435089826584, "kl": 0.0016198160010389984, "learning_rate": 9.949143121815656e-07, "loss": 0.091, "num_tokens": 15251138.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 550, "step_time": 22.150676514953375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 262.6875, "completions/mean_terminated_length": 262.6875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.2680952474474907, "epoch": 0.025521074571560907, "frac_reward_zero_std": 0.0, "grad_norm": 0.04636561870574951, "kl": 0.0008887053554644808, "learning_rate": 9.949050486336267e-07, "loss": -0.0425, "num_tokens": 15274813.0, "reward": 0.8137565851211548, "reward_std": 0.2170029729604721, "rewards/reward_func/mean": 0.8137565851211548, "rewards/reward_func/std": 0.2170029729604721, "step": 551, "step_time": 29.685348197817802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 128.0625, "completions/mean_terminated_length": 128.0625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2995965778827667, "epoch": 0.02556739231125521, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009323018603026867, "kl": 0.001274977985303849, "learning_rate": 9.948957850856878e-07, "loss": 0.0001, "num_tokens": 15301294.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 552, "step_time": 15.274673901498318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 177.1875, "completions/mean_terminated_length": 177.1875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.1796923540532589, "epoch": 0.025613710050949513, "frac_reward_zero_std": 0.0, "grad_norm": 0.0792279988527298, "kl": 0.0010075290047097951, "learning_rate": 9.94886521537749e-07, "loss": -0.0282, "num_tokens": 15339537.0, "reward": 0.8645535707473755, "reward_std": 0.19388173520565033, "rewards/reward_func/mean": 0.8645535707473755, "rewards/reward_func/std": 0.19388172030448914, "step": 553, "step_time": 22.450017869472504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 177.25, "completions/mean_terminated_length": 177.25, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.3709481582045555, "epoch": 0.025660027790643816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012022164883092046, "kl": 0.0018011342617683113, "learning_rate": 9.9487725798981e-07, "loss": 0.0001, "num_tokens": 15368469.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 554, "step_time": 19.368321228772402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 156.25, "completions/mean_terminated_length": 156.25, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.3679218143224716, "epoch": 0.02570634553033812, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007441905909217894, "kl": 0.0014579891285393387, "learning_rate": 9.948679944418712e-07, "loss": 0.0001, "num_tokens": 15390825.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 555, "step_time": 16.342542689293623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 138.625, "completions/mean_terminated_length": 138.625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.29407284408807755, "epoch": 0.025752663270032422, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011675918940454721, "kl": 0.0012941797176608816, "learning_rate": 9.948587308939323e-07, "loss": 0.0001, "num_tokens": 15426867.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 556, "step_time": 19.005365189164877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 193.625, "completions/mean_terminated_length": 193.625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.357125461101532, "epoch": 0.025798981009726725, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027116169221699238, "kl": 0.0019961106590926647, "learning_rate": 9.948494673459934e-07, "loss": 0.0001, "num_tokens": 15451245.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 557, "step_time": 20.6795228458941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 104.375, "completions/mean_terminated_length": 104.375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.23302504047751427, "epoch": 0.025845298749421028, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006331351469270885, "kl": 0.000980119570158422, "learning_rate": 9.948402037980546e-07, "loss": 0.0, "num_tokens": 15472035.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 558, "step_time": 12.356189779937267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 178.8125, "completions/mean_terminated_length": 178.8125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.2559828981757164, "epoch": 0.02589161648911533, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005925624864175916, "kl": 0.0008678560116095468, "learning_rate": 9.948309402501157e-07, "loss": 0.0, "num_tokens": 15497216.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 559, "step_time": 24.370823446661234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 157.3125, "completions/mean_terminated_length": 157.3125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.16670601069927216, "epoch": 0.025937934228809634, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007910909480415285, "kl": 0.000819086650153622, "learning_rate": 9.948216767021768e-07, "loss": 0.0, "num_tokens": 15534069.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 560, "step_time": 21.16285178437829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 119.1875, "completions/mean_terminated_length": 119.1875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.35410934686660767, "epoch": 0.025984251968503937, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035102088004350662, "kl": 0.0024094951804727316, "learning_rate": 9.94812413154238e-07, "loss": 0.0001, "num_tokens": 15561608.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 561, "step_time": 15.391472723335028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 118.4375, "completions/mean_terminated_length": 118.4375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.35063062608242035, "epoch": 0.02603056970819824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019810523372143507, "kl": 0.0023975164513103664, "learning_rate": 9.948031496062993e-07, "loss": 0.0001, "num_tokens": 15597423.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 562, "step_time": 17.099945228546858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 171.9375, "completions/mean_terminated_length": 171.9375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.21170199662446976, "epoch": 0.026076887447892542, "frac_reward_zero_std": 0.0, "grad_norm": 0.08928564190864563, "kl": 0.001233914343174547, "learning_rate": 9.947938860583604e-07, "loss": -0.0256, "num_tokens": 15634654.0, "reward": 0.660243809223175, "reward_std": 0.1326272338628769, "rewards/reward_func/mean": 0.660243809223175, "rewards/reward_func/std": 0.1326272338628769, "step": 563, "step_time": 21.962135393172503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 135.5625, "completions/mean_terminated_length": 135.5625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2716743163764477, "epoch": 0.026123205187586845, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008933998178690672, "kl": 0.0010183459962718189, "learning_rate": 9.947846225104215e-07, "loss": 0.0001, "num_tokens": 15657159.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 564, "step_time": 16.57527555525303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 146.25, "completions/mean_terminated_length": 146.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.39399532228708267, "epoch": 0.02616952292728115, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012033042730763555, "kl": 0.0012665592366829515, "learning_rate": 9.947753589624827e-07, "loss": 0.0001, "num_tokens": 15682171.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 565, "step_time": 16.046187974512577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 177.1875, "completions/mean_terminated_length": 177.1875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.3469313308596611, "epoch": 0.02621584066697545, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013318386627361178, "kl": 0.001602203119546175, "learning_rate": 9.947660954145438e-07, "loss": 0.0001, "num_tokens": 15714926.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 566, "step_time": 22.901175644248724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 237.6875, "completions/mean_terminated_length": 237.6875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.4689154475927353, "epoch": 0.026262158406669754, "frac_reward_zero_std": 0.0, "grad_norm": 0.07163114845752716, "kl": 0.0014194734394550323, "learning_rate": 9.94756831866605e-07, "loss": -0.163, "num_tokens": 15754969.0, "reward": 0.07219797372817993, "reward_std": 0.24258717894554138, "rewards/reward_func/mean": 0.07219797372817993, "rewards/reward_func/std": 0.24258717894554138, "step": 567, "step_time": 35.50100315362215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 199.1875, "completions/mean_terminated_length": 199.1875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.4303721934556961, "epoch": 0.026308476146364057, "frac_reward_zero_std": 0.0, "grad_norm": 0.08563347905874252, "kl": 0.0017922719707712531, "learning_rate": 9.94747568318666e-07, "loss": -0.0331, "num_tokens": 15786908.0, "reward": 0.22827517986297607, "reward_std": 0.4083510637283325, "rewards/reward_func/mean": 0.22827517986297607, "rewards/reward_func/std": 0.4083510637283325, "step": 568, "step_time": 22.553745798766613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 135.0625, "completions/mean_terminated_length": 135.0625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3000262975692749, "epoch": 0.02635479388605836, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014759069308638573, "kl": 0.0014760128979105502, "learning_rate": 9.947383047707272e-07, "loss": 0.0001, "num_tokens": 15811437.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 569, "step_time": 15.757252767682076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 195.375, "completions/mean_terminated_length": 195.375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.33487706631422043, "epoch": 0.026401111625752663, "frac_reward_zero_std": 1.0, "grad_norm": 0.000654570001643151, "kl": 0.0012129094975534827, "learning_rate": 9.947290412227883e-07, "loss": 0.0001, "num_tokens": 15843091.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 570, "step_time": 21.559228021651506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 156.5625, "completions/mean_terminated_length": 156.5625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.16004950553178787, "epoch": 0.026447429365446966, "frac_reward_zero_std": 0.0, "grad_norm": 0.053859077394008636, "kl": 0.0007608709565829486, "learning_rate": 9.947197776748494e-07, "loss": 0.0176, "num_tokens": 15871244.0, "reward": 0.9782751798629761, "reward_std": 0.03886254131793976, "rewards/reward_func/mean": 0.9782751798629761, "rewards/reward_func/std": 0.03886253759264946, "step": 571, "step_time": 17.66007414087653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 178.5, "completions/mean_terminated_length": 178.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.36595743894577026, "epoch": 0.02649374710514127, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010515021858736873, "kl": 0.0011732606799341738, "learning_rate": 9.947105141269105e-07, "loss": 0.0001, "num_tokens": 15892852.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 572, "step_time": 20.06080413982272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 179.9375, "completions/mean_terminated_length": 179.9375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.17148252204060555, "epoch": 0.02654006484483557, "frac_reward_zero_std": 1.0, "grad_norm": 0.00039555327384732664, "kl": 0.0006037524726707488, "learning_rate": 9.947012505789717e-07, "loss": 0.0, "num_tokens": 15926035.0, "reward": 0.9214109182357788, "reward_std": 0.0, "rewards/reward_func/mean": 0.9214109182357788, "rewards/reward_func/std": 0.0, "step": 573, "step_time": 21.273090057075024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 134.25, "completions/mean_terminated_length": 134.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.29171208292245865, "epoch": 0.026586382584529875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013545382535085082, "kl": 0.0012165867956355214, "learning_rate": 9.946919870310328e-07, "loss": 0.0001, "num_tokens": 15954263.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 574, "step_time": 17.165605064481497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 113.75, "completions/mean_terminated_length": 113.75, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.24007542058825493, "epoch": 0.026632700324224177, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016633195336908102, "kl": 0.0016958365449681878, "learning_rate": 9.946827234830941e-07, "loss": 0.0001, "num_tokens": 15973587.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 575, "step_time": 13.618320003151894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 205.625, "completions/mean_terminated_length": 205.625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.2145506590604782, "epoch": 0.02667901806391848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006946335779502988, "kl": 0.000943855571676977, "learning_rate": 9.946734599351552e-07, "loss": 0.0, "num_tokens": 16007629.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 576, "step_time": 23.893839932978153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 225.9375, "completions/mean_terminated_length": 225.9375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.19930892810225487, "epoch": 0.026725335803612783, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003200446080882102, "kl": 0.0006423304002964869, "learning_rate": 9.946641963872162e-07, "loss": 0.0, "num_tokens": 16038444.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 577, "step_time": 24.372179005295038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 158.625, "completions/mean_terminated_length": 158.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3417799845337868, "epoch": 0.026771653543307086, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006592239369638264, "kl": 0.0012571831757668406, "learning_rate": 9.946549328392773e-07, "loss": 0.0001, "num_tokens": 16062406.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 578, "step_time": 17.421500850468874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 111.0, "completions/mean_terminated_length": 111.0, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.29721803963184357, "epoch": 0.02681797128300139, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014690555399283767, "kl": 0.0014924766728654504, "learning_rate": 9.946456692913386e-07, "loss": 0.0001, "num_tokens": 16082150.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 579, "step_time": 12.813174404203892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 126.3125, "completions/mean_terminated_length": 126.3125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2668394297361374, "epoch": 0.026864289022695692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011392865562811494, "kl": 0.0012131782714277506, "learning_rate": 9.946364057433997e-07, "loss": 0.0001, "num_tokens": 16104267.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 580, "step_time": 13.761691994965076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 126.6875, "completions/mean_terminated_length": 126.6875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3839539736509323, "epoch": 0.026910606762389995, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009683132520876825, "kl": 0.0014040677051525563, "learning_rate": 9.946271421954609e-07, "loss": 0.0001, "num_tokens": 16128806.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 581, "step_time": 14.585123918950558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 158.625, "completions/mean_terminated_length": 158.625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.3227327987551689, "epoch": 0.026956924502084298, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008882497786544263, "kl": 0.0011051030887756497, "learning_rate": 9.94617878647522e-07, "loss": 0.0001, "num_tokens": 16151552.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 582, "step_time": 16.934905491769314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 201.1875, "completions/mean_terminated_length": 201.1875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.4244815856218338, "epoch": 0.0270032422417786, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009207507828250527, "kl": 0.0014205464394763112, "learning_rate": 9.946086150995831e-07, "loss": 0.0001, "num_tokens": 16183907.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 583, "step_time": 26.793815910816193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 168.4375, "completions/mean_terminated_length": 168.4375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.14582490175962448, "epoch": 0.027049559981472904, "frac_reward_zero_std": 1.0, "grad_norm": 0.000684766098856926, "kl": 0.0006677401397610083, "learning_rate": 9.945993515516442e-07, "loss": 0.0, "num_tokens": 16219594.0, "reward": 0.9428731203079224, "reward_std": 0.0, "rewards/reward_func/mean": 0.9428731203079224, "rewards/reward_func/std": 0.0, "step": 584, "step_time": 21.359646912664175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 197.75, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.20227548852562904, "epoch": 0.027095877721167207, "frac_reward_zero_std": 0.0, "grad_norm": 0.04735409840941429, "kl": 0.0008678182202856988, "learning_rate": 9.945900880037054e-07, "loss": 0.0129, "num_tokens": 16246934.0, "reward": 0.9480330944061279, "reward_std": 0.012785443104803562, "rewards/reward_func/mean": 0.9480330944061279, "rewards/reward_func/std": 0.012785449624061584, "step": 585, "step_time": 20.081316489726305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 127.375, "completions/mean_terminated_length": 127.375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2501451075077057, "epoch": 0.02714219546086151, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009449580684304237, "kl": 0.0010224088036920875, "learning_rate": 9.945808244557665e-07, "loss": 0.0001, "num_tokens": 16270748.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 586, "step_time": 15.71675755828619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 179.125, "completions/mean_terminated_length": 179.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.19648923724889755, "epoch": 0.027188513200555812, "frac_reward_zero_std": 0.0, "grad_norm": 0.13904128968715668, "kl": 0.0014718308666488156, "learning_rate": 9.945715609078276e-07, "loss": -0.0463, "num_tokens": 16292158.0, "reward": 0.9261720180511475, "reward_std": 0.1320675164461136, "rewards/reward_func/mean": 0.9261720180511475, "rewards/reward_func/std": 0.1320675015449524, "step": 587, "step_time": 18.391562066972256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 149.625, "completions/mean_terminated_length": 149.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.1223986130207777, "epoch": 0.027234830940250115, "frac_reward_zero_std": 1.0, "grad_norm": 0.00037939532194286585, "kl": 0.0005342196382116526, "learning_rate": 9.945622973598887e-07, "loss": 0.0, "num_tokens": 16314776.0, "reward": 0.8824968934059143, "reward_std": 0.0, "rewards/reward_func/mean": 0.8824968934059143, "rewards/reward_func/std": 0.0, "step": 588, "step_time": 15.983282055705786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 174.9375, "completions/mean_terminated_length": 174.9375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.41517775505781174, "epoch": 0.02728114867994442, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009759520762600005, "kl": 0.001477598212659359, "learning_rate": 9.945530338119499e-07, "loss": 0.0001, "num_tokens": 16346375.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 589, "step_time": 21.28076909482479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 149.0625, "completions/mean_terminated_length": 149.0625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.34951143711805344, "epoch": 0.02732746641963872, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010513507295399904, "kl": 0.0013794851256534457, "learning_rate": 9.94543770264011e-07, "loss": 0.0001, "num_tokens": 16367064.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 590, "step_time": 17.958604458719492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 129.9375, "completions/mean_terminated_length": 129.9375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.29924070835113525, "epoch": 0.027373784159333024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008221545722335577, "kl": 0.0011459146771812811, "learning_rate": 9.945345067160721e-07, "loss": 0.0001, "num_tokens": 16394343.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 591, "step_time": 15.76231737434864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 209.125, "completions/mean_terminated_length": 209.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.21035076677799225, "epoch": 0.027420101899027327, "frac_reward_zero_std": 0.0, "grad_norm": 0.07679693400859833, "kl": 0.0009747492003953084, "learning_rate": 9.945252431681334e-07, "loss": -0.0168, "num_tokens": 16430713.0, "reward": 0.907114565372467, "reward_std": 0.1997753530740738, "rewards/reward_func/mean": 0.907114565372467, "rewards/reward_func/std": 0.1997753530740738, "step": 592, "step_time": 24.519479889422655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 156.625, "completions/mean_terminated_length": 156.625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.3544777110219002, "epoch": 0.02746641963872163, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010482023935765028, "kl": 0.0015595678996760398, "learning_rate": 9.945159796201946e-07, "loss": 0.0001, "num_tokens": 16485491.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 593, "step_time": 24.91678934916854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 131.8125, "completions/mean_terminated_length": 131.8125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.31441253423690796, "epoch": 0.027512737378415933, "frac_reward_zero_std": 1.0, "grad_norm": 0.001301216776482761, "kl": 0.00176118579111062, "learning_rate": 9.945067160722557e-07, "loss": 0.0001, "num_tokens": 16513808.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 594, "step_time": 17.711910124868155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 142.0625, "completions/mean_terminated_length": 142.0625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3171621486544609, "epoch": 0.027559055118110236, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007246560999192297, "kl": 0.0011956649977946654, "learning_rate": 9.944974525243166e-07, "loss": 0.0001, "num_tokens": 16534913.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 595, "step_time": 16.61210546270013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 127.75, "completions/mean_terminated_length": 127.75, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.242061547935009, "epoch": 0.02760537285780454, "frac_reward_zero_std": 1.0, "grad_norm": 0.002108033047989011, "kl": 0.0013999624352436513, "learning_rate": 9.94488188976378e-07, "loss": 0.0001, "num_tokens": 16554509.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 596, "step_time": 14.333650436252356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 115.4375, "completions/mean_terminated_length": 115.4375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.31971098482608795, "epoch": 0.02765169059749884, "frac_reward_zero_std": 1.0, "grad_norm": 0.001012361142784357, "kl": 0.0013101025542709976, "learning_rate": 9.94478925428439e-07, "loss": 0.0001, "num_tokens": 16575684.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 597, "step_time": 13.299234211444855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 209.5, "completions/mean_terminated_length": 209.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.36262890696525574, "epoch": 0.027698008337193145, "frac_reward_zero_std": 0.0, "grad_norm": 0.0730704814195633, "kl": 0.0011368568520992994, "learning_rate": 9.944696618805002e-07, "loss": -0.0493, "num_tokens": 16598876.0, "reward": 0.2151769995689392, "reward_std": 0.38492029905319214, "rewards/reward_func/mean": 0.2151769995689392, "rewards/reward_func/std": 0.38492029905319214, "step": 598, "step_time": 24.221900921314955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 209.5, "completions/mean_terminated_length": 209.5, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.19882144778966904, "epoch": 0.027744326076887448, "frac_reward_zero_std": 1.0, "grad_norm": 0.00044133211486041546, "kl": 0.0006942116451682523, "learning_rate": 9.944603983325613e-07, "loss": 0.0, "num_tokens": 16624212.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 599, "step_time": 21.113984003663063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 282.1875, "completions/mean_terminated_length": 282.1875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.30956175178289413, "epoch": 0.02779064381658175, "frac_reward_zero_std": 0.0, "grad_norm": 0.04635261371731758, "kl": 0.0010746768966782838, "learning_rate": 9.944511347846224e-07, "loss": -0.3788, "num_tokens": 16658311.0, "reward": 0.1990107297897339, "reward_std": 0.3048628866672516, "rewards/reward_func/mean": 0.1990107297897339, "rewards/reward_func/std": 0.3048628866672516, "step": 600, "step_time": 39.032493986189365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 160.3125, "completions/mean_terminated_length": 160.3125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.32160045206546783, "epoch": 0.027836961556276053, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006753444904461503, "kl": 0.0012247733538970351, "learning_rate": 9.944418712366836e-07, "loss": 0.0001, "num_tokens": 16679404.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 601, "step_time": 17.26866102963686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 221.125, "completions/mean_terminated_length": 221.125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.3007632941007614, "epoch": 0.027883279295970356, "frac_reward_zero_std": 0.0, "grad_norm": 0.06767585128545761, "kl": 0.0011035270581487566, "learning_rate": 9.944326076887447e-07, "loss": 0.0107, "num_tokens": 16710062.0, "reward": 0.5670922994613647, "reward_std": 0.4607682228088379, "rewards/reward_func/mean": 0.5670922994613647, "rewards/reward_func/std": 0.4607682228088379, "step": 602, "step_time": 25.07387748733163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 190.6875, "completions/mean_terminated_length": 190.6875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.30277078598737717, "epoch": 0.02792959703566466, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006902308668941259, "kl": 0.0012163456704001874, "learning_rate": 9.944233441408058e-07, "loss": 0.0001, "num_tokens": 16741433.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 603, "step_time": 21.679902721196413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 171.0, "completions/mean_terminated_length": 171.0, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.2468840256333351, "epoch": 0.027975914775358962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004982223617844284, "kl": 0.0009107012447202578, "learning_rate": 9.94414080592867e-07, "loss": 0.0, "num_tokens": 16773801.0, "reward": 0.788127601146698, "reward_std": 0.0, "rewards/reward_func/mean": 0.788127601146698, "rewards/reward_func/std": 0.0, "step": 604, "step_time": 19.09699758887291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 140.1875, "completions/mean_terminated_length": 140.1875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.2988521531224251, "epoch": 0.028022232515053265, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016440520994365215, "kl": 0.0015206864336505532, "learning_rate": 9.944048170449283e-07, "loss": 0.0001, "num_tokens": 16797276.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 605, "step_time": 16.558886874467134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 130.1875, "completions/mean_terminated_length": 130.1875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3045179918408394, "epoch": 0.028068550254747568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009281300008296967, "kl": 0.0012925256451126188, "learning_rate": 9.943955534969894e-07, "loss": 0.0001, "num_tokens": 16817215.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 606, "step_time": 15.212780736386776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 135.1875, "completions/mean_terminated_length": 135.1875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.30887145549058914, "epoch": 0.02811486799444187, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005796050536446273, "kl": 0.0009964118362404406, "learning_rate": 9.943862899490505e-07, "loss": 0.0001, "num_tokens": 16851202.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 607, "step_time": 17.76537637040019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 158.0625, "completions/mean_terminated_length": 158.0625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.38649002462625504, "epoch": 0.028161185734136174, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012031864607706666, "kl": 0.0015913881361484528, "learning_rate": 9.943770264011114e-07, "loss": 0.0001, "num_tokens": 16882403.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 608, "step_time": 19.019992608577013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 157.5625, "completions/mean_terminated_length": 157.5625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.40181419998407364, "epoch": 0.028207503473830477, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006465300102718174, "kl": 0.0013402352924458683, "learning_rate": 9.943677628531728e-07, "loss": 0.0001, "num_tokens": 16927820.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 609, "step_time": 22.771573085337877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 149.6875, "completions/mean_terminated_length": 149.6875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.26405899599194527, "epoch": 0.02825382121352478, "frac_reward_zero_std": 1.0, "grad_norm": 0.001259437995031476, "kl": 0.0012018869747407734, "learning_rate": 9.94358499305234e-07, "loss": 0.0001, "num_tokens": 16947975.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 610, "step_time": 15.735291086137295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 183.6875, "completions/mean_terminated_length": 183.6875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.4246085360646248, "epoch": 0.028300138953219083, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008513656794093549, "kl": 0.0013288070040289313, "learning_rate": 9.94349235757295e-07, "loss": 0.0001, "num_tokens": 16973810.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 611, "step_time": 19.875703874975443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 145.5, "completions/mean_terminated_length": 145.5, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3269863650202751, "epoch": 0.028346456692913385, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012261215597391129, "kl": 0.001385840296279639, "learning_rate": 9.943399722093562e-07, "loss": 0.0001, "num_tokens": 16998506.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 612, "step_time": 16.11475994810462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 193.1875, "completions/mean_terminated_length": 193.1875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4552323967218399, "epoch": 0.02839277443260769, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010839778697118163, "kl": 0.0018463747692294419, "learning_rate": 9.943307086614173e-07, "loss": 0.0001, "num_tokens": 17039917.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 613, "step_time": 25.193405266851187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 266.0, "completions/mean_terminated_length": 266.0, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.2519208565354347, "epoch": 0.02843909217230199, "frac_reward_zero_std": 0.0, "grad_norm": 0.05388407036662102, "kl": 0.001017692542518489, "learning_rate": 9.943214451134784e-07, "loss": -0.0893, "num_tokens": 17073965.0, "reward": 0.6615020632743835, "reward_std": 0.44653603434562683, "rewards/reward_func/mean": 0.6615020632743835, "rewards/reward_func/std": 0.44653603434562683, "step": 614, "step_time": 27.768390368670225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 125.5625, "completions/mean_terminated_length": 125.5625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.25032806396484375, "epoch": 0.028485409911996294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009284821571782231, "kl": 0.0011010119051206857, "learning_rate": 9.943121815655395e-07, "loss": 0.0001, "num_tokens": 17093478.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 615, "step_time": 13.723979100584984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 158.875, "completions/mean_terminated_length": 158.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2890133857727051, "epoch": 0.028531727651690597, "frac_reward_zero_std": 0.0, "grad_norm": 0.13864818215370178, "kl": 0.0014494710485450923, "learning_rate": 9.943029180176007e-07, "loss": 0.1458, "num_tokens": 17120036.0, "reward": 0.7270569801330566, "reward_std": 0.36072129011154175, "rewards/reward_func/mean": 0.7270569801330566, "rewards/reward_func/std": 0.36072129011154175, "step": 616, "step_time": 21.460193186998367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 201.9375, "completions/mean_terminated_length": 201.9375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.19010262191295624, "epoch": 0.0285780453913849, "frac_reward_zero_std": 0.0, "grad_norm": 0.045914456248283386, "kl": 0.0007538544741692021, "learning_rate": 9.942936544696618e-07, "loss": 0.0197, "num_tokens": 17149395.0, "reward": 0.9353712797164917, "reward_std": 0.01723429374396801, "rewards/reward_func/mean": 0.9353712797164917, "rewards/reward_func/std": 0.017234310507774353, "step": 617, "step_time": 22.644675966352224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 116.3125, "completions/mean_terminated_length": 116.3125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.28089043498039246, "epoch": 0.028624363131079203, "frac_reward_zero_std": 1.0, "grad_norm": 0.001129591604694724, "kl": 0.001515325391665101, "learning_rate": 9.94284390921723e-07, "loss": 0.0001, "num_tokens": 17172552.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 618, "step_time": 13.440848540514708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 126.625, "completions/mean_terminated_length": 126.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2327614687383175, "epoch": 0.028670680870773506, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011413868051022291, "kl": 0.0010565890843281522, "learning_rate": 9.942751273737842e-07, "loss": 0.0001, "num_tokens": 17193346.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 619, "step_time": 13.750414993613958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 118.0, "completions/mean_terminated_length": 118.0, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2940201684832573, "epoch": 0.02871699861046781, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022562395315617323, "kl": 0.0014221240853657946, "learning_rate": 9.942658638258452e-07, "loss": 0.0001, "num_tokens": 17213282.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 620, "step_time": 13.1810467466712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 171.9375, "completions/mean_terminated_length": 171.9375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.2548486962914467, "epoch": 0.02876331635016211, "frac_reward_zero_std": 0.0, "grad_norm": 0.09021364897489548, "kl": 0.000993866182398051, "learning_rate": 9.942566002779063e-07, "loss": -0.0402, "num_tokens": 17238513.0, "reward": 0.23816077411174774, "reward_std": 0.02131837047636509, "rewards/reward_func/mean": 0.23816077411174774, "rewards/reward_func/std": 0.02131836861371994, "step": 621, "step_time": 18.491280663758516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 156.5625, "completions/mean_terminated_length": 156.5625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.24587681889533997, "epoch": 0.028809634089856415, "frac_reward_zero_std": 0.0, "grad_norm": 0.10550187528133392, "kl": 0.00158912711776793, "learning_rate": 9.942473367299676e-07, "loss": 0.0443, "num_tokens": 17269882.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 622, "step_time": 19.237867150455713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 133.125, "completions/mean_terminated_length": 133.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.1528434455394745, "epoch": 0.028855951829550718, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010806182399392128, "kl": 0.0008311954879900441, "learning_rate": 9.942380731820287e-07, "loss": 0.0, "num_tokens": 17290028.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 623, "step_time": 14.853768266737461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 174.8125, "completions/mean_terminated_length": 174.8125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.3995269536972046, "epoch": 0.02890226956924502, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015165697550401092, "kl": 0.0018526142812334, "learning_rate": 9.942288096340899e-07, "loss": 0.0001, "num_tokens": 17323161.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 624, "step_time": 20.973382882773876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 182.5, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.1739337407052517, "epoch": 0.028948587308939323, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004962090752087533, "kl": 0.0007467141549568623, "learning_rate": 9.94219546086151e-07, "loss": 0.0, "num_tokens": 17360641.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 625, "step_time": 21.581327740103006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.3666951060295105, "epoch": 0.028994905048633626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012159666512161493, "kl": 0.0014632449019700289, "learning_rate": 9.942102825382121e-07, "loss": 0.0001, "num_tokens": 17390001.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 626, "step_time": 19.581063494086266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 120.125, "completions/mean_terminated_length": 120.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.24443429335951805, "epoch": 0.02904122278832793, "frac_reward_zero_std": 1.0, "grad_norm": 0.002116140676662326, "kl": 0.00103072528145276, "learning_rate": 9.942010189902732e-07, "loss": 0.0001, "num_tokens": 17409299.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 627, "step_time": 12.728528279811144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 144.6875, "completions/mean_terminated_length": 144.6875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.304414801299572, "epoch": 0.029087540528022232, "frac_reward_zero_std": 1.0, "grad_norm": 0.000975096772890538, "kl": 0.0013835621648468077, "learning_rate": 9.941917554423344e-07, "loss": 0.0001, "num_tokens": 17440190.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 628, "step_time": 17.68225933238864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 115.0, "completions/mean_terminated_length": 115.0, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.27537262067198753, "epoch": 0.029133858267716535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012910662917420268, "kl": 0.0012789879401680082, "learning_rate": 9.941824918943955e-07, "loss": 0.0001, "num_tokens": 17459598.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 629, "step_time": 12.648746185004711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 222.5625, "completions/mean_terminated_length": 222.5625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.30768710747361183, "epoch": 0.029180176007410838, "frac_reward_zero_std": 0.0, "grad_norm": 0.06626363843679428, "kl": 0.0008501510455971584, "learning_rate": 9.941732283464566e-07, "loss": 0.0411, "num_tokens": 17499079.0, "reward": 0.05458332970738411, "reward_std": 0.03800695016980171, "rewards/reward_func/mean": 0.05458332970738411, "rewards/reward_func/std": 0.03800695016980171, "step": 630, "step_time": 31.33593814447522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 262.0625, "completions/mean_terminated_length": 262.0625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.2470867969095707, "epoch": 0.02922649374710514, "frac_reward_zero_std": 0.0, "grad_norm": 0.0614062137901783, "kl": 0.0006329155585262924, "learning_rate": 9.941639647985177e-07, "loss": -0.0538, "num_tokens": 17523640.0, "reward": 0.7354071140289307, "reward_std": 0.28707355260849, "rewards/reward_func/mean": 0.7354071140289307, "rewards/reward_func/std": 0.2870735824108124, "step": 631, "step_time": 26.915731094777584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 141.3125, "completions/mean_terminated_length": 141.3125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.21450427547097206, "epoch": 0.029272811486799444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015268087154254317, "kl": 0.0008688341185916215, "learning_rate": 9.941547012505789e-07, "loss": 0.0, "num_tokens": 17543437.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 632, "step_time": 14.755547858774662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 153.0625, "completions/mean_terminated_length": 153.0625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.37368296831846237, "epoch": 0.029319129226493747, "frac_reward_zero_std": 1.0, "grad_norm": 0.001764249405823648, "kl": 0.0018268795975018293, "learning_rate": 9.9414543770264e-07, "loss": 0.0001, "num_tokens": 17585598.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 633, "step_time": 20.64495661482215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 143.1875, "completions/mean_terminated_length": 143.1875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.29835107922554016, "epoch": 0.02936544696618805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006717204814776778, "kl": 0.001157870385213755, "learning_rate": 9.941361741547011e-07, "loss": 0.0001, "num_tokens": 17612289.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 634, "step_time": 15.97997947409749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 132.9375, "completions/mean_terminated_length": 132.9375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3083609566092491, "epoch": 0.029411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011125041637569666, "kl": 0.0014098618412390351, "learning_rate": 9.941269106067625e-07, "loss": 0.0001, "num_tokens": 17635552.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 635, "step_time": 14.866094164550304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 190.75, "completions/mean_terminated_length": 190.75, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.22032272443175316, "epoch": 0.029458082445576655, "frac_reward_zero_std": 0.0, "grad_norm": 0.10421456396579742, "kl": 0.002787069766782224, "learning_rate": 9.941176470588236e-07, "loss": -0.033, "num_tokens": 17672476.0, "reward": 0.6601945161819458, "reward_std": 0.1685907542705536, "rewards/reward_func/mean": 0.6601945161819458, "rewards/reward_func/std": 0.1685907542705536, "step": 636, "step_time": 22.77554728835821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 237.625, "completions/mean_terminated_length": 237.625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.3203084096312523, "epoch": 0.02950440018527096, "frac_reward_zero_std": 0.0, "grad_norm": 0.0523492656648159, "kl": 0.0009472902747802436, "learning_rate": 9.941083835108847e-07, "loss": -0.0273, "num_tokens": 17700726.0, "reward": 0.5625, "reward_std": 0.5123475193977356, "rewards/reward_func/mean": 0.5625, "rewards/reward_func/std": 0.5123475790023804, "step": 637, "step_time": 27.360778879374266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 189.5625, "completions/mean_terminated_length": 189.5625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3144254833459854, "epoch": 0.02955071792496526, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012330756289884448, "kl": 0.0012759679520968348, "learning_rate": 9.940991199629456e-07, "loss": 0.0001, "num_tokens": 17723903.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 638, "step_time": 23.00736243277788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 137.125, "completions/mean_terminated_length": 137.125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2956559658050537, "epoch": 0.029597035664659564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007645520381629467, "kl": 0.001081657683243975, "learning_rate": 9.94089856415007e-07, "loss": 0.0001, "num_tokens": 17745201.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 639, "step_time": 15.197411470115185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4124782010912895, "epoch": 0.029643353404353867, "frac_reward_zero_std": 0.0, "grad_norm": 0.08005214482545853, "kl": 0.0014793944428674877, "learning_rate": 9.94080592867068e-07, "loss": 0.0139, "num_tokens": 17767669.0, "reward": 0.23001110553741455, "reward_std": 0.4114563465118408, "rewards/reward_func/mean": 0.23001110553741455, "rewards/reward_func/std": 0.4114563763141632, "step": 640, "step_time": 20.014618009328842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 130.5625, "completions/mean_terminated_length": 130.5625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.24497872963547707, "epoch": 0.02968967114404817, "frac_reward_zero_std": 1.0, "grad_norm": 0.001171390525996685, "kl": 0.0010745792533271015, "learning_rate": 9.940713293191292e-07, "loss": 0.0001, "num_tokens": 17787262.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 641, "step_time": 13.819535158574581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 137.0625, "completions/mean_terminated_length": 137.0625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2683763653039932, "epoch": 0.029735988883742473, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020587260369211435, "kl": 0.001427650888217613, "learning_rate": 9.940620657711903e-07, "loss": 0.0001, "num_tokens": 17823295.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 642, "step_time": 18.109300438314676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 157.9375, "completions/mean_terminated_length": 157.9375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.14643412828445435, "epoch": 0.029782306623436776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005344918463379145, "kl": 0.0007510486320825294, "learning_rate": 9.940528022232515e-07, "loss": 0.0, "num_tokens": 17844654.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 643, "step_time": 16.412567649036646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 155.125, "completions/mean_terminated_length": 155.125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.2538882680237293, "epoch": 0.02982862436313108, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007700459682382643, "kl": 0.0010706761095207185, "learning_rate": 9.940435386753126e-07, "loss": 0.0001, "num_tokens": 17874240.0, "reward": 0.030798785388469696, "reward_std": 0.0, "rewards/reward_func/mean": 0.030798785388469696, "rewards/reward_func/std": 0.0, "step": 644, "step_time": 18.515170965343714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 175.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.404374897480011, "epoch": 0.02987494210282538, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007907731342129409, "kl": 0.0010905256785918027, "learning_rate": 9.940342751273737e-07, "loss": 0.0001, "num_tokens": 17899104.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 645, "step_time": 18.668986041098833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.22166218608617783, "epoch": 0.029921259842519685, "frac_reward_zero_std": 0.0, "grad_norm": 0.05660930275917053, "kl": 0.0013204931165091693, "learning_rate": 9.940250115794348e-07, "loss": -0.0253, "num_tokens": 17927312.0, "reward": 0.29995638132095337, "reward_std": 0.16590999066829681, "rewards/reward_func/mean": 0.29995638132095337, "rewards/reward_func/std": 0.165910005569458, "step": 646, "step_time": 19.130444202572107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 177.5, "completions/mean_terminated_length": 177.5, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.4196074977517128, "epoch": 0.029967577582213988, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006432700320146978, "kl": 0.0012449279602151364, "learning_rate": 9.94015748031496e-07, "loss": 0.0001, "num_tokens": 17948568.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 647, "step_time": 18.71919671073556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 112.6875, "completions/mean_terminated_length": 112.6875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.27163299173116684, "epoch": 0.03001389532190829, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009595048613846302, "kl": 0.0012287074932828546, "learning_rate": 9.94006484483557e-07, "loss": 0.0001, "num_tokens": 17969571.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 648, "step_time": 13.595854740589857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 127.375, "completions/mean_terminated_length": 127.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2343696802854538, "epoch": 0.030060213061602593, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008790047722868621, "kl": 0.0011380193172954023, "learning_rate": 9.939972209356184e-07, "loss": 0.0001, "num_tokens": 17989209.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 649, "step_time": 13.889198988676071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 120.0625, "completions/mean_terminated_length": 120.0625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3073343336582184, "epoch": 0.030106530801296896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010500141652300954, "kl": 0.0011955422814935446, "learning_rate": 9.939879573876795e-07, "loss": 0.0001, "num_tokens": 18011130.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 650, "step_time": 14.17166980728507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 170.1875, "completions/mean_terminated_length": 170.1875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.2249767668545246, "epoch": 0.0301528485409912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006380456034094095, "kl": 0.0009065577760338783, "learning_rate": 9.939786938397405e-07, "loss": 0.0, "num_tokens": 18033565.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 651, "step_time": 18.464233096688986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 205.75, "completions/mean_terminated_length": 205.75, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.4476078003644943, "epoch": 0.030199166280685502, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007847800734452903, "kl": 0.0012855999520979822, "learning_rate": 9.939694302918018e-07, "loss": 0.0001, "num_tokens": 18063513.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 652, "step_time": 22.482271995395422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 279.75, "completions/mean_terminated_length": 279.75, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.2994404062628746, "epoch": 0.030245484020379805, "frac_reward_zero_std": 0.0, "grad_norm": 0.05273708701133728, "kl": 0.0009134301217272878, "learning_rate": 9.93960166743863e-07, "loss": -0.0238, "num_tokens": 18098741.0, "reward": 0.7205860614776611, "reward_std": 0.14855854213237762, "rewards/reward_func/mean": 0.7205860614776611, "rewards/reward_func/std": 0.14855854213237762, "step": 653, "step_time": 29.735829323530197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 174.75, "completions/mean_terminated_length": 174.75, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.26028087735176086, "epoch": 0.030291801760074108, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009759777458384633, "kl": 0.0010395431017968804, "learning_rate": 9.93950903195924e-07, "loss": 0.0001, "num_tokens": 18120545.0, "reward": 0.4723665416240692, "reward_std": 0.0, "rewards/reward_func/mean": 0.4723665416240692, "rewards/reward_func/std": 0.0, "step": 654, "step_time": 18.53537382557988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 205.1875, "completions/mean_terminated_length": 205.1875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.4086238220334053, "epoch": 0.03033811949976841, "frac_reward_zero_std": 0.0, "grad_norm": 0.07738785445690155, "kl": 0.0015518628933932632, "learning_rate": 9.939416396479852e-07, "loss": -0.1105, "num_tokens": 18142452.0, "reward": 0.18607217073440552, "reward_std": 0.3487555980682373, "rewards/reward_func/mean": 0.18607217073440552, "rewards/reward_func/std": 0.3487556278705597, "step": 655, "step_time": 24.589965999126434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 172.9375, "completions/mean_terminated_length": 172.9375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3716308921575546, "epoch": 0.030384437239462714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010996874189004302, "kl": 0.0015363656275440007, "learning_rate": 9.939323761000463e-07, "loss": 0.0001, "num_tokens": 18193555.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 656, "step_time": 24.94709513708949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 203.9375, "completions/mean_terminated_length": 203.9375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.24424386769533157, "epoch": 0.030430754979157017, "frac_reward_zero_std": 0.0, "grad_norm": 0.06499525904655457, "kl": 0.0011902508849743754, "learning_rate": 9.939231125521074e-07, "loss": -0.0378, "num_tokens": 18231474.0, "reward": 0.8895835876464844, "reward_std": 0.23738820850849152, "rewards/reward_func/mean": 0.8895835876464844, "rewards/reward_func/std": 0.2373882383108139, "step": 657, "step_time": 23.588997296988964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 133.4375, "completions/mean_terminated_length": 133.4375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2545120343565941, "epoch": 0.03047707271885132, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011487004812806845, "kl": 0.0011745925585273653, "learning_rate": 9.939138490041685e-07, "loss": 0.0001, "num_tokens": 18251545.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 658, "step_time": 14.423538401722908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 196.6875, "completions/mean_terminated_length": 196.6875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.3835623487830162, "epoch": 0.030523390458545623, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008706080261617899, "kl": 0.0012670426804106683, "learning_rate": 9.939045854562297e-07, "loss": 0.0001, "num_tokens": 18279988.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 659, "step_time": 23.73527555912733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 200.0625, "completions/mean_terminated_length": 200.0625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.32817748188972473, "epoch": 0.030569708198239925, "frac_reward_zero_std": 0.0, "grad_norm": 0.10197916626930237, "kl": 0.0013327872147783637, "learning_rate": 9.938953219082908e-07, "loss": -0.0552, "num_tokens": 18305589.0, "reward": 0.11260820180177689, "reward_std": 0.1164192333817482, "rewards/reward_func/mean": 0.11260820180177689, "rewards/reward_func/std": 0.1164192333817482, "step": 660, "step_time": 21.253589272499084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 197.8125, "completions/mean_terminated_length": 197.8125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.2890971675515175, "epoch": 0.03061602593793423, "frac_reward_zero_std": 0.0, "grad_norm": 0.08252923935651779, "kl": 0.001193336385767907, "learning_rate": 9.93886058360352e-07, "loss": 0.0051, "num_tokens": 18335218.0, "reward": 0.5322065353393555, "reward_std": 0.4849793314933777, "rewards/reward_func/mean": 0.5322065353393555, "rewards/reward_func/std": 0.4849793612957001, "step": 661, "step_time": 21.42082080245018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 139.5, "completions/mean_terminated_length": 139.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.27542803436517715, "epoch": 0.03066234367762853, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006713405600748956, "kl": 0.0009908601932693273, "learning_rate": 9.938767948124132e-07, "loss": 0.0, "num_tokens": 18354986.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 662, "step_time": 15.230031374841928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 137.5, "completions/mean_terminated_length": 137.5, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.33815905451774597, "epoch": 0.030708661417322834, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010544717079028487, "kl": 0.0012565023789647967, "learning_rate": 9.938675312644742e-07, "loss": 0.0001, "num_tokens": 18391026.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 663, "step_time": 19.1298321262002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.41814157366752625, "epoch": 0.030754979157017137, "frac_reward_zero_std": 0.0, "grad_norm": 0.07524161785840988, "kl": 0.0014231793174985796, "learning_rate": 9.938582677165353e-07, "loss": -0.0685, "num_tokens": 18420798.0, "reward": 0.058713316917419434, "reward_std": 0.23485326766967773, "rewards/reward_func/mean": 0.058713316917419434, "rewards/reward_func/std": 0.23485328257083893, "step": 664, "step_time": 22.819775208830833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 212.875, "completions/mean_terminated_length": 212.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.3408391624689102, "epoch": 0.03080129689671144, "frac_reward_zero_std": 0.0, "grad_norm": 0.07138629257678986, "kl": 0.0011429604492150247, "learning_rate": 9.938490041685964e-07, "loss": -0.1248, "num_tokens": 18444636.0, "reward": 0.1933698058128357, "reward_std": 0.29622000455856323, "rewards/reward_func/mean": 0.1933698058128357, "rewards/reward_func/std": 0.29622000455856323, "step": 665, "step_time": 34.46691955626011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 128.0625, "completions/mean_terminated_length": 128.0625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.21934204548597336, "epoch": 0.030847614636405743, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015187819954007864, "kl": 0.0013615007337648422, "learning_rate": 9.938397406206577e-07, "loss": 0.0001, "num_tokens": 18464093.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 666, "step_time": 13.363538708537817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 150.6875, "completions/mean_terminated_length": 150.6875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3489423394203186, "epoch": 0.030893932376100046, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019261379493400455, "kl": 0.0015275503683369607, "learning_rate": 9.938304770727189e-07, "loss": 0.0001, "num_tokens": 18489144.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 667, "step_time": 17.13107032701373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 190.0, "completions/mean_terminated_length": 190.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.3804415836930275, "epoch": 0.03094025011579435, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003983758215326816, "kl": 0.0010572924802545458, "learning_rate": 9.9382121352478e-07, "loss": 0.0001, "num_tokens": 18509896.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 668, "step_time": 21.60600521788001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 157.0, "completions/mean_terminated_length": 157.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3257174715399742, "epoch": 0.03098656785548865, "frac_reward_zero_std": 0.0, "grad_norm": 0.11692310869693756, "kl": 0.001714501326205209, "learning_rate": 9.938119499768411e-07, "loss": -0.085, "num_tokens": 18531288.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.3415650427341461, "step": 669, "step_time": 18.257702708244324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 173.875, "completions/mean_terminated_length": 173.875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.30243848264217377, "epoch": 0.031032885595182955, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011854803888127208, "kl": 0.0012532831751741469, "learning_rate": 9.938026864289022e-07, "loss": 0.0001, "num_tokens": 18555542.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 670, "step_time": 19.74624053761363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 187.1875, "completions/mean_terminated_length": 187.1875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.4084184989333153, "epoch": 0.031079203334877258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010090351570397615, "kl": 0.0015629433910362422, "learning_rate": 9.937934228809634e-07, "loss": 0.0001, "num_tokens": 18579513.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 671, "step_time": 21.323964346200228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 118.8125, "completions/mean_terminated_length": 118.8125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.26723024621605873, "epoch": 0.03112552107457156, "frac_reward_zero_std": 1.0, "grad_norm": 0.001027853460982442, "kl": 0.0011152219667565078, "learning_rate": 9.937841593330245e-07, "loss": 0.0001, "num_tokens": 18600070.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 672, "step_time": 13.687433570623398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 195.6875, "completions/mean_terminated_length": 195.6875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.30489207804203033, "epoch": 0.031171838814265863, "frac_reward_zero_std": 0.0, "grad_norm": 0.07319233566522598, "kl": 0.001412037032423541, "learning_rate": 9.937748957850856e-07, "loss": 0.1205, "num_tokens": 18623745.0, "reward": 0.03309401869773865, "reward_std": 0.01641923189163208, "rewards/reward_func/mean": 0.03309401869773865, "rewards/reward_func/std": 0.01641923189163208, "step": 673, "step_time": 24.96692108362913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 138.5, "completions/mean_terminated_length": 138.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.2258298024535179, "epoch": 0.031218156553960166, "frac_reward_zero_std": 1.0, "grad_norm": 0.000795226835180074, "kl": 0.0010459717304911464, "learning_rate": 9.937656322371467e-07, "loss": 0.0001, "num_tokens": 18646553.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 674, "step_time": 15.537427980452776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 153.1875, "completions/mean_terminated_length": 153.1875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3664705604314804, "epoch": 0.03126447429365447, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009839312406256795, "kl": 0.001658998487982899, "learning_rate": 9.937563686892079e-07, "loss": 0.0001, "num_tokens": 18667644.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 675, "step_time": 16.918759364634752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 124.0625, "completions/mean_terminated_length": 124.0625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.25809136033058167, "epoch": 0.03131079203334877, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021402640268206596, "kl": 0.0013725956960115582, "learning_rate": 9.93747105141269e-07, "loss": 0.0001, "num_tokens": 18690877.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 676, "step_time": 14.425536841154099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 137.0625, "completions/mean_terminated_length": 137.0625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.34704677760601044, "epoch": 0.031357109773043075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020199345890432596, "kl": 0.0015883515297900885, "learning_rate": 9.937378415933301e-07, "loss": 0.0001, "num_tokens": 18720398.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 677, "step_time": 17.222666319459677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 121.25, "completions/mean_terminated_length": 121.25, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.28825974464416504, "epoch": 0.03140342751273738, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007279080455191433, "kl": 0.0011206761992070824, "learning_rate": 9.937285780453912e-07, "loss": 0.0001, "num_tokens": 18744434.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 678, "step_time": 14.141832951456308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 174.9375, "completions/mean_terminated_length": 174.9375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.3730523809790611, "epoch": 0.03144974525243168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006375746452249587, "kl": 0.0011505739093990996, "learning_rate": 9.937193144974526e-07, "loss": 0.0001, "num_tokens": 18772705.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 679, "step_time": 22.287692293524742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 159.3125, "completions/mean_terminated_length": 159.3125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3158295676112175, "epoch": 0.031496062992125984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012767198495566845, "kl": 0.0013278848491609097, "learning_rate": 9.937100509495137e-07, "loss": 0.0001, "num_tokens": 18800246.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 680, "step_time": 17.864353463053703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 136.25, "completions/mean_terminated_length": 136.25, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.35829856991767883, "epoch": 0.03154238073182029, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012588640674948692, "kl": 0.0017853121389634907, "learning_rate": 9.937007874015746e-07, "loss": 0.0001, "num_tokens": 18855642.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 681, "step_time": 23.69368052110076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 112.125, "completions/mean_terminated_length": 112.125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.24923455342650414, "epoch": 0.03158869847151459, "frac_reward_zero_std": 1.0, "grad_norm": 0.001384776784107089, "kl": 0.0011787291732616723, "learning_rate": 9.93691523853636e-07, "loss": 0.0001, "num_tokens": 18875004.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 682, "step_time": 12.379692498594522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 155.75, "completions/mean_terminated_length": 155.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4157654121518135, "epoch": 0.03163501621120889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006648803828284144, "kl": 0.0014104731380939484, "learning_rate": 9.93682260305697e-07, "loss": 0.0001, "num_tokens": 18909208.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 683, "step_time": 19.277349393814802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 139.25, "completions/mean_terminated_length": 139.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.20814481377601624, "epoch": 0.031681333950903195, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008208968793042004, "kl": 0.0009661391522968188, "learning_rate": 9.936729967577582e-07, "loss": 0.0, "num_tokens": 18939628.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 684, "step_time": 17.76503198221326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 144.3125, "completions/mean_terminated_length": 144.3125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.40677928179502487, "epoch": 0.0317276516905975, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007508598500862718, "kl": 0.0013949218846391886, "learning_rate": 9.936637332098193e-07, "loss": 0.0001, "num_tokens": 18984465.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 685, "step_time": 20.997331146150827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 137.9375, "completions/mean_terminated_length": 137.9375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2835551127791405, "epoch": 0.0317739694302918, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018957066349685192, "kl": 0.001669476885581389, "learning_rate": 9.936544696618805e-07, "loss": 0.0001, "num_tokens": 19005168.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 686, "step_time": 15.702382504940033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 111.1875, "completions/mean_terminated_length": 111.1875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.24426046386361122, "epoch": 0.031820287169986104, "frac_reward_zero_std": 1.0, "grad_norm": 0.001307345344685018, "kl": 0.0013860033504897729, "learning_rate": 9.936452061139416e-07, "loss": 0.0001, "num_tokens": 19025619.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 687, "step_time": 13.51840564981103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 191.0625, "completions/mean_terminated_length": 191.0625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.3493984639644623, "epoch": 0.03186660490968041, "frac_reward_zero_std": 0.0, "grad_norm": 0.10677601397037506, "kl": 0.0011691860418068245, "learning_rate": 9.936359425660027e-07, "loss": 0.0002, "num_tokens": 19053924.0, "reward": 0.01930723339319229, "reward_std": 0.03173091635107994, "rewards/reward_func/mean": 0.01930723339319229, "rewards/reward_func/std": 0.03173091635107994, "step": 688, "step_time": 22.76512398570776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 122.625, "completions/mean_terminated_length": 122.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.25590357929468155, "epoch": 0.03191292264937471, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011609547073021531, "kl": 0.000935665360884741, "learning_rate": 9.936266790180638e-07, "loss": 0.0, "num_tokens": 19075662.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 689, "step_time": 14.06349989399314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 133.9375, "completions/mean_terminated_length": 133.9375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3027240112423897, "epoch": 0.03195924038906901, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005010907188989222, "kl": 0.0008855178894009441, "learning_rate": 9.93617415470125e-07, "loss": 0.0, "num_tokens": 19103821.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 690, "step_time": 17.320686750113964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 141.125, "completions/mean_terminated_length": 141.125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.2675025314092636, "epoch": 0.032005558128763316, "frac_reward_zero_std": 1.0, "grad_norm": 0.001415303093381226, "kl": 0.0014167480112519115, "learning_rate": 9.93608151922186e-07, "loss": 0.0001, "num_tokens": 19123599.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 691, "step_time": 16.920801613479853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 136.4375, "completions/mean_terminated_length": 136.4375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3726763650774956, "epoch": 0.03205187586845762, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016297450056299567, "kl": 0.0019747884798562154, "learning_rate": 9.935988883742474e-07, "loss": 0.0001, "num_tokens": 19161190.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 692, "step_time": 18.093306742608547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 153.5625, "completions/mean_terminated_length": 153.5625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.436239056289196, "epoch": 0.03209819360815192, "frac_reward_zero_std": 1.0, "grad_norm": 0.001057538203895092, "kl": 0.0018068332865368575, "learning_rate": 9.935896248263085e-07, "loss": 0.0001, "num_tokens": 19210543.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 693, "step_time": 22.804465100169182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 184.0, "completions/mean_terminated_length": 184.0, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.36290524154901505, "epoch": 0.032144511347846225, "frac_reward_zero_std": 0.0, "grad_norm": 0.07543493807315826, "kl": 0.001298435847274959, "learning_rate": 9.935803612783695e-07, "loss": 0.0973, "num_tokens": 19237647.0, "reward": 0.6875, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.6875, "rewards/reward_func/std": 0.4787135720252991, "step": 694, "step_time": 22.445793222635984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 183.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.3356466591358185, "epoch": 0.03219082908754053, "frac_reward_zero_std": 0.0, "grad_norm": 0.09124850481748581, "kl": 0.0016333387466147542, "learning_rate": 9.935710977304306e-07, "loss": 0.0035, "num_tokens": 19262985.0, "reward": 0.058969270437955856, "reward_std": 0.10548744350671768, "rewards/reward_func/mean": 0.058969270437955856, "rewards/reward_func/std": 0.10548743605613708, "step": 695, "step_time": 23.154754973948002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 227.125, "completions/mean_terminated_length": 227.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.20573032274842262, "epoch": 0.03223714682723483, "frac_reward_zero_std": 0.0, "grad_norm": 0.06313058733940125, "kl": 0.0008511656487826258, "learning_rate": 9.93561834182492e-07, "loss": 0.0208, "num_tokens": 19299803.0, "reward": 0.9785919189453125, "reward_std": 0.08563227206468582, "rewards/reward_func/mean": 0.9785919189453125, "rewards/reward_func/std": 0.08563227951526642, "step": 696, "step_time": 24.87244164943695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 128.6875, "completions/mean_terminated_length": 128.6875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.350361131131649, "epoch": 0.03228346456692913, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009798979153856635, "kl": 0.00100500434928108, "learning_rate": 9.93552570634553e-07, "loss": 0.0001, "num_tokens": 19324214.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 697, "step_time": 15.373975336551666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 122.875, "completions/mean_terminated_length": 122.875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.29454803466796875, "epoch": 0.032329782306623436, "frac_reward_zero_std": 1.0, "grad_norm": 0.001718203304335475, "kl": 0.0014642720634583384, "learning_rate": 9.935433070866142e-07, "loss": 0.0001, "num_tokens": 19345044.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 698, "step_time": 13.65459056571126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 160.1875, "completions/mean_terminated_length": 160.1875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3443695157766342, "epoch": 0.03237610004631774, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009369025938212872, "kl": 0.0011791688884841278, "learning_rate": 9.935340435386753e-07, "loss": 0.0001, "num_tokens": 19366391.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 699, "step_time": 16.102231845259666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 147.125, "completions/mean_terminated_length": 147.125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.21535206213593483, "epoch": 0.03242241778601204, "frac_reward_zero_std": 1.0, "grad_norm": 0.00042517686961218715, "kl": 0.0007701033173361793, "learning_rate": 9.935247799907364e-07, "loss": 0.0, "num_tokens": 19387657.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 700, "step_time": 16.184305012226105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 197.125, "completions/mean_terminated_length": 197.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.34540799260139465, "epoch": 0.032468735525706345, "frac_reward_zero_std": 0.0, "grad_norm": 0.07063841819763184, "kl": 0.0010681675630621612, "learning_rate": 9.935155164427975e-07, "loss": -0.0456, "num_tokens": 19409131.0, "reward": 0.7472657561302185, "reward_std": 0.371590256690979, "rewards/reward_func/mean": 0.7472657561302185, "rewards/reward_func/std": 0.3715902864933014, "step": 701, "step_time": 24.055828519165516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 136.25, "completions/mean_terminated_length": 136.25, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2532971352338791, "epoch": 0.03251505326540065, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007650414481759071, "kl": 0.0010268701007589698, "learning_rate": 9.935062528948587e-07, "loss": 0.0001, "num_tokens": 19433935.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 702, "step_time": 15.407848794013262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 187.625, "completions/mean_terminated_length": 187.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.45612356066703796, "epoch": 0.03256137100509495, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006161848432384431, "kl": 0.0013176907668821514, "learning_rate": 9.934969893469198e-07, "loss": 0.0001, "num_tokens": 19459193.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 703, "step_time": 20.25391785427928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 224.4375, "completions/mean_terminated_length": 224.4375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.2529004104435444, "epoch": 0.032607688744789254, "frac_reward_zero_std": 0.0, "grad_norm": 0.05345242843031883, "kl": 0.0006900770240463316, "learning_rate": 9.93487725798981e-07, "loss": -0.0405, "num_tokens": 19493248.0, "reward": 0.9401124715805054, "reward_std": 0.23955021798610687, "rewards/reward_func/mean": 0.9401124715805054, "rewards/reward_func/std": 0.23955021798610687, "step": 704, "step_time": 23.980402942746878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 195.75, "completions/mean_terminated_length": 195.75, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.424755334854126, "epoch": 0.03265400648448356, "frac_reward_zero_std": 1.0, "grad_norm": 0.000872373057063669, "kl": 0.0015883336891420186, "learning_rate": 9.934784622510423e-07, "loss": 0.0001, "num_tokens": 19516060.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 705, "step_time": 22.41007725521922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.23423856869339943, "epoch": 0.03270032422417786, "frac_reward_zero_std": 0.0, "grad_norm": 0.05221366882324219, "kl": 0.0008720298501430079, "learning_rate": 9.934691987031032e-07, "loss": -0.0714, "num_tokens": 19544920.0, "reward": 0.9230508804321289, "reward_std": 0.13765078783035278, "rewards/reward_func/mean": 0.9230508804321289, "rewards/reward_func/std": 0.13765080273151398, "step": 706, "step_time": 28.587394293397665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 139.0625, "completions/mean_terminated_length": 139.0625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.28025446459650993, "epoch": 0.03274664196387216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015409340849146247, "kl": 0.0013668817409779876, "learning_rate": 9.934599351551643e-07, "loss": 0.0001, "num_tokens": 19573241.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 707, "step_time": 18.28684702515602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 156.3125, "completions/mean_terminated_length": 156.3125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2281733974814415, "epoch": 0.032792959703566466, "frac_reward_zero_std": 0.0, "grad_norm": 0.09042426943778992, "kl": 0.0009373894135933369, "learning_rate": 9.934506716072254e-07, "loss": 0.011, "num_tokens": 19593982.0, "reward": 0.4834691882133484, "reward_std": 0.19668100774288177, "rewards/reward_func/mean": 0.4834691882133484, "rewards/reward_func/std": 0.19668102264404297, "step": 708, "step_time": 18.26748325675726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.2592911906540394, "epoch": 0.03283927744326077, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005293122376315296, "kl": 0.0008192450914066285, "learning_rate": 9.934414080592868e-07, "loss": 0.0, "num_tokens": 19621368.0, "reward": 0.7788007855415344, "reward_std": 0.0, "rewards/reward_func/mean": 0.7788007855415344, "rewards/reward_func/std": 0.0, "step": 709, "step_time": 22.77542806044221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 193.25, "completions/mean_terminated_length": 193.25, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.4455046057701111, "epoch": 0.03288559518295507, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007364381453953683, "kl": 0.0015260854561347514, "learning_rate": 9.934321445113479e-07, "loss": 0.0001, "num_tokens": 19676332.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 710, "step_time": 28.35394797474146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 159.0625, "completions/mean_terminated_length": 159.0625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.3817960396409035, "epoch": 0.032931912922649374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005524643347598612, "kl": 0.001286326936678961, "learning_rate": 9.93422880963409e-07, "loss": 0.0001, "num_tokens": 19708141.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 711, "step_time": 19.659467611461878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 171.1875, "completions/mean_terminated_length": 171.1875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.21849926188588142, "epoch": 0.03297823066234368, "frac_reward_zero_std": 0.0, "grad_norm": 0.18764466047286987, "kl": 0.001313518121605739, "learning_rate": 9.934136174154701e-07, "loss": -0.0318, "num_tokens": 19729360.0, "reward": 0.2936846613883972, "reward_std": 0.18835076689720154, "rewards/reward_func/mean": 0.2936846613883972, "rewards/reward_func/std": 0.18835076689720154, "step": 712, "step_time": 18.324943736195564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 137.5625, "completions/mean_terminated_length": 137.5625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.24218040704727173, "epoch": 0.03302454840203798, "frac_reward_zero_std": 1.0, "grad_norm": 0.00061118631856516, "kl": 0.0008218033908633515, "learning_rate": 9.934043538675313e-07, "loss": 0.0, "num_tokens": 19753625.0, "reward": 0.35782673954963684, "reward_std": 0.0, "rewards/reward_func/mean": 0.35782673954963684, "rewards/reward_func/std": 0.0, "step": 713, "step_time": 16.36056460440159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 131.75, "completions/mean_terminated_length": 131.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2659558728337288, "epoch": 0.03307086614173228, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010865560034289956, "kl": 0.0011735920270439237, "learning_rate": 9.933950903195924e-07, "loss": 0.0001, "num_tokens": 19774773.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 714, "step_time": 15.701992142945528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 224.625, "completions/mean_terminated_length": 224.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.3677009716629982, "epoch": 0.033117183881426586, "frac_reward_zero_std": 0.0, "grad_norm": 0.05988532304763794, "kl": 0.0014401644875761122, "learning_rate": 9.933858267716535e-07, "loss": -0.0327, "num_tokens": 19797503.0, "reward": 0.6246673464775085, "reward_std": 0.461967796087265, "rewards/reward_func/mean": 0.6246673464775085, "rewards/reward_func/std": 0.4619678258895874, "step": 715, "step_time": 26.359856016933918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 137.1875, "completions/mean_terminated_length": 137.1875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3047793358564377, "epoch": 0.03316350162112089, "frac_reward_zero_std": 1.0, "grad_norm": 0.002679202938452363, "kl": 0.0018670128483790904, "learning_rate": 9.933765632237146e-07, "loss": 0.0001, "num_tokens": 19828530.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 716, "step_time": 17.258633948862553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 133.75, "completions/mean_terminated_length": 133.75, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.262413639575243, "epoch": 0.03320981936081519, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006397275719791651, "kl": 0.001206342043587938, "learning_rate": 9.933672996757758e-07, "loss": 0.0001, "num_tokens": 19849502.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 717, "step_time": 14.940201926976442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 194.3125, "completions/mean_terminated_length": 194.3125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.2365681529045105, "epoch": 0.033256137100509495, "frac_reward_zero_std": 0.0, "grad_norm": 0.0802539587020874, "kl": 0.0008338545303558931, "learning_rate": 9.933580361278369e-07, "loss": -0.0055, "num_tokens": 19887619.0, "reward": 0.9095170497894287, "reward_std": 0.24253785610198975, "rewards/reward_func/mean": 0.9095170497894287, "rewards/reward_func/std": 0.24253787100315094, "step": 718, "step_time": 22.915946260094643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 165.3125, "completions/mean_terminated_length": 165.3125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.31308742612600327, "epoch": 0.0333024548402038, "frac_reward_zero_std": 0.0, "grad_norm": 0.07160071283578873, "kl": 0.0009658441704232246, "learning_rate": 9.93348772579898e-07, "loss": -0.0388, "num_tokens": 19908424.0, "reward": 0.0625, "reward_std": 0.25, "rewards/reward_func/mean": 0.0625, "rewards/reward_func/std": 0.25, "step": 719, "step_time": 17.067168951034546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 172.1875, "completions/mean_terminated_length": 172.1875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3278944119811058, "epoch": 0.0333487725798981, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012633440783247352, "kl": 0.0014585844764951617, "learning_rate": 9.933395090319591e-07, "loss": 0.0001, "num_tokens": 19945291.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 720, "step_time": 20.91337340325117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 206.125, "completions/mean_terminated_length": 206.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3783861994743347, "epoch": 0.0333950903195924, "frac_reward_zero_std": 0.0, "grad_norm": 0.07267819344997406, "kl": 0.001166014961199835, "learning_rate": 9.933302454840203e-07, "loss": -0.1424, "num_tokens": 19966749.0, "reward": 0.4347226917743683, "reward_std": 0.5092058777809143, "rewards/reward_func/mean": 0.4347226917743683, "rewards/reward_func/std": 0.5092059373855591, "step": 721, "step_time": 24.837207660079002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 120.75, "completions/mean_terminated_length": 120.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.270227313041687, "epoch": 0.033441408059286706, "frac_reward_zero_std": 1.0, "grad_norm": 0.00099515775218606, "kl": 0.0011917454103240743, "learning_rate": 9.933209819360816e-07, "loss": 0.0001, "num_tokens": 19989625.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 722, "step_time": 16.01493902504444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 183.0, "completions/mean_terminated_length": 183.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3660736382007599, "epoch": 0.03348772579898101, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013438266469165683, "kl": 0.0015953602851368487, "learning_rate": 9.933117183881427e-07, "loss": 0.0001, "num_tokens": 20012057.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 723, "step_time": 18.63400572165847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 162.3125, "completions/mean_terminated_length": 162.3125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.34091276675462723, "epoch": 0.03353404353867531, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006663731182925403, "kl": 0.0012835308152716607, "learning_rate": 9.933024548402038e-07, "loss": 0.0001, "num_tokens": 20039854.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 724, "step_time": 19.417426977306604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 160.3125, "completions/mean_terminated_length": 160.3125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.40660395473241806, "epoch": 0.033580361278369615, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007975733024068177, "kl": 0.0016166368732228875, "learning_rate": 9.932931912922648e-07, "loss": 0.0001, "num_tokens": 20097651.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 725, "step_time": 26.35659772530198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 163.625, "completions/mean_terminated_length": 163.625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.3349655866622925, "epoch": 0.03362667901806392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015858388505876064, "kl": 0.0015882920124568045, "learning_rate": 9.93283927744326e-07, "loss": 0.0001, "num_tokens": 20146989.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 726, "step_time": 23.59270754829049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 174.25, "completions/mean_terminated_length": 174.25, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.37874750047922134, "epoch": 0.03367299675775822, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009076311835087836, "kl": 0.0014788406842853874, "learning_rate": 9.932746641963872e-07, "loss": 0.0001, "num_tokens": 20182705.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 727, "step_time": 22.591049123555422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 186.8125, "completions/mean_terminated_length": 186.8125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.2066696584224701, "epoch": 0.033719314497452524, "frac_reward_zero_std": 0.0, "grad_norm": 0.08451955765485764, "kl": 0.0009963087213691324, "learning_rate": 9.932654006484483e-07, "loss": 0.0859, "num_tokens": 20216014.0, "reward": 0.5573630332946777, "reward_std": 0.14863014221191406, "rewards/reward_func/mean": 0.5573630332946777, "rewards/reward_func/std": 0.14863014221191406, "step": 728, "step_time": 23.8648879006505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 137.1875, "completions/mean_terminated_length": 137.1875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3334541544318199, "epoch": 0.03376563223714683, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012491237139329314, "kl": 0.0016305186436511576, "learning_rate": 9.932561371005095e-07, "loss": 0.0001, "num_tokens": 20251985.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 729, "step_time": 18.136456787586212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 130.75, "completions/mean_terminated_length": 130.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.29052938520908356, "epoch": 0.03381194997684113, "frac_reward_zero_std": 1.0, "grad_norm": 0.001193807809613645, "kl": 0.0013317964621819556, "learning_rate": 9.932468735525706e-07, "loss": 0.0001, "num_tokens": 20272509.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 730, "step_time": 14.66816596314311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 157.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3935469910502434, "epoch": 0.03385826771653543, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006336777005344629, "kl": 0.0011270699906162918, "learning_rate": 9.932376100046317e-07, "loss": 0.0001, "num_tokens": 20299671.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 731, "step_time": 17.40089276432991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 164.375, "completions/mean_terminated_length": 164.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.3299044221639633, "epoch": 0.033904585456229736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006873649545013905, "kl": 0.001305067795328796, "learning_rate": 9.932283464566928e-07, "loss": 0.0001, "num_tokens": 20322429.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 732, "step_time": 18.563721273094416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 186.6875, "completions/mean_terminated_length": 186.6875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.371376670897007, "epoch": 0.03395090319592404, "frac_reward_zero_std": 0.0, "grad_norm": 0.08148089051246643, "kl": 0.0010322736197849736, "learning_rate": 9.93219082908754e-07, "loss": -0.038, "num_tokens": 20342808.0, "reward": 0.375, "reward_std": 0.5, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5, "step": 733, "step_time": 21.671961937099695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 131.5, "completions/mean_terminated_length": 131.5, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.22905239090323448, "epoch": 0.03399722093561834, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014935294166207314, "kl": 0.001308032893575728, "learning_rate": 9.93209819360815e-07, "loss": 0.0001, "num_tokens": 20362368.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 734, "step_time": 13.496449582278728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 198.875, "completions/mean_terminated_length": 198.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.2636433206498623, "epoch": 0.034043538675312644, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005140812718309462, "kl": 0.0007544051186414436, "learning_rate": 9.932005558128762e-07, "loss": 0.0, "num_tokens": 20399006.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 735, "step_time": 22.731944765895605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 208.625, "completions/mean_terminated_length": 208.625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.16582611203193665, "epoch": 0.03408985641500695, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007200704421848059, "kl": 0.0008659442391945049, "learning_rate": 9.931912922649375e-07, "loss": 0.0, "num_tokens": 20423320.0, "reward": 0.46831193566322327, "reward_std": 0.0, "rewards/reward_func/mean": 0.46831193566322327, "rewards/reward_func/std": 0.0, "step": 736, "step_time": 20.08252888917923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 164.25, "completions/mean_terminated_length": 164.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3942590281367302, "epoch": 0.03413617415470125, "frac_reward_zero_std": 1.0, "grad_norm": 0.002515591913834214, "kl": 0.002140891447197646, "learning_rate": 9.931820287169985e-07, "loss": 0.0001, "num_tokens": 20453596.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 737, "step_time": 20.763261321932077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 181.875, "completions/mean_terminated_length": 181.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.3159395232796669, "epoch": 0.03418249189439555, "frac_reward_zero_std": 0.0, "grad_norm": 0.1018763855099678, "kl": 0.002017256512772292, "learning_rate": 9.931727651690596e-07, "loss": -0.0771, "num_tokens": 20478202.0, "reward": 0.3125, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.3125, "rewards/reward_func/std": 0.4787135720252991, "step": 738, "step_time": 19.832482635974884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 118.375, "completions/mean_terminated_length": 118.375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.23701602593064308, "epoch": 0.034228809634089856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008975856471806765, "kl": 0.0010572972532827407, "learning_rate": 9.93163501621121e-07, "loss": 0.0001, "num_tokens": 20498528.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 739, "step_time": 13.707400850951672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 196.8125, "completions/mean_terminated_length": 196.8125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3240229934453964, "epoch": 0.03427512737378416, "frac_reward_zero_std": 0.0, "grad_norm": 0.07890082150697708, "kl": 0.0016898235189728439, "learning_rate": 9.93154238073182e-07, "loss": -0.0403, "num_tokens": 20521229.0, "reward": 0.3125, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.3125, "rewards/reward_func/std": 0.4787135720252991, "step": 740, "step_time": 22.117990609258413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 152.5625, "completions/mean_terminated_length": 152.5625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.36976949870586395, "epoch": 0.03432144511347846, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006292248726822436, "kl": 0.001158057653810829, "learning_rate": 9.931449745252432e-07, "loss": 0.0001, "num_tokens": 20552102.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 741, "step_time": 18.70715820044279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 227.375, "completions/mean_terminated_length": 227.375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.3275139331817627, "epoch": 0.034367762853172765, "frac_reward_zero_std": 0.0, "grad_norm": 0.06887755542993546, "kl": 0.001257510157302022, "learning_rate": 9.931357109773043e-07, "loss": -0.2157, "num_tokens": 20579036.0, "reward": 0.2892181873321533, "reward_std": 0.4444383680820465, "rewards/reward_func/mean": 0.2892181873321533, "rewards/reward_func/std": 0.4444383978843689, "step": 742, "step_time": 27.33681583032012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 184.1875, "completions/mean_terminated_length": 184.1875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.4183108061552048, "epoch": 0.03441408059286707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011316396994516253, "kl": 0.001560896256705746, "learning_rate": 9.931264474293654e-07, "loss": 0.0001, "num_tokens": 20608591.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 743, "step_time": 22.03494720160961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 189.1875, "completions/mean_terminated_length": 189.1875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.3440265357494354, "epoch": 0.03446039833256137, "frac_reward_zero_std": 0.0, "grad_norm": 0.11852071434259415, "kl": 0.0022669222380500287, "learning_rate": 9.931171838814265e-07, "loss": -0.0199, "num_tokens": 20633218.0, "reward": 0.8482850790023804, "reward_std": 0.2262093424797058, "rewards/reward_func/mean": 0.8482850790023804, "rewards/reward_func/std": 0.226209357380867, "step": 744, "step_time": 24.622589204460382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 223.8125, "completions/mean_terminated_length": 223.8125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.3200981020927429, "epoch": 0.03450671607225567, "frac_reward_zero_std": 0.0, "grad_norm": 0.05692924186587334, "kl": 0.0008040217508096248, "learning_rate": 9.931079203334877e-07, "loss": -0.0969, "num_tokens": 20659615.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.44721361994743347, "step": 745, "step_time": 26.291905768215656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 164.3125, "completions/mean_terminated_length": 164.3125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.1850629523396492, "epoch": 0.034553033811949976, "frac_reward_zero_std": 0.0, "grad_norm": 0.07412910461425781, "kl": 0.0010273780062561855, "learning_rate": 9.930986567855488e-07, "loss": -0.0354, "num_tokens": 20681252.0, "reward": 0.8714442253112793, "reward_std": 0.1666259467601776, "rewards/reward_func/mean": 0.8714442253112793, "rewards/reward_func/std": 0.1666259467601776, "step": 746, "step_time": 16.422810439020395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.424900121986866, "epoch": 0.03459935155164428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016889924881979823, "kl": 0.0015171955456025898, "learning_rate": 9.9308939323761e-07, "loss": 0.0001, "num_tokens": 20712300.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 747, "step_time": 18.411602519452572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 160.0625, "completions/mean_terminated_length": 160.0625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.19160006195306778, "epoch": 0.03464566929133858, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006853294325992465, "kl": 0.0007444946531904861, "learning_rate": 9.93080129689671e-07, "loss": 0.0, "num_tokens": 20744205.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 748, "step_time": 18.91827342286706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 111.5, "completions/mean_terminated_length": 111.5, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.33665014058351517, "epoch": 0.034691987031032885, "frac_reward_zero_std": 1.0, "grad_norm": 0.003475069534033537, "kl": 0.0019420332391746342, "learning_rate": 9.930708661417322e-07, "loss": 0.0001, "num_tokens": 20764709.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 749, "step_time": 14.929662246257067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 114.75, "completions/mean_terminated_length": 114.75, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.30779921263456345, "epoch": 0.03473830477072719, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016567701241001487, "kl": 0.0011813818127848208, "learning_rate": 9.930616025937933e-07, "loss": 0.0001, "num_tokens": 20784625.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 750, "step_time": 13.047545105218887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 149.875, "completions/mean_terminated_length": 149.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.30832144618034363, "epoch": 0.03478462251042149, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010846181539818645, "kl": 0.0011726339143933728, "learning_rate": 9.930523390458544e-07, "loss": 0.0001, "num_tokens": 20807743.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 751, "step_time": 18.087368704378605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 179.625, "completions/mean_terminated_length": 179.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.2382570020854473, "epoch": 0.034830940250115794, "frac_reward_zero_std": 0.0, "grad_norm": 0.17337356507778168, "kl": 0.0009417200344614685, "learning_rate": 9.930430754979158e-07, "loss": -0.0338, "num_tokens": 20842793.0, "reward": 0.1939225196838379, "reward_std": 0.11004070192575455, "rewards/reward_func/mean": 0.1939225196838379, "rewards/reward_func/std": 0.11004070192575455, "step": 752, "step_time": 23.629357885569334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 176.875, "completions/mean_terminated_length": 176.875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.2175726406276226, "epoch": 0.0348772579898101, "frac_reward_zero_std": 0.0, "grad_norm": 0.16890059411525726, "kl": 0.0011022739636246115, "learning_rate": 9.930338119499769e-07, "loss": -0.0042, "num_tokens": 20869639.0, "reward": 0.21134749054908752, "reward_std": 0.023851996287703514, "rewards/reward_func/mean": 0.21134749054908752, "rewards/reward_func/std": 0.023851994425058365, "step": 753, "step_time": 18.526702269911766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 159.9375, "completions/mean_terminated_length": 159.9375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.43599508702754974, "epoch": 0.0349235757295044, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010902105132117867, "kl": 0.0014562978176400065, "learning_rate": 9.93024548402038e-07, "loss": 0.0001, "num_tokens": 20915062.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 754, "step_time": 23.182676058262587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 213.625, "completions/mean_terminated_length": 213.625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.31797099113464355, "epoch": 0.0349698934691987, "frac_reward_zero_std": 0.0, "grad_norm": 0.0610932894051075, "kl": 0.0012669971329160035, "learning_rate": 9.93015284854099e-07, "loss": 0.0008, "num_tokens": 20946928.0, "reward": 0.7376224994659424, "reward_std": 0.3595956563949585, "rewards/reward_func/mean": 0.7376224994659424, "rewards/reward_func/std": 0.3595956563949585, "step": 755, "step_time": 23.928455755114555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 174.875, "completions/mean_terminated_length": 174.875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4484062194824219, "epoch": 0.035016211208893006, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010049432748928666, "kl": 0.0016918782494030893, "learning_rate": 9.930060213061603e-07, "loss": 0.0001, "num_tokens": 20970862.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 756, "step_time": 18.918332800269127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 133.1875, "completions/mean_terminated_length": 133.1875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.13485882431268692, "epoch": 0.03506252894858731, "frac_reward_zero_std": 0.0, "grad_norm": 0.10510154068470001, "kl": 0.0009273640462197363, "learning_rate": 9.929967577582214e-07, "loss": -0.0104, "num_tokens": 20991297.0, "reward": 0.11500650644302368, "reward_std": 0.31425464153289795, "rewards/reward_func/mean": 0.11500650644302368, "rewards/reward_func/std": 0.31425464153289795, "step": 757, "step_time": 14.219180513173342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 142.125, "completions/mean_terminated_length": 142.125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.21773843094706535, "epoch": 0.03510884668828161, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007945893448777497, "kl": 0.0009267694258596748, "learning_rate": 9.929874942102825e-07, "loss": 0.0, "num_tokens": 21013027.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 758, "step_time": 15.522979341447353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 134.0, "completions/mean_terminated_length": 134.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.338830903172493, "epoch": 0.035155164427975914, "frac_reward_zero_std": 1.0, "grad_norm": 0.001139418687671423, "kl": 0.0012303398980293423, "learning_rate": 9.929782306623436e-07, "loss": 0.0001, "num_tokens": 21033395.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 759, "step_time": 14.582605965435505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 142.6875, "completions/mean_terminated_length": 142.6875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.32310692965984344, "epoch": 0.03520148216767022, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023779685143381357, "kl": 0.0016642717528156936, "learning_rate": 9.929689671144048e-07, "loss": 0.0001, "num_tokens": 21053390.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 760, "step_time": 17.098231252282858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 167.1875, "completions/mean_terminated_length": 167.1875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.41315774619579315, "epoch": 0.03524779990736452, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005841231904923916, "kl": 0.0013927049294579774, "learning_rate": 9.929597035664659e-07, "loss": 0.0001, "num_tokens": 21087617.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 761, "step_time": 22.515921484678984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 175.4375, "completions/mean_terminated_length": 175.4375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.21197232976555824, "epoch": 0.03529411764705882, "frac_reward_zero_std": 0.0, "grad_norm": 0.06189983710646629, "kl": 0.0008826443227007985, "learning_rate": 9.92950440018527e-07, "loss": -0.0124, "num_tokens": 21109352.0, "reward": 0.59250807762146, "reward_std": 0.15573769807815552, "rewards/reward_func/mean": 0.59250807762146, "rewards/reward_func/std": 0.15573768317699432, "step": 762, "step_time": 18.795867145061493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 159.1875, "completions/mean_terminated_length": 159.1875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.33611395210027695, "epoch": 0.035340435386753126, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012660189531743526, "kl": 0.0015209891716949642, "learning_rate": 9.929411764705881e-07, "loss": 0.0001, "num_tokens": 21146523.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 763, "step_time": 20.935311947017908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.21683895960450172, "epoch": 0.03538675312644743, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012084031477570534, "kl": 0.001167740294476971, "learning_rate": 9.929319129226493e-07, "loss": 0.0001, "num_tokens": 21168275.0, "reward": 0.9487294554710388, "reward_std": 0.0, "rewards/reward_func/mean": 0.9487294554710388, "rewards/reward_func/std": 0.0, "step": 764, "step_time": 18.577101062983274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 128.875, "completions/mean_terminated_length": 128.875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.29378584772348404, "epoch": 0.03543307086614173, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018863253062590957, "kl": 0.0017148111073765904, "learning_rate": 9.929226493747104e-07, "loss": 0.0001, "num_tokens": 21190977.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 765, "step_time": 14.933240331709385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 141.9375, "completions/mean_terminated_length": 141.9375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3214898705482483, "epoch": 0.035479388605836035, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006536695291288197, "kl": 0.0010365339112468064, "learning_rate": 9.929133858267717e-07, "loss": 0.0001, "num_tokens": 21212544.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 766, "step_time": 15.943621318787336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 244.8125, "completions/mean_terminated_length": 244.8125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.28746990859508514, "epoch": 0.03552570634553034, "frac_reward_zero_std": 0.0, "grad_norm": 0.061868999153375626, "kl": 0.0009608932887203991, "learning_rate": 9.929041222788328e-07, "loss": -0.022, "num_tokens": 21249981.0, "reward": 0.7940504550933838, "reward_std": 0.07485879957675934, "rewards/reward_func/mean": 0.7940504550933838, "rewards/reward_func/std": 0.07485879212617874, "step": 767, "step_time": 26.725659370422363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 125.6875, "completions/mean_terminated_length": 125.6875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.27554355934262276, "epoch": 0.03557202408522464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008337905164808035, "kl": 0.0012617142347153276, "learning_rate": 9.928948587308938e-07, "loss": 0.0001, "num_tokens": 21270920.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 768, "step_time": 13.756904244422913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 156.875, "completions/mean_terminated_length": 156.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.39798150956630707, "epoch": 0.03561834182491894, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011346646351739764, "kl": 0.0016589273873250932, "learning_rate": 9.92885595182955e-07, "loss": 0.0001, "num_tokens": 21305014.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 769, "step_time": 20.33734503760934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 140.5, "completions/mean_terminated_length": 140.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3338210806250572, "epoch": 0.035664659564613246, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009008368942886591, "kl": 0.0012040752044413239, "learning_rate": 9.928763316350162e-07, "loss": 0.0001, "num_tokens": 21331598.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 770, "step_time": 16.965304989367723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 109.1875, "completions/mean_terminated_length": 109.1875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.28656987100839615, "epoch": 0.03571097730430755, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009368361788801849, "kl": 0.00133035026374273, "learning_rate": 9.928670680870773e-07, "loss": 0.0001, "num_tokens": 21352081.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 771, "step_time": 12.301237791776657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 127.3125, "completions/mean_terminated_length": 127.3125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2899698242545128, "epoch": 0.03575729504400185, "frac_reward_zero_std": 1.0, "grad_norm": 0.001047339290380478, "kl": 0.0013316216936800629, "learning_rate": 9.928578045391385e-07, "loss": 0.0001, "num_tokens": 21374518.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 772, "step_time": 14.309594821184874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 201.125, "completions/mean_terminated_length": 201.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.3939480558037758, "epoch": 0.035803612783696155, "frac_reward_zero_std": 1.0, "grad_norm": 0.00047841350897215307, "kl": 0.0012557295995065942, "learning_rate": 9.928485409911996e-07, "loss": 0.0001, "num_tokens": 21402280.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 773, "step_time": 21.371572624891996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 195.25, "completions/mean_terminated_length": 195.25, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.3412705659866333, "epoch": 0.03584993052339046, "frac_reward_zero_std": 0.0, "grad_norm": 0.07474339753389359, "kl": 0.0010987457353621721, "learning_rate": 9.928392774432607e-07, "loss": -0.0085, "num_tokens": 21439276.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 774, "step_time": 24.488349922001362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 140.625, "completions/mean_terminated_length": 140.625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3591512441635132, "epoch": 0.03589624826308476, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027559211011976004, "kl": 0.0017321380146313459, "learning_rate": 9.928300138953218e-07, "loss": 0.0001, "num_tokens": 21472214.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 775, "step_time": 18.36331943050027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.30273303389549255, "epoch": 0.035942566002779064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005927165038883686, "kl": 0.0011400552175473422, "learning_rate": 9.92820750347383e-07, "loss": 0.0001, "num_tokens": 21494346.0, "reward": 0.47266900539398193, "reward_std": 0.0, "rewards/reward_func/mean": 0.47266900539398193, "rewards/reward_func/std": 0.0, "step": 776, "step_time": 16.990239322185516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 167.9375, "completions/mean_terminated_length": 167.9375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.1891249492764473, "epoch": 0.03598888374247337, "frac_reward_zero_std": 0.0, "grad_norm": 0.09204047173261642, "kl": 0.000799347908468917, "learning_rate": 9.92811486799444e-07, "loss": -0.0961, "num_tokens": 21527049.0, "reward": 0.6444321870803833, "reward_std": 0.3238654136657715, "rewards/reward_func/mean": 0.6444321870803833, "rewards/reward_func/std": 0.3238654136657715, "step": 777, "step_time": 21.485658913850784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 118.375, "completions/mean_terminated_length": 118.375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.22776512056589127, "epoch": 0.03603520148216767, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016181276878342032, "kl": 0.0009063722391147166, "learning_rate": 9.928022232515052e-07, "loss": 0.0, "num_tokens": 21548783.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 778, "step_time": 13.551093552261591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 170.625, "completions/mean_terminated_length": 170.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.1654873713850975, "epoch": 0.03608151922186197, "frac_reward_zero_std": 0.0, "grad_norm": 0.07232765853404999, "kl": 0.0006980880134506151, "learning_rate": 9.927929597035666e-07, "loss": -0.1047, "num_tokens": 21570921.0, "reward": 0.33508291840553284, "reward_std": 0.3896864056587219, "rewards/reward_func/mean": 0.33508291840553284, "rewards/reward_func/std": 0.3896864354610443, "step": 779, "step_time": 18.786138746887445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 152.875, "completions/mean_terminated_length": 152.875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.24532657489180565, "epoch": 0.036127836961556276, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022845608182251453, "kl": 0.0017486957367509604, "learning_rate": 9.927836961556275e-07, "loss": 0.0001, "num_tokens": 21600119.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 780, "step_time": 20.80349262431264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 128.875, "completions/mean_terminated_length": 128.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.32380537688732147, "epoch": 0.03617415470125058, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006954511627554893, "kl": 0.0011527634633239359, "learning_rate": 9.927744326076886e-07, "loss": 0.0001, "num_tokens": 21620965.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 781, "step_time": 14.75655872002244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 135.375, "completions/mean_terminated_length": 135.375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.2626483403146267, "epoch": 0.03622047244094488, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008991743670776486, "kl": 0.001040646922774613, "learning_rate": 9.9276516905975e-07, "loss": 0.0001, "num_tokens": 21643931.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 782, "step_time": 15.008624862879515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 148.375, "completions/mean_terminated_length": 148.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.32240891456604004, "epoch": 0.036266790180639184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012835003435611725, "kl": 0.001953296741703525, "learning_rate": 9.92755905511811e-07, "loss": 0.0001, "num_tokens": 21696177.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 783, "step_time": 24.101244494318962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 179.6875, "completions/mean_terminated_length": 179.6875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3580075055360794, "epoch": 0.03631310792033349, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006679447251372039, "kl": 0.0010465997911524028, "learning_rate": 9.927466419638722e-07, "loss": 0.0001, "num_tokens": 21719980.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 784, "step_time": 19.346400436013937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 178.8125, "completions/mean_terminated_length": 178.8125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.43494129180908203, "epoch": 0.03635942566002779, "frac_reward_zero_std": 1.0, "grad_norm": 0.001725780894048512, "kl": 0.0016527536790817976, "learning_rate": 9.927373784159333e-07, "loss": 0.0001, "num_tokens": 21741257.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 785, "step_time": 18.63063418865204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 186.125, "completions/mean_terminated_length": 186.125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.4212736263871193, "epoch": 0.03640574339972209, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013622586848214269, "kl": 0.0018332574982196093, "learning_rate": 9.927281148679944e-07, "loss": 0.0001, "num_tokens": 21770571.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 786, "step_time": 20.721955724060535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 184.25, "completions/mean_terminated_length": 184.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.21823961660265923, "epoch": 0.036452061139416396, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006008459022268653, "kl": 0.0008442414400633425, "learning_rate": 9.927188513200556e-07, "loss": 0.0, "num_tokens": 21821855.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 787, "step_time": 26.682371847331524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 214.6875, "completions/mean_terminated_length": 214.6875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.40522969514131546, "epoch": 0.0364983788791107, "frac_reward_zero_std": 0.0, "grad_norm": 0.07477609068155289, "kl": 0.0014336762833409011, "learning_rate": 9.927095877721167e-07, "loss": -0.0727, "num_tokens": 21847130.0, "reward": 0.018614530563354492, "reward_std": 0.027606507763266563, "rewards/reward_func/mean": 0.018614530563354492, "rewards/reward_func/std": 0.027606507763266563, "step": 788, "step_time": 23.462858349084854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 168.5, "completions/mean_terminated_length": 168.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3644990473985672, "epoch": 0.036544696618805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012292582541704178, "kl": 0.001423132256604731, "learning_rate": 9.927003242241778e-07, "loss": 0.0001, "num_tokens": 21871554.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 789, "step_time": 18.672618698328733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 135.3125, "completions/mean_terminated_length": 135.3125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.2719787135720253, "epoch": 0.036591014358499305, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009575061849318445, "kl": 0.0013380526943365112, "learning_rate": 9.92691060676239e-07, "loss": 0.0001, "num_tokens": 21894455.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 790, "step_time": 14.909899402409792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 110.875, "completions/mean_terminated_length": 110.875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2577039450407028, "epoch": 0.03663733209819361, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015504512703046203, "kl": 0.0015930982190184295, "learning_rate": 9.926817971283e-07, "loss": 0.0001, "num_tokens": 21913701.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 791, "step_time": 12.476087305694818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 117.0, "completions/mean_terminated_length": 117.0, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.26193511486053467, "epoch": 0.03668364983788791, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015716665657237172, "kl": 0.0015727959398645908, "learning_rate": 9.926725335803612e-07, "loss": 0.0001, "num_tokens": 21933701.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 792, "step_time": 12.937334209680557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 179.125, "completions/mean_terminated_length": 179.125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.36753329634666443, "epoch": 0.036729967577582213, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006561006885021925, "kl": 0.0013610675669042394, "learning_rate": 9.926632700324223e-07, "loss": 0.0001, "num_tokens": 21955303.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 793, "step_time": 18.506981823593378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.23263446241617203, "epoch": 0.036776285317276516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011079427786171436, "kl": 0.0013293431256897748, "learning_rate": 9.926540064844834e-07, "loss": 0.0001, "num_tokens": 21974999.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 794, "step_time": 14.711909919977188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 159.8125, "completions/mean_terminated_length": 159.8125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.12268547713756561, "epoch": 0.03682260305697082, "frac_reward_zero_std": 0.0, "grad_norm": 0.06275269389152527, "kl": 0.000531562152900733, "learning_rate": 9.926447429365446e-07, "loss": -0.0501, "num_tokens": 22006564.0, "reward": 0.9305884838104248, "reward_std": 0.018509721383452415, "rewards/reward_func/mean": 0.9305884838104248, "rewards/reward_func/std": 0.01850973069667816, "step": 795, "step_time": 19.20914574339986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 125.4375, "completions/mean_terminated_length": 125.4375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.27159298956394196, "epoch": 0.03686892079666512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009578621829859912, "kl": 0.00117204335401766, "learning_rate": 9.926354793886059e-07, "loss": 0.0001, "num_tokens": 22026539.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 796, "step_time": 14.955506909638643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 199.625, "completions/mean_terminated_length": 199.625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.36915767937898636, "epoch": 0.036915238536359425, "frac_reward_zero_std": 1.0, "grad_norm": 0.000535281898919493, "kl": 0.0009660491195973009, "learning_rate": 9.92626215840667e-07, "loss": 0.0, "num_tokens": 22051605.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 797, "step_time": 20.611786134541035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 142.625, "completions/mean_terminated_length": 142.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.21989630907773972, "epoch": 0.03696155627605373, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008252896368503571, "kl": 0.0009417905821464956, "learning_rate": 9.92616952292728e-07, "loss": 0.0, "num_tokens": 22071487.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 798, "step_time": 14.327708523720503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 152.0625, "completions/mean_terminated_length": 152.0625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.3419434130191803, "epoch": 0.03700787401574803, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008594851242378354, "kl": 0.001281717763049528, "learning_rate": 9.926076887447893e-07, "loss": 0.0001, "num_tokens": 22096880.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 799, "step_time": 17.970007836818695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 196.0, "completions/mean_terminated_length": 196.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.4320857673883438, "epoch": 0.037054191755442334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022666838485747576, "kl": 0.0022287203755695373, "learning_rate": 9.925984251968504e-07, "loss": 0.0001, "num_tokens": 22126368.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 800, "step_time": 24.515904534608126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 142.875, "completions/mean_terminated_length": 142.875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.4241221025586128, "epoch": 0.03710050949513664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011163548333570361, "kl": 0.0019402401230763644, "learning_rate": 9.925891616489115e-07, "loss": 0.0001, "num_tokens": 22156014.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 801, "step_time": 18.589754354208708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 163.6875, "completions/mean_terminated_length": 163.6875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.17512871697545052, "epoch": 0.03714682723483094, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010291461367160082, "kl": 0.0008728425891604275, "learning_rate": 9.925798981009726e-07, "loss": 0.0, "num_tokens": 22176841.0, "reward": 0.7403417825698853, "reward_std": 0.0, "rewards/reward_func/mean": 0.7403417825698853, "rewards/reward_func/std": 0.0, "step": 802, "step_time": 16.448123518377542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 285.6875, "completions/mean_terminated_length": 285.6875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.26425112783908844, "epoch": 0.03719314497452524, "frac_reward_zero_std": 0.0, "grad_norm": 0.04466618597507477, "kl": 0.0007954746979521587, "learning_rate": 9.925706345530338e-07, "loss": -0.0642, "num_tokens": 22205924.0, "reward": 0.5974346399307251, "reward_std": 0.2332146167755127, "rewards/reward_func/mean": 0.5974346399307251, "rewards/reward_func/std": 0.2332146167755127, "step": 803, "step_time": 30.98456759750843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.15964826568961143, "epoch": 0.037239462714219546, "frac_reward_zero_std": 1.0, "grad_norm": 0.001106057083234191, "kl": 0.0009501670720055699, "learning_rate": 9.925613710050949e-07, "loss": 0.0, "num_tokens": 22240180.0, "reward": 0.8702397346496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.8702397346496582, "rewards/reward_func/std": 0.0, "step": 804, "step_time": 21.30280603468418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 208.5625, "completions/mean_terminated_length": 208.5625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.39372440427541733, "epoch": 0.03728578045391385, "frac_reward_zero_std": 0.0, "grad_norm": 0.1070389375090599, "kl": 0.001414378290064633, "learning_rate": 9.92552107457156e-07, "loss": -0.0253, "num_tokens": 22265533.0, "reward": 0.5325166583061218, "reward_std": 0.4859153926372528, "rewards/reward_func/mean": 0.5325166583061218, "rewards/reward_func/std": 0.4859154224395752, "step": 805, "step_time": 22.00924064591527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 151.9375, "completions/mean_terminated_length": 151.9375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.27635348588228226, "epoch": 0.03733209819360815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012643581721931696, "kl": 0.0012706111301667988, "learning_rate": 9.925428439092171e-07, "loss": 0.0001, "num_tokens": 22288812.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 806, "step_time": 17.406505286693573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 133.125, "completions/mean_terminated_length": 133.125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.24264325946569443, "epoch": 0.037378415933302454, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006564650102518499, "kl": 0.0009725986019475386, "learning_rate": 9.925335803612783e-07, "loss": 0.0, "num_tokens": 22308526.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 807, "step_time": 14.06134543940425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 169.8125, "completions/mean_terminated_length": 169.8125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4375000149011612, "epoch": 0.03742473367299676, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009823600994423032, "kl": 0.0012600447225850075, "learning_rate": 9.925243168133394e-07, "loss": 0.0001, "num_tokens": 22341451.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 808, "step_time": 22.87074578180909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 165.6875, "completions/mean_terminated_length": 165.6875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.38376880437135696, "epoch": 0.03747105141269106, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005137338303029537, "kl": 0.0012045193143421784, "learning_rate": 9.925150532654007e-07, "loss": 0.0001, "num_tokens": 22367814.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 809, "step_time": 18.01202069595456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 132.0625, "completions/mean_terminated_length": 132.0625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2751566097140312, "epoch": 0.03751736915238536, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010438364697620273, "kl": 0.0010625261347740889, "learning_rate": 9.925057897174618e-07, "loss": 0.0001, "num_tokens": 22390007.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 810, "step_time": 14.121484663337469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 211.0, "completions/mean_terminated_length": 211.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.2260236032307148, "epoch": 0.037563686892079666, "frac_reward_zero_std": 1.0, "grad_norm": 0.000488622288685292, "kl": 0.0007573103066533804, "learning_rate": 9.924965261695228e-07, "loss": 0.0, "num_tokens": 22427959.0, "reward": 0.9383861422538757, "reward_std": 0.0, "rewards/reward_func/mean": 0.9383861422538757, "rewards/reward_func/std": 0.0, "step": 811, "step_time": 24.568120811134577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 190.75, "completions/mean_terminated_length": 190.75, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.2874022424221039, "epoch": 0.03761000463177397, "frac_reward_zero_std": 0.0, "grad_norm": 0.10655516386032104, "kl": 0.0017227102653123438, "learning_rate": 9.92487262621584e-07, "loss": -0.1044, "num_tokens": 22466147.0, "reward": 0.375, "reward_std": 0.5, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5, "step": 812, "step_time": 29.970730647444725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 130.3125, "completions/mean_terminated_length": 130.3125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.24665077775716782, "epoch": 0.03765632237146827, "frac_reward_zero_std": 1.0, "grad_norm": 0.003576041432097554, "kl": 0.0016069425037130713, "learning_rate": 9.924779990736452e-07, "loss": 0.0001, "num_tokens": 22487624.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 813, "step_time": 14.225050505250692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 117.875, "completions/mean_terminated_length": 117.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2610808089375496, "epoch": 0.037702640111162575, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008053500787355006, "kl": 0.0011654336849460378, "learning_rate": 9.924687355257063e-07, "loss": 0.0001, "num_tokens": 22508454.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 814, "step_time": 13.55789552256465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 124.6875, "completions/mean_terminated_length": 124.6875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2975633218884468, "epoch": 0.03774895785085688, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008555660024285316, "kl": 0.0011338264012010768, "learning_rate": 9.924594719777675e-07, "loss": 0.0001, "num_tokens": 22528577.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 815, "step_time": 13.34862768650055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 160.1875, "completions/mean_terminated_length": 160.1875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.42140527814626694, "epoch": 0.03779527559055118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007337057613767684, "kl": 0.0013057987089268863, "learning_rate": 9.924502084298286e-07, "loss": 0.0001, "num_tokens": 22557012.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 816, "step_time": 20.276419568806887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 164.1875, "completions/mean_terminated_length": 164.1875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.2034543715417385, "epoch": 0.037841593330245483, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007220017723739147, "kl": 0.000908565940335393, "learning_rate": 9.924409448818897e-07, "loss": 0.0, "num_tokens": 22589927.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 817, "step_time": 19.533989932388067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 191.625, "completions/mean_terminated_length": 191.625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.32018114626407623, "epoch": 0.037887911069939786, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006020690780133009, "kl": 0.0010532069572946057, "learning_rate": 9.924316813339508e-07, "loss": 0.0001, "num_tokens": 22617281.0, "reward": 0.020545542240142822, "reward_std": 0.0, "rewards/reward_func/mean": 0.020545542240142822, "rewards/reward_func/std": 0.0, "step": 818, "step_time": 20.884681150317192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 134.75, "completions/mean_terminated_length": 134.75, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3311654031276703, "epoch": 0.03793422880963409, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013042479986324906, "kl": 0.0016282762517221272, "learning_rate": 9.92422417786012e-07, "loss": 0.0001, "num_tokens": 22638445.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 819, "step_time": 15.344467476010323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 138.1875, "completions/mean_terminated_length": 138.1875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.28598906099796295, "epoch": 0.03798054654932839, "frac_reward_zero_std": 1.0, "grad_norm": 0.000775039370637387, "kl": 0.0010381936153862625, "learning_rate": 9.92413154238073e-07, "loss": 0.0001, "num_tokens": 22660192.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 820, "step_time": 15.39529787749052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 122.1875, "completions/mean_terminated_length": 122.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3442473039031029, "epoch": 0.038026864289022695, "frac_reward_zero_std": 1.0, "grad_norm": 0.001066489377990365, "kl": 0.0014191086229402572, "learning_rate": 9.924038906901342e-07, "loss": 0.0001, "num_tokens": 22682227.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 821, "step_time": 14.344931341707706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 167.0, "completions/mean_terminated_length": 167.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.3186531886458397, "epoch": 0.038073182028717, "frac_reward_zero_std": 1.0, "grad_norm": 0.001553441397845745, "kl": 0.001338458008831367, "learning_rate": 9.923946271421956e-07, "loss": 0.0001, "num_tokens": 22705123.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 822, "step_time": 19.846717324107885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 196.875, "completions/mean_terminated_length": 196.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.371276393532753, "epoch": 0.0381194997684113, "frac_reward_zero_std": 0.0, "grad_norm": 0.06931908428668976, "kl": 0.0010701469436753541, "learning_rate": 9.923853635942565e-07, "loss": -0.0327, "num_tokens": 22726433.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.3415650427341461, "step": 823, "step_time": 22.185349114239216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 151.25, "completions/mean_terminated_length": 151.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.41176582127809525, "epoch": 0.038165817508105604, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006509088561870158, "kl": 0.0011380686919437721, "learning_rate": 9.923761000463176e-07, "loss": 0.0001, "num_tokens": 22751893.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 824, "step_time": 16.74121941626072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 195.3125, "completions/mean_terminated_length": 195.3125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4078410267829895, "epoch": 0.03821213524779991, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006997660966590047, "kl": 0.0013196228246670216, "learning_rate": 9.923668364983787e-07, "loss": 0.0001, "num_tokens": 22773690.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 825, "step_time": 23.617992267012596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 211.3125, "completions/mean_terminated_length": 211.3125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.4072626978158951, "epoch": 0.03825845298749421, "frac_reward_zero_std": 0.0, "grad_norm": 0.08284235745668411, "kl": 0.0013661879929713905, "learning_rate": 9.9235757295044e-07, "loss": 0.0624, "num_tokens": 22799215.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 826, "step_time": 23.910189773887396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 128.125, "completions/mean_terminated_length": 128.125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.24906330555677414, "epoch": 0.03830477072718851, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009547322406433523, "kl": 0.0009482608147663996, "learning_rate": 9.923483094025012e-07, "loss": 0.0, "num_tokens": 22818913.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 827, "step_time": 14.135732557624578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 110.5, "completions/mean_terminated_length": 110.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.23942997679114342, "epoch": 0.038351088466882816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010104505345225334, "kl": 0.0011649180960375816, "learning_rate": 9.923390458545623e-07, "loss": 0.0001, "num_tokens": 22838713.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 828, "step_time": 13.335768409073353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 148.8125, "completions/mean_terminated_length": 148.8125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3456391915678978, "epoch": 0.03839740620657712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007521145744249225, "kl": 0.001152332144556567, "learning_rate": 9.923297823066234e-07, "loss": 0.0001, "num_tokens": 22863366.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 829, "step_time": 16.91843691468239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 206.375, "completions/mean_terminated_length": 206.375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.23295189067721367, "epoch": 0.03844372394627142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005820674123242497, "kl": 0.0009615173185011372, "learning_rate": 9.923205187586846e-07, "loss": 0.0, "num_tokens": 22887772.0, "reward": 0.4008028209209442, "reward_std": 0.0, "rewards/reward_func/mean": 0.4008028209209442, "rewards/reward_func/std": 0.0, "step": 830, "step_time": 20.30594377592206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 185.125, "completions/mean_terminated_length": 185.125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.32865454256534576, "epoch": 0.038490041685965724, "frac_reward_zero_std": 0.0, "grad_norm": 0.06871698051691055, "kl": 0.0013664969301316887, "learning_rate": 9.923112552107457e-07, "loss": -0.051, "num_tokens": 22918766.0, "reward": 0.019764235243201256, "reward_std": 0.07905694842338562, "rewards/reward_func/mean": 0.019764235243201256, "rewards/reward_func/std": 0.07905694097280502, "step": 831, "step_time": 21.14786373078823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 136.0625, "completions/mean_terminated_length": 136.0625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.34320181608200073, "epoch": 0.03853635942566003, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007403282797895372, "kl": 0.001321841060416773, "learning_rate": 9.923019916628068e-07, "loss": 0.0001, "num_tokens": 22939375.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 832, "step_time": 15.485927652567625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 183.25, "completions/mean_terminated_length": 183.25, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.40896496176719666, "epoch": 0.03858267716535433, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006073734839446843, "kl": 0.001216251141158864, "learning_rate": 9.92292728114868e-07, "loss": 0.0001, "num_tokens": 22967875.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 833, "step_time": 22.568044397979975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 188.25, "completions/mean_terminated_length": 188.25, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.36554165929555893, "epoch": 0.03862899490504863, "frac_reward_zero_std": 1.0, "grad_norm": 0.001040755887515843, "kl": 0.0014154599339235574, "learning_rate": 9.92283464566929e-07, "loss": 0.0001, "num_tokens": 22991287.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 834, "step_time": 18.86409217491746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 130.75, "completions/mean_terminated_length": 130.75, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.29438622295856476, "epoch": 0.038675312644742936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009892889065667987, "kl": 0.0012028044293401763, "learning_rate": 9.922742010189902e-07, "loss": 0.0001, "num_tokens": 23016003.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 835, "step_time": 15.510872017592192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 137.25, "completions/mean_terminated_length": 137.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3107244223356247, "epoch": 0.03872163038443724, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019122130470350385, "kl": 0.0014791909197811037, "learning_rate": 9.922649374710513e-07, "loss": 0.0001, "num_tokens": 23051767.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 836, "step_time": 18.339373033493757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 254.5625, "completions/mean_terminated_length": 254.5625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.3048095479607582, "epoch": 0.03876794812413154, "frac_reward_zero_std": 0.0, "grad_norm": 0.06858941167593002, "kl": 0.0011206775961909443, "learning_rate": 9.922556739231124e-07, "loss": 0.0549, "num_tokens": 23082720.0, "reward": 0.509719967842102, "reward_std": 0.32573890686035156, "rewards/reward_func/mean": 0.509719967842102, "rewards/reward_func/std": 0.32573890686035156, "step": 837, "step_time": 27.66253012046218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 214.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.39568687975406647, "epoch": 0.038814265863825845, "frac_reward_zero_std": 0.0, "grad_norm": 0.08459905534982681, "kl": 0.0010677657701307908, "learning_rate": 9.922464103751736e-07, "loss": -0.0867, "num_tokens": 23104692.0, "reward": 0.2371823638677597, "reward_std": 0.4242846965789795, "rewards/reward_func/mean": 0.2371823638677597, "rewards/reward_func/std": 0.4242846965789795, "step": 838, "step_time": 21.718665331602097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 183.3125, "completions/mean_terminated_length": 183.3125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.3188238888978958, "epoch": 0.03886058360352015, "frac_reward_zero_std": 0.0, "grad_norm": 0.09563377499580383, "kl": 0.0014460600505117327, "learning_rate": 9.92237146827235e-07, "loss": -0.025, "num_tokens": 23126137.0, "reward": 0.4231693744659424, "reward_std": 0.43837499618530273, "rewards/reward_func/mean": 0.4231693744659424, "rewards/reward_func/std": 0.43837499618530273, "step": 839, "step_time": 19.095177225768566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 126.0, "completions/mean_terminated_length": 126.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2542569972574711, "epoch": 0.03890690134321445, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008802962256595492, "kl": 0.001020480354782194, "learning_rate": 9.92227883279296e-07, "loss": 0.0001, "num_tokens": 23146905.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 840, "step_time": 13.482799373567104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 179.5, "completions/mean_terminated_length": 179.5, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.21041544526815414, "epoch": 0.038953219082908754, "frac_reward_zero_std": 0.0, "grad_norm": 0.0753721222281456, "kl": 0.0010078297345899045, "learning_rate": 9.92218619731357e-07, "loss": 0.0082, "num_tokens": 23175985.0, "reward": 0.9868549108505249, "reward_std": 0.03591921180486679, "rewards/reward_func/mean": 0.9868549108505249, "rewards/reward_func/std": 0.035919200628995895, "step": 841, "step_time": 20.711773075163364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 122.75, "completions/mean_terminated_length": 122.75, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.24506905302405357, "epoch": 0.038999536822603056, "frac_reward_zero_std": 1.0, "grad_norm": 0.001000660820864141, "kl": 0.0011933351051993668, "learning_rate": 9.922093561834183e-07, "loss": 0.0001, "num_tokens": 23195421.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 842, "step_time": 14.039854612201452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 194.6875, "completions/mean_terminated_length": 194.6875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.3695313408970833, "epoch": 0.03904585456229736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011918543605133891, "kl": 0.0014249386440496892, "learning_rate": 9.922000926354794e-07, "loss": 0.0001, "num_tokens": 23221464.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 843, "step_time": 21.10698227584362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 294.9375, "completions/mean_terminated_length": 294.9375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.21876288205385208, "epoch": 0.03909217230199166, "frac_reward_zero_std": 0.0, "grad_norm": 0.06910669058561325, "kl": 0.0007599894743179902, "learning_rate": 9.921908290875405e-07, "loss": -0.0502, "num_tokens": 23250951.0, "reward": 0.6757249236106873, "reward_std": 0.26562076807022095, "rewards/reward_func/mean": 0.6757249236106873, "rewards/reward_func/std": 0.26562079787254333, "step": 844, "step_time": 29.39313641563058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 145.1875, "completions/mean_terminated_length": 145.1875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3069569543004036, "epoch": 0.039138490041685965, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016099393833428621, "kl": 0.0014371881261467934, "learning_rate": 9.921815655396016e-07, "loss": 0.0001, "num_tokens": 23278538.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 845, "step_time": 17.548515994101763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 149.8125, "completions/mean_terminated_length": 149.8125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.17683753743767738, "epoch": 0.03918480778138027, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006053652614355087, "kl": 0.0007061581200105138, "learning_rate": 9.921723019916628e-07, "loss": 0.0, "num_tokens": 23302695.0, "reward": 0.8883547186851501, "reward_std": 0.0, "rewards/reward_func/mean": 0.8883547186851501, "rewards/reward_func/std": 0.0, "step": 846, "step_time": 16.126937676221132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 130.875, "completions/mean_terminated_length": 130.875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3190002292394638, "epoch": 0.03923112552107457, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012537972070276737, "kl": 0.0014405875408556312, "learning_rate": 9.92163038443724e-07, "loss": 0.0001, "num_tokens": 23322405.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 847, "step_time": 16.301903445273638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 177.6875, "completions/mean_terminated_length": 177.6875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.2442517653107643, "epoch": 0.039277443260768874, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016692023491486907, "kl": 0.0013725541066378355, "learning_rate": 9.92153774895785e-07, "loss": 0.0001, "num_tokens": 23345440.0, "reward": 0.4111122786998749, "reward_std": 0.0, "rewards/reward_func/mean": 0.4111122786998749, "rewards/reward_func/std": 0.0, "step": 848, "step_time": 17.377079091966152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 192.0625, "completions/mean_terminated_length": 192.0625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.20605628937482834, "epoch": 0.03932376100046318, "frac_reward_zero_std": 0.0, "grad_norm": 0.08510545641183853, "kl": 0.0008309491677209735, "learning_rate": 9.921445113478461e-07, "loss": -0.0259, "num_tokens": 23383105.0, "reward": 0.3339391350746155, "reward_std": 0.0740714743733406, "rewards/reward_func/mean": 0.3339391350746155, "rewards/reward_func/std": 0.0740714892745018, "step": 849, "step_time": 22.711984291672707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 156.5, "completions/mean_terminated_length": 156.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.22407619282603264, "epoch": 0.03937007874015748, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011288082459941506, "kl": 0.0011935117654502392, "learning_rate": 9.921352477999073e-07, "loss": 0.0001, "num_tokens": 23403849.0, "reward": 0.5488116145133972, "reward_std": 0.0, "rewards/reward_func/mean": 0.5488116145133972, "rewards/reward_func/std": 0.0, "step": 850, "step_time": 15.657405402511358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 206.625, "completions/mean_terminated_length": 206.625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.2101145051419735, "epoch": 0.03941639647985178, "frac_reward_zero_std": 0.0, "grad_norm": 0.05516713857650757, "kl": 0.0008015447965590283, "learning_rate": 9.921259842519684e-07, "loss": -0.013, "num_tokens": 23427795.0, "reward": 0.9432417750358582, "reward_std": 0.022156143561005592, "rewards/reward_func/mean": 0.9432417750358582, "rewards/reward_func/std": 0.022156143561005592, "step": 851, "step_time": 20.915113903582096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 174.1875, "completions/mean_terminated_length": 174.1875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.34642454981803894, "epoch": 0.039462714219546086, "frac_reward_zero_std": 0.0, "grad_norm": 0.10046864300966263, "kl": 0.002284130488988012, "learning_rate": 9.921167207040297e-07, "loss": -0.0457, "num_tokens": 23464934.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.3415650427341461, "step": 852, "step_time": 23.169690739363432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 178.25, "completions/mean_terminated_length": 178.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.2737343981862068, "epoch": 0.03950903195924039, "frac_reward_zero_std": 0.0, "grad_norm": 0.09689640998840332, "kl": 0.0012106007779948413, "learning_rate": 9.921074571560909e-07, "loss": -0.022, "num_tokens": 23499866.0, "reward": 0.919789731502533, "reward_std": 0.07305874675512314, "rewards/reward_func/mean": 0.919789731502533, "rewards/reward_func/std": 0.07305874675512314, "step": 853, "step_time": 20.427601240575314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 174.625, "completions/mean_terminated_length": 174.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.1593393310904503, "epoch": 0.03955534969893469, "frac_reward_zero_std": 1.0, "grad_norm": 0.00044821377377957106, "kl": 0.0005306711173034273, "learning_rate": 9.920981936081518e-07, "loss": 0.0, "num_tokens": 23527508.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 854, "step_time": 19.216147657483816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 173.625, "completions/mean_terminated_length": 173.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.25104890763759613, "epoch": 0.039601667438628994, "frac_reward_zero_std": 1.0, "grad_norm": 0.007444969844073057, "kl": 0.0022193838376551867, "learning_rate": 9.920889300602129e-07, "loss": 0.0001, "num_tokens": 23565038.0, "reward": 0.894839346408844, "reward_std": 0.0, "rewards/reward_func/mean": 0.894839346408844, "rewards/reward_func/std": 0.0, "step": 855, "step_time": 21.615963652729988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 180.0625, "completions/mean_terminated_length": 180.0625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3022039160132408, "epoch": 0.0396479851783233, "frac_reward_zero_std": 0.0, "grad_norm": 0.07054363191127777, "kl": 0.0011581739236135036, "learning_rate": 9.920796665122742e-07, "loss": 0.0049, "num_tokens": 23596463.0, "reward": 0.21576446294784546, "reward_std": 0.37498414516448975, "rewards/reward_func/mean": 0.21576446294784546, "rewards/reward_func/std": 0.37498414516448975, "step": 856, "step_time": 20.23256105557084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 220.875, "completions/mean_terminated_length": 220.875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.28507737815380096, "epoch": 0.0396943029180176, "frac_reward_zero_std": 0.0, "grad_norm": 0.06856705248355865, "kl": 0.0010991398594342172, "learning_rate": 9.920704029643354e-07, "loss": -0.0638, "num_tokens": 23628525.0, "reward": 0.08405046164989471, "reward_std": 0.0403740294277668, "rewards/reward_func/mean": 0.08405046164989471, "rewards/reward_func/std": 0.0403740257024765, "step": 857, "step_time": 25.41255698353052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 173.125, "completions/mean_terminated_length": 173.125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.23209454864263535, "epoch": 0.0397406206577119, "frac_reward_zero_std": 0.0, "grad_norm": 0.07368174940347672, "kl": 0.0009128926321864128, "learning_rate": 9.920611394163965e-07, "loss": 0.0181, "num_tokens": 23685775.0, "reward": 0.8295896053314209, "reward_std": 0.00487559475004673, "rewards/reward_func/mean": 0.8295896053314209, "rewards/reward_func/std": 0.004875591490417719, "step": 858, "step_time": 27.572569452226162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 159.6875, "completions/mean_terminated_length": 159.6875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.33524635434150696, "epoch": 0.039786938397406206, "frac_reward_zero_std": 1.0, "grad_norm": 0.000995760434307158, "kl": 0.0011498018284328282, "learning_rate": 9.920518758684576e-07, "loss": 0.0001, "num_tokens": 23720682.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 859, "step_time": 19.58662161231041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 130.25, "completions/mean_terminated_length": 130.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.24422408640384674, "epoch": 0.03983325613710051, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007869779947213829, "kl": 0.0008795668691163883, "learning_rate": 9.920426123205187e-07, "loss": 0.0, "num_tokens": 23746990.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 860, "step_time": 15.046861194074154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 177.9375, "completions/mean_terminated_length": 177.9375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.35781367123126984, "epoch": 0.03987957387679481, "frac_reward_zero_std": 1.0, "grad_norm": 0.003613840788602829, "kl": 0.002170583524275571, "learning_rate": 9.920333487725799e-07, "loss": 0.0001, "num_tokens": 23776781.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 861, "step_time": 21.053212970495224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 298.0625, "completions/mean_terminated_length": 298.0625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.3049522712826729, "epoch": 0.039925891616489115, "frac_reward_zero_std": 0.0, "grad_norm": 0.058802779763936996, "kl": 0.001043334777932614, "learning_rate": 9.92024085224641e-07, "loss": -0.0937, "num_tokens": 23807566.0, "reward": 0.8277145624160767, "reward_std": 0.3702131509780884, "rewards/reward_func/mean": 0.8277145624160767, "rewards/reward_func/std": 0.3702131509780884, "step": 862, "step_time": 29.90588680282235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 171.5, "completions/mean_terminated_length": 171.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.13397923856973648, "epoch": 0.03997220935618342, "frac_reward_zero_std": 1.0, "grad_norm": 0.001241574645973742, "kl": 0.0006542782066389918, "learning_rate": 9.92014821676702e-07, "loss": 0.0, "num_tokens": 23839910.0, "reward": 0.8914703726768494, "reward_std": 0.0, "rewards/reward_func/mean": 0.8914703726768494, "rewards/reward_func/std": 0.0, "step": 863, "step_time": 19.784176409244537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 121.125, "completions/mean_terminated_length": 121.125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2424134723842144, "epoch": 0.04001852709587772, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008954207296483219, "kl": 0.0009859423444140702, "learning_rate": 9.920055581287632e-07, "loss": 0.0, "num_tokens": 23859256.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 864, "step_time": 13.532780464738607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 142.375, "completions/mean_terminated_length": 142.375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3090411126613617, "epoch": 0.040064844835572024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008063865243457258, "kl": 0.0012019427958875895, "learning_rate": 9.919962945808244e-07, "loss": 0.0001, "num_tokens": 23891886.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 865, "step_time": 18.694202043116093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 198.5625, "completions/mean_terminated_length": 198.5625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.38849782943725586, "epoch": 0.040111162575266326, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007156712817959487, "kl": 0.001291893218876794, "learning_rate": 9.919870310328855e-07, "loss": 0.0001, "num_tokens": 23919591.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 866, "step_time": 20.873049806803465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 118.0, "completions/mean_terminated_length": 118.0, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.27741505205631256, "epoch": 0.04015748031496063, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007374704582616687, "kl": 0.001058865716913715, "learning_rate": 9.919777674849466e-07, "loss": 0.0001, "num_tokens": 23939639.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 867, "step_time": 12.763438243418932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 157.9375, "completions/mean_terminated_length": 157.9375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3621453046798706, "epoch": 0.04020379805465493, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011747012613341212, "kl": 0.0012898755667265505, "learning_rate": 9.919685039370077e-07, "loss": 0.0001, "num_tokens": 23961366.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 868, "step_time": 16.606018260121346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 194.1875, "completions/mean_terminated_length": 194.1875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.15809489414095879, "epoch": 0.040250115794349235, "frac_reward_zero_std": 0.0, "grad_norm": 0.15900906920433044, "kl": 0.0012111798860132694, "learning_rate": 9.91959240389069e-07, "loss": -0.0648, "num_tokens": 23985497.0, "reward": 0.8437808156013489, "reward_std": 0.1455288976430893, "rewards/reward_func/mean": 0.8437808156013489, "rewards/reward_func/std": 0.1455289125442505, "step": 869, "step_time": 19.615712836384773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 178.5, "completions/mean_terminated_length": 178.5, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.4628566801548004, "epoch": 0.04029643353404354, "frac_reward_zero_std": 0.0, "grad_norm": 0.09006517380475998, "kl": 0.0014835805050097406, "learning_rate": 9.919499768411302e-07, "loss": 0.0441, "num_tokens": 24010481.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 870, "step_time": 19.705397214740515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 180.375, "completions/mean_terminated_length": 180.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.14962278679013252, "epoch": 0.04034275127373784, "frac_reward_zero_std": 0.0, "grad_norm": 0.06065298616886139, "kl": 0.0007261905120685697, "learning_rate": 9.919407132931913e-07, "loss": -0.0105, "num_tokens": 24048807.0, "reward": 0.925041675567627, "reward_std": 0.01998889446258545, "rewards/reward_func/mean": 0.925041675567627, "rewards/reward_func/std": 0.0199888963252306, "step": 871, "step_time": 20.8767249584198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 140.0625, "completions/mean_terminated_length": 140.0625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3491576388478279, "epoch": 0.040389069013432144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018656151369214058, "kl": 0.0017743144708219916, "learning_rate": 9.919314497452524e-07, "loss": 0.0001, "num_tokens": 24081528.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 872, "step_time": 18.238072484731674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 138.0, "completions/mean_terminated_length": 138.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3172079026699066, "epoch": 0.04043538675312645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008050501928664744, "kl": 0.0010919965570792556, "learning_rate": 9.919221861973136e-07, "loss": 0.0001, "num_tokens": 24107192.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 873, "step_time": 15.443362895399332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 135.4375, "completions/mean_terminated_length": 135.4375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2499684989452362, "epoch": 0.04048170449282075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015567787922918797, "kl": 0.001300970237934962, "learning_rate": 9.919129226493747e-07, "loss": 0.0001, "num_tokens": 24127535.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 874, "step_time": 15.023460488766432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 135.9375, "completions/mean_terminated_length": 135.9375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.33585429191589355, "epoch": 0.04052802223251505, "frac_reward_zero_std": 1.0, "grad_norm": 0.001978588756173849, "kl": 0.0015191614511422813, "learning_rate": 9.919036591014358e-07, "loss": 0.0001, "num_tokens": 24159726.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 875, "step_time": 16.590162009000778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 130.0, "completions/mean_terminated_length": 130.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.33138158917427063, "epoch": 0.040574339972209356, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008576564723625779, "kl": 0.0013629891618620604, "learning_rate": 9.91894395553497e-07, "loss": 0.0001, "num_tokens": 24185342.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 876, "step_time": 14.932226613163948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 156.4375, "completions/mean_terminated_length": 156.4375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.2170252650976181, "epoch": 0.04062065771190366, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016902198549360037, "kl": 0.0015073083341121674, "learning_rate": 9.91885132005558e-07, "loss": 0.0001, "num_tokens": 24207701.0, "reward": 0.894839346408844, "reward_std": 0.0, "rewards/reward_func/mean": 0.894839346408844, "rewards/reward_func/std": 0.0, "step": 877, "step_time": 16.58178937062621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 214.25, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.2176058068871498, "epoch": 0.04066697545159796, "frac_reward_zero_std": 0.0, "grad_norm": 0.09435556083917618, "kl": 0.0012219191703479737, "learning_rate": 9.918758684576192e-07, "loss": 0.0961, "num_tokens": 24239897.0, "reward": 0.599349856376648, "reward_std": 0.23396223783493042, "rewards/reward_func/mean": 0.599349856376648, "rewards/reward_func/std": 0.23396222293376923, "step": 878, "step_time": 26.13662463799119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 128.1875, "completions/mean_terminated_length": 128.1875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.23211872950196266, "epoch": 0.040713293191292264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012353787897154689, "kl": 0.0011770162091124803, "learning_rate": 9.918666049096803e-07, "loss": 0.0001, "num_tokens": 24259372.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 879, "step_time": 13.605142381042242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 150.25, "completions/mean_terminated_length": 150.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.35946550220251083, "epoch": 0.04075961093098657, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010624685091897845, "kl": 0.0015476414700970054, "learning_rate": 9.918573413617414e-07, "loss": 0.0001, "num_tokens": 24291056.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 880, "step_time": 19.141439214348793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 176.625, "completions/mean_terminated_length": 176.625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.31737447530031204, "epoch": 0.04080592867068087, "frac_reward_zero_std": 1.0, "grad_norm": 0.001965844538062811, "kl": 0.0018645181262400001, "learning_rate": 9.918480778138026e-07, "loss": 0.0001, "num_tokens": 24329018.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 881, "step_time": 21.36028627678752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 149.75, "completions/mean_terminated_length": 149.75, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.32912537455558777, "epoch": 0.04085224641037517, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013385425554588437, "kl": 0.0013619031524285674, "learning_rate": 9.91838814265864e-07, "loss": 0.0001, "num_tokens": 24357574.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 882, "step_time": 16.5103618837893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 219.25, "completions/mean_terminated_length": 219.25, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.2413436844944954, "epoch": 0.040898564150069476, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006712871254421771, "kl": 0.0009202266519423574, "learning_rate": 9.91829550717925e-07, "loss": 0.0, "num_tokens": 24381178.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 883, "step_time": 21.478361073881388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 126.9375, "completions/mean_terminated_length": 126.9375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.29485518485307693, "epoch": 0.04094488188976378, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011903924169018865, "kl": 0.001347521523712203, "learning_rate": 9.918202871699862e-07, "loss": 0.0001, "num_tokens": 24413337.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 884, "step_time": 16.653850506991148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 137.375, "completions/mean_terminated_length": 137.375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3341088593006134, "epoch": 0.04099119962945808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018057597335428, "kl": 0.0015847394533921033, "learning_rate": 9.91811023622047e-07, "loss": 0.0001, "num_tokens": 24449391.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 885, "step_time": 18.39985877275467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 180.0625, "completions/mean_terminated_length": 180.0625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.34564724564552307, "epoch": 0.041037517369152385, "frac_reward_zero_std": 0.0, "grad_norm": 0.11464457958936691, "kl": 0.0013137346832081676, "learning_rate": 9.918017600741084e-07, "loss": -0.0892, "num_tokens": 24473376.0, "reward": 0.375, "reward_std": 0.5, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5, "step": 886, "step_time": 20.970414962619543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 125.6875, "completions/mean_terminated_length": 125.6875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3436511158943176, "epoch": 0.04108383510884669, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011709880782291293, "kl": 0.001377139298710972, "learning_rate": 9.917924965261695e-07, "loss": 0.0001, "num_tokens": 24501003.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 887, "step_time": 16.413439992815256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 178.75, "completions/mean_terminated_length": 178.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.21886874735355377, "epoch": 0.04113015284854099, "frac_reward_zero_std": 0.0, "grad_norm": 0.08236662298440933, "kl": 0.0008434822811977938, "learning_rate": 9.917832329782307e-07, "loss": -0.0085, "num_tokens": 24534935.0, "reward": 0.9118726253509521, "reward_std": 0.052548982203006744, "rewards/reward_func/mean": 0.9118726253509521, "rewards/reward_func/std": 0.05254898592829704, "step": 888, "step_time": 20.45571358129382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 224.0625, "completions/mean_terminated_length": 224.0625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.1623828411102295, "epoch": 0.041176470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005422509857453406, "kl": 0.0007415378058794886, "learning_rate": 9.917739694302918e-07, "loss": 0.0, "num_tokens": 24566520.0, "reward": 0.9459594488143921, "reward_std": 0.0, "rewards/reward_func/mean": 0.9459594488143921, "rewards/reward_func/std": 0.0, "step": 889, "step_time": 23.425292938947678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 179.0, "completions/mean_terminated_length": 179.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4034350737929344, "epoch": 0.041222788327929596, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009264025138691068, "kl": 0.0012167560926172882, "learning_rate": 9.91764705882353e-07, "loss": 0.0001, "num_tokens": 24592968.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 890, "step_time": 19.325445406138897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 111.0, "completions/mean_terminated_length": 111.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.23975902423262596, "epoch": 0.0412691060676239, "frac_reward_zero_std": 1.0, "grad_norm": 0.001312562613748014, "kl": 0.0012075002596247941, "learning_rate": 9.91755442334414e-07, "loss": 0.0001, "num_tokens": 24612440.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 891, "step_time": 13.437476575374603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 121.75, "completions/mean_terminated_length": 121.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2583157308399677, "epoch": 0.0413154238073182, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008441361715085804, "kl": 0.0011131021892651916, "learning_rate": 9.917461787864751e-07, "loss": 0.0001, "num_tokens": 24635716.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 892, "step_time": 13.729831136763096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 137.75, "completions/mean_terminated_length": 137.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2921217456459999, "epoch": 0.041361741547012505, "frac_reward_zero_std": 1.0, "grad_norm": 0.00707467133179307, "kl": 0.0020684940682258457, "learning_rate": 9.917369152385363e-07, "loss": 0.0001, "num_tokens": 24660816.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 893, "step_time": 16.902304004877806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 175.0625, "completions/mean_terminated_length": 175.0625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.2442249245941639, "epoch": 0.04140805928670681, "frac_reward_zero_std": 0.0, "grad_norm": 0.06777264922857285, "kl": 0.000899084858247079, "learning_rate": 9.917276516905974e-07, "loss": -0.027, "num_tokens": 24683361.0, "reward": 0.8284921050071716, "reward_std": 0.2555558681488037, "rewards/reward_func/mean": 0.8284921050071716, "rewards/reward_func/std": 0.2555558979511261, "step": 894, "step_time": 18.19170006364584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 154.3125, "completions/mean_terminated_length": 154.3125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.2531934566795826, "epoch": 0.04145437702640111, "frac_reward_zero_std": 1.0, "grad_norm": 0.00047968150465749204, "kl": 0.000688874781189952, "learning_rate": 9.917183881426585e-07, "loss": 0.0, "num_tokens": 24710294.0, "reward": 0.9428731203079224, "reward_std": 0.0, "rewards/reward_func/mean": 0.9428731203079224, "rewards/reward_func/std": 0.0, "step": 895, "step_time": 18.35441016405821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 135.125, "completions/mean_terminated_length": 135.125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.26756370812654495, "epoch": 0.041500694766095414, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011044122511520982, "kl": 0.0012441368453437462, "learning_rate": 9.917091245947199e-07, "loss": 0.0001, "num_tokens": 24731000.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 896, "step_time": 14.109066184610128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 119.5625, "completions/mean_terminated_length": 119.5625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.21244856715202332, "epoch": 0.04154701250578972, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010753453243523836, "kl": 0.0012322746915742755, "learning_rate": 9.916998610467808e-07, "loss": 0.0001, "num_tokens": 24750241.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 897, "step_time": 13.320092979818583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 209.875, "completions/mean_terminated_length": 209.875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.3756285607814789, "epoch": 0.04159333024548402, "frac_reward_zero_std": 0.0, "grad_norm": 0.07290878891944885, "kl": 0.0015508253709413111, "learning_rate": 9.91690597498842e-07, "loss": -0.1088, "num_tokens": 24778655.0, "reward": 0.13992351293563843, "reward_std": 0.3008265197277069, "rewards/reward_func/mean": 0.13992351293563843, "rewards/reward_func/std": 0.3008265197277069, "step": 898, "step_time": 24.30946659296751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 151.75, "completions/mean_terminated_length": 151.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.33028341084718704, "epoch": 0.04163964798517832, "frac_reward_zero_std": 1.0, "grad_norm": 0.000989797175861895, "kl": 0.0011517624952830374, "learning_rate": 9.916813339509032e-07, "loss": 0.0001, "num_tokens": 24808507.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 899, "step_time": 17.414367869496346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 158.375, "completions/mean_terminated_length": 158.375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.2317407764494419, "epoch": 0.041685965724872626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010478079784661531, "kl": 0.001201534309075214, "learning_rate": 9.916720704029644e-07, "loss": 0.0001, "num_tokens": 24831665.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 900, "step_time": 16.36012415215373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 221.4375, "completions/mean_terminated_length": 221.4375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.2911301776766777, "epoch": 0.04173228346456693, "frac_reward_zero_std": 0.0, "grad_norm": 0.057238928973674774, "kl": 0.0009754351049195975, "learning_rate": 9.916628068550255e-07, "loss": -0.0585, "num_tokens": 24858776.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 901, "step_time": 24.013755716383457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 180.0625, "completions/mean_terminated_length": 180.0625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.35609270632267, "epoch": 0.04177860120426123, "frac_reward_zero_std": 0.0, "grad_norm": 0.10632283985614777, "kl": 0.0011484703136375174, "learning_rate": 9.916535433070866e-07, "loss": -0.0499, "num_tokens": 24879705.0, "reward": 0.4157751798629761, "reward_std": 0.4877914488315582, "rewards/reward_func/mean": 0.4157751798629761, "rewards/reward_func/std": 0.4877914786338806, "step": 902, "step_time": 22.129552900791168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 122.875, "completions/mean_terminated_length": 122.875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.28633588552474976, "epoch": 0.041824918943955534, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007128144497983158, "kl": 0.0010221440315945074, "learning_rate": 9.916442797591477e-07, "loss": 0.0001, "num_tokens": 24900727.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 903, "step_time": 13.659861445426941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 198.625, "completions/mean_terminated_length": 198.625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.22553906217217445, "epoch": 0.04187123668364984, "frac_reward_zero_std": 1.0, "grad_norm": 0.002935833763331175, "kl": 0.0016055934393079951, "learning_rate": 9.916350162112089e-07, "loss": 0.0001, "num_tokens": 24923665.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 904, "step_time": 22.655254740267992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 315.75, "completions/mean_terminated_length": 315.75, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.25387056171894073, "epoch": 0.04191755442334414, "frac_reward_zero_std": 0.0, "grad_norm": 0.05357150733470917, "kl": 0.0011125316377729177, "learning_rate": 9.9162575266327e-07, "loss": -0.0406, "num_tokens": 24959293.0, "reward": 0.9125852584838867, "reward_std": 0.06320253014564514, "rewards/reward_func/mean": 0.9125852584838867, "rewards/reward_func/std": 0.06320253014564514, "step": 905, "step_time": 31.907665256410837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 163.375, "completions/mean_terminated_length": 163.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.22939521074295044, "epoch": 0.04196387216303844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005068927421234548, "kl": 0.0008163191378116608, "learning_rate": 9.916164891153311e-07, "loss": 0.0, "num_tokens": 24983539.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 906, "step_time": 16.72318310290575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 117.25, "completions/mean_terminated_length": 117.25, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.21878432855010033, "epoch": 0.042010189902732746, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015956859569996595, "kl": 0.0012934014084748924, "learning_rate": 9.916072255673922e-07, "loss": 0.0001, "num_tokens": 25003031.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 907, "step_time": 13.752490423619747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 168.3125, "completions/mean_terminated_length": 168.3125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.22134746611118317, "epoch": 0.04205650764242705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013118194183334708, "kl": 0.0009753816411830485, "learning_rate": 9.915979620194534e-07, "loss": 0.0, "num_tokens": 25024412.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 908, "step_time": 17.913808669894934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 217.375, "completions/mean_terminated_length": 217.375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.18628476187586784, "epoch": 0.04210282538212135, "frac_reward_zero_std": 0.0, "grad_norm": 0.05642639100551605, "kl": 0.000570238204090856, "learning_rate": 9.915886984715145e-07, "loss": -0.0523, "num_tokens": 25046866.0, "reward": 0.5372694134712219, "reward_std": 0.10262186825275421, "rewards/reward_func/mean": 0.5372694134712219, "rewards/reward_func/std": 0.10262187570333481, "step": 909, "step_time": 21.101673137396574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 168.6875, "completions/mean_terminated_length": 168.6875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.46945177018642426, "epoch": 0.042149143121815655, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008493889472447336, "kl": 0.001389862794894725, "learning_rate": 9.915794349235756e-07, "loss": 0.0001, "num_tokens": 25074493.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 910, "step_time": 21.144909985363483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 119.375, "completions/mean_terminated_length": 119.375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.25054027885198593, "epoch": 0.04219546086150996, "frac_reward_zero_std": 1.0, "grad_norm": 0.000666202453430742, "kl": 0.001013650165987201, "learning_rate": 9.915701713756367e-07, "loss": 0.0001, "num_tokens": 25094355.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 911, "step_time": 13.317175682634115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 125.0625, "completions/mean_terminated_length": 125.0625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.28567124903202057, "epoch": 0.04224177860120426, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011674201814457774, "kl": 0.001341990107903257, "learning_rate": 9.91560907827698e-07, "loss": 0.0001, "num_tokens": 25117460.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 912, "step_time": 14.951915934681892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 137.0, "completions/mean_terminated_length": 137.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2821376770734787, "epoch": 0.042288096340898564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014422236708924174, "kl": 0.001285295671550557, "learning_rate": 9.915516442797592e-07, "loss": 0.0001, "num_tokens": 25139940.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 913, "step_time": 15.471151653677225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 132.3125, "completions/mean_terminated_length": 132.3125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.341471403837204, "epoch": 0.042334414080592866, "frac_reward_zero_std": 1.0, "grad_norm": 0.000769183796364814, "kl": 0.0014067496813368052, "learning_rate": 9.915423807318203e-07, "loss": 0.0001, "num_tokens": 25161193.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 914, "step_time": 15.628828033804893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 190.0625, "completions/mean_terminated_length": 190.0625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3295915126800537, "epoch": 0.04238073182028717, "frac_reward_zero_std": 0.0, "grad_norm": 0.08101384341716766, "kl": 0.001092489721486345, "learning_rate": 9.915331171838812e-07, "loss": -0.0696, "num_tokens": 25182810.0, "reward": 0.37304767966270447, "reward_std": 0.3916044235229492, "rewards/reward_func/mean": 0.37304767966270447, "rewards/reward_func/std": 0.3916044235229492, "step": 915, "step_time": 22.14657584577799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 129.5, "completions/mean_terminated_length": 129.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2927909716963768, "epoch": 0.04242704955998147, "frac_reward_zero_std": 1.0, "grad_norm": 0.002993886824697256, "kl": 0.002071876573609188, "learning_rate": 9.915238536359426e-07, "loss": 0.0001, "num_tokens": 25207634.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 916, "step_time": 15.564940758049488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 126.375, "completions/mean_terminated_length": 126.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2962787076830864, "epoch": 0.042473367299675775, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030500530265271664, "kl": 0.0018200974445790052, "learning_rate": 9.915145900880037e-07, "loss": 0.0001, "num_tokens": 25227960.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 917, "step_time": 14.766521293669939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 121.625, "completions/mean_terminated_length": 121.625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2951975166797638, "epoch": 0.04251968503937008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010513971792533994, "kl": 0.001188266251119785, "learning_rate": 9.915053265400648e-07, "loss": 0.0001, "num_tokens": 25256098.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 918, "step_time": 14.576378718018532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 183.1875, "completions/mean_terminated_length": 183.1875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.43721695989370346, "epoch": 0.04256600277906438, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005205922061577439, "kl": 0.0011477641965029761, "learning_rate": 9.91496062992126e-07, "loss": 0.0001, "num_tokens": 25293717.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 919, "step_time": 22.255840439349413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 171.375, "completions/mean_terminated_length": 171.375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.3722553998231888, "epoch": 0.042612320518758684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008447070722468197, "kl": 0.0011220781016163528, "learning_rate": 9.91486799444187e-07, "loss": 0.0001, "num_tokens": 25315771.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 920, "step_time": 17.35065533220768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 223.4375, "completions/mean_terminated_length": 223.4375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.19360385835170746, "epoch": 0.04265863825845299, "frac_reward_zero_std": 0.0, "grad_norm": 0.06590349227190018, "kl": 0.0006857907719677314, "learning_rate": 9.914775358962482e-07, "loss": -0.0534, "num_tokens": 25345906.0, "reward": 0.7841700911521912, "reward_std": 0.12175922840833664, "rewards/reward_func/mean": 0.7841700911521912, "rewards/reward_func/std": 0.12175923585891724, "step": 921, "step_time": 23.794354770332575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 186.3125, "completions/mean_terminated_length": 186.3125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.15523645281791687, "epoch": 0.04270495599814729, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003852669906336814, "kl": 0.0005026564249419607, "learning_rate": 9.914682723483093e-07, "loss": 0.0, "num_tokens": 25383495.0, "reward": 0.8869204521179199, "reward_std": 0.0, "rewards/reward_func/mean": 0.8869204521179199, "rewards/reward_func/std": 0.0, "step": 922, "step_time": 22.65672117099166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 130.625, "completions/mean_terminated_length": 130.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.28473517298698425, "epoch": 0.04275127373784159, "frac_reward_zero_std": 1.0, "grad_norm": 0.000955732713919133, "kl": 0.0010942591761704534, "learning_rate": 9.914590088003704e-07, "loss": 0.0001, "num_tokens": 25408577.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 923, "step_time": 14.98143096268177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 125.8125, "completions/mean_terminated_length": 125.8125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.32515229284763336, "epoch": 0.042797591477535896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010904496302828193, "kl": 0.0014065119030419737, "learning_rate": 9.914497452524316e-07, "loss": 0.0001, "num_tokens": 25430094.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 924, "step_time": 15.248995453119278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 186.5625, "completions/mean_terminated_length": 186.5625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.19039393216371536, "epoch": 0.0428439092172302, "frac_reward_zero_std": 1.0, "grad_norm": 0.000549613730981946, "kl": 0.0007599204545840621, "learning_rate": 9.914404817044927e-07, "loss": 0.0, "num_tokens": 25469255.0, "reward": 0.894839346408844, "reward_std": 0.0, "rewards/reward_func/mean": 0.894839346408844, "rewards/reward_func/std": 0.0, "step": 925, "step_time": 23.040058355778456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 154.5, "completions/mean_terminated_length": 154.5, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3542131409049034, "epoch": 0.0428902269569245, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012450783979147673, "kl": 0.0013468109245877713, "learning_rate": 9.91431218156554e-07, "loss": 0.0001, "num_tokens": 25489807.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 926, "step_time": 16.3294418156147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 273.3125, "completions/mean_terminated_length": 273.3125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.19528140500187874, "epoch": 0.042936544696618804, "frac_reward_zero_std": 0.0, "grad_norm": 0.08553256094455719, "kl": 0.0008833478495944291, "learning_rate": 9.914219546086152e-07, "loss": -0.1422, "num_tokens": 25520516.0, "reward": 0.4365028440952301, "reward_std": 0.3096548616886139, "rewards/reward_func/mean": 0.4365028440952301, "rewards/reward_func/std": 0.3096548616886139, "step": 927, "step_time": 28.269976779818535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 201.75, "completions/mean_terminated_length": 201.75, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.32308219373226166, "epoch": 0.04298286243631311, "frac_reward_zero_std": 0.0, "grad_norm": 0.06600859761238098, "kl": 0.0014190545916790143, "learning_rate": 9.91412691060676e-07, "loss": -0.015, "num_tokens": 25543296.0, "reward": 0.3001863360404968, "reward_std": 0.4620959162712097, "rewards/reward_func/mean": 0.3001863360404968, "rewards/reward_func/std": 0.4620959162712097, "step": 928, "step_time": 19.748896960169077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 182.4375, "completions/mean_terminated_length": 182.4375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3041607439517975, "epoch": 0.04302918017600741, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006013662205077708, "kl": 0.0010577643843134865, "learning_rate": 9.914034275127374e-07, "loss": 0.0001, "num_tokens": 25564583.0, "reward": 0.7958667874336243, "reward_std": 0.0, "rewards/reward_func/mean": 0.7958667874336243, "rewards/reward_func/std": 0.0, "step": 929, "step_time": 18.69086018204689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 193.6875, "completions/mean_terminated_length": 193.6875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.2321130819618702, "epoch": 0.04307549791570171, "frac_reward_zero_std": 0.0, "grad_norm": 0.07265283912420273, "kl": 0.0011636123526841402, "learning_rate": 9.913941639647985e-07, "loss": -0.0249, "num_tokens": 25586082.0, "reward": 0.8816871643066406, "reward_std": 0.12416164577007294, "rewards/reward_func/mean": 0.8816871643066406, "rewards/reward_func/std": 0.12416165322065353, "step": 930, "step_time": 19.568458043038845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 116.0, "completions/mean_terminated_length": 116.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3222050368785858, "epoch": 0.043121815655396016, "frac_reward_zero_std": 1.0, "grad_norm": 0.001881249132566154, "kl": 0.0021569784148596227, "learning_rate": 9.913849004168597e-07, "loss": 0.0001, "num_tokens": 25616818.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 931, "step_time": 15.721865832805634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 211.125, "completions/mean_terminated_length": 211.125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.408104345202446, "epoch": 0.04316813339509032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006213901797309518, "kl": 0.0011445400887168944, "learning_rate": 9.913756368689208e-07, "loss": 0.0001, "num_tokens": 25644132.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 932, "step_time": 21.428357008844614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 155.875, "completions/mean_terminated_length": 155.875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.31693416833877563, "epoch": 0.04321445113478462, "frac_reward_zero_std": 0.0, "grad_norm": 0.07095703482627869, "kl": 0.0012221091310493648, "learning_rate": 9.91366373320982e-07, "loss": 0.0111, "num_tokens": 25668402.0, "reward": 0.0486750490963459, "reward_std": 0.1947001814842224, "rewards/reward_func/mean": 0.0486750490963459, "rewards/reward_func/std": 0.1947001963853836, "step": 933, "step_time": 16.34353280812502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 202.75, "completions/mean_terminated_length": 202.75, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.25528978556394577, "epoch": 0.043260768874478925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009993150597438216, "kl": 0.001088865305064246, "learning_rate": 9.91357109773043e-07, "loss": 0.0001, "num_tokens": 25695598.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 934, "step_time": 21.84546685218811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 140.875, "completions/mean_terminated_length": 140.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.27832359075546265, "epoch": 0.04330708661417323, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024308261927217245, "kl": 0.0015019922284409404, "learning_rate": 9.913478462251042e-07, "loss": 0.0001, "num_tokens": 25728604.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 935, "step_time": 17.569377820938826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 136.5625, "completions/mean_terminated_length": 136.5625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.39912573993206024, "epoch": 0.04335340435386753, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009236956248059869, "kl": 0.0014983774744905531, "learning_rate": 9.913385826771653e-07, "loss": 0.0001, "num_tokens": 25752997.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 936, "step_time": 16.631429065018892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 203.4375, "completions/mean_terminated_length": 203.4375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.4405878186225891, "epoch": 0.043399722093561834, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007935868925414979, "kl": 0.001365014468319714, "learning_rate": 9.913293191292264e-07, "loss": 0.0001, "num_tokens": 25777132.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 937, "step_time": 20.493006374686956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 170.5, "completions/mean_terminated_length": 170.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3793506845831871, "epoch": 0.043446039833256136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013016482116654515, "kl": 0.0016592135361861438, "learning_rate": 9.913200555812875e-07, "loss": 0.0001, "num_tokens": 25822580.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 938, "step_time": 23.562003422528505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 163.0, "completions/mean_terminated_length": 163.0, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.3772532120347023, "epoch": 0.04349235757295044, "frac_reward_zero_std": 0.0, "grad_norm": 0.131752610206604, "kl": 0.0012197851901873946, "learning_rate": 9.913107920333489e-07, "loss": -0.0841, "num_tokens": 25846276.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.3415650427341461, "step": 939, "step_time": 19.540890879929066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 125.9375, "completions/mean_terminated_length": 125.9375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.1573271043598652, "epoch": 0.04353867531264474, "frac_reward_zero_std": 1.0, "grad_norm": 0.000697934243362397, "kl": 0.0007628782768733799, "learning_rate": 9.913015284854098e-07, "loss": 0.0, "num_tokens": 25879619.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 940, "step_time": 16.907122440636158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 163.625, "completions/mean_terminated_length": 163.625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.260529488325119, "epoch": 0.043584993052339045, "frac_reward_zero_std": 0.0, "grad_norm": 0.10483313351869583, "kl": 0.0011952401837334037, "learning_rate": 9.91292264937471e-07, "loss": 0.0724, "num_tokens": 25907725.0, "reward": 0.877037763595581, "reward_std": 0.23387674987316132, "rewards/reward_func/mean": 0.877037763595581, "rewards/reward_func/std": 0.23387673497200012, "step": 941, "step_time": 21.299443446099758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 202.5625, "completions/mean_terminated_length": 202.5625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.20744986832141876, "epoch": 0.04363131079203335, "frac_reward_zero_std": 0.0, "grad_norm": 0.07444166392087936, "kl": 0.0012948303774464875, "learning_rate": 9.912830013895322e-07, "loss": -0.0117, "num_tokens": 25945494.0, "reward": 0.8516945838928223, "reward_std": 0.22711853682994843, "rewards/reward_func/mean": 0.8516945838928223, "rewards/reward_func/std": 0.22711855173110962, "step": 942, "step_time": 24.27873231470585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 132.625, "completions/mean_terminated_length": 132.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.32297907769680023, "epoch": 0.04367762853172765, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022987041156738997, "kl": 0.0015412152570206672, "learning_rate": 9.912737378415934e-07, "loss": 0.0001, "num_tokens": 25974688.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 943, "step_time": 17.78444692119956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.29273275285959244, "epoch": 0.043723946271421954, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006314238416962326, "kl": 0.0008948555623646826, "learning_rate": 9.912644742936545e-07, "loss": 0.0, "num_tokens": 26011630.0, "reward": 0.8507331609725952, "reward_std": 0.0, "rewards/reward_func/mean": 0.8507331609725952, "rewards/reward_func/std": 0.0, "step": 944, "step_time": 23.350538298487663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 171.125, "completions/mean_terminated_length": 171.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.23666785657405853, "epoch": 0.04377026401111626, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015405794838443398, "kl": 0.0015227313851937652, "learning_rate": 9.912552107457156e-07, "loss": 0.0001, "num_tokens": 26045904.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 945, "step_time": 19.95811043307185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 142.375, "completions/mean_terminated_length": 142.375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3757862076163292, "epoch": 0.04381658175081056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013596073258668184, "kl": 0.0014998194528743625, "learning_rate": 9.912459471977767e-07, "loss": 0.0001, "num_tokens": 26065942.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 946, "step_time": 16.22507019340992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 152.25, "completions/mean_terminated_length": 152.25, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3453558087348938, "epoch": 0.04386289949050486, "frac_reward_zero_std": 1.0, "grad_norm": 0.001597587950527668, "kl": 0.0018345113785471767, "learning_rate": 9.912366836498379e-07, "loss": 0.0001, "num_tokens": 26120394.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 947, "step_time": 24.664816740900278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 204.8125, "completions/mean_terminated_length": 204.8125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.39521076530218124, "epoch": 0.043909217230199166, "frac_reward_zero_std": 0.0, "grad_norm": 0.08535200357437134, "kl": 0.001448167604394257, "learning_rate": 9.91227420101899e-07, "loss": -0.0268, "num_tokens": 26145319.0, "reward": 0.04770008474588394, "reward_std": 0.018802374601364136, "rewards/reward_func/mean": 0.04770008474588394, "rewards/reward_func/std": 0.018802374601364136, "step": 948, "step_time": 25.312162697315216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 164.0, "completions/mean_terminated_length": 164.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3368866443634033, "epoch": 0.04395553496989347, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009091639658436179, "kl": 0.0012967442453373224, "learning_rate": 9.912181565539601e-07, "loss": 0.0001, "num_tokens": 26182023.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 949, "step_time": 20.87411253899336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 133.5, "completions/mean_terminated_length": 133.5, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2637072578072548, "epoch": 0.04400185270958777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008538886904716492, "kl": 0.0010562004172243178, "learning_rate": 9.912088930060212e-07, "loss": 0.0001, "num_tokens": 26203711.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 950, "step_time": 15.144162889569998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 127.875, "completions/mean_terminated_length": 127.875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2999601513147354, "epoch": 0.044048170449282074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007343225297518075, "kl": 0.00115480792010203, "learning_rate": 9.911996294580824e-07, "loss": 0.0001, "num_tokens": 26227069.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 951, "step_time": 14.849574849009514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 154.125, "completions/mean_terminated_length": 154.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.18190209940075874, "epoch": 0.04409448818897638, "frac_reward_zero_std": 0.0, "grad_norm": 0.08292274177074432, "kl": 0.0009367444145027548, "learning_rate": 9.911903659101435e-07, "loss": 0.0217, "num_tokens": 26249375.0, "reward": 0.9273681640625, "reward_std": 0.025895869359374046, "rewards/reward_func/mean": 0.9273681640625, "rewards/reward_func/std": 0.025895869359374046, "step": 952, "step_time": 16.659883372485638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 143.4375, "completions/mean_terminated_length": 143.4375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2833913266658783, "epoch": 0.04414080592867068, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024984402116388083, "kl": 0.0016165999695658684, "learning_rate": 9.911811023622046e-07, "loss": 0.0001, "num_tokens": 26270134.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 953, "step_time": 15.93040182814002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 164.375, "completions/mean_terminated_length": 164.375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.1734016165137291, "epoch": 0.04418712366836498, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007538609206676483, "kl": 0.0006580217886948958, "learning_rate": 9.911718388142657e-07, "loss": 0.0, "num_tokens": 26295244.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 954, "step_time": 17.942106883972883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 124.75, "completions/mean_terminated_length": 124.75, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2591349147260189, "epoch": 0.044233441408059286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013043932849541306, "kl": 0.0014181759615894407, "learning_rate": 9.911625752663269e-07, "loss": 0.0001, "num_tokens": 26314840.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 955, "step_time": 14.206559136509895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 161.9375, "completions/mean_terminated_length": 161.9375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3751741051673889, "epoch": 0.04427975914775359, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019639788661152124, "kl": 0.002565555914770812, "learning_rate": 9.911533117183882e-07, "loss": 0.0001, "num_tokens": 26370039.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 956, "step_time": 26.0766384601593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 142.6875, "completions/mean_terminated_length": 142.6875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.28951142728328705, "epoch": 0.04432607688744789, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008503881981596351, "kl": 0.0010200832184636965, "learning_rate": 9.911440481704493e-07, "loss": 0.0001, "num_tokens": 26389778.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 957, "step_time": 14.31048109382391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 209.375, "completions/mean_terminated_length": 209.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.1391860507428646, "epoch": 0.044372394627142195, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003694234765134752, "kl": 0.0004186817241134122, "learning_rate": 9.911347846225102e-07, "loss": 0.0, "num_tokens": 26427240.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 958, "step_time": 23.268010932952166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 168.875, "completions/mean_terminated_length": 168.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.17917417362332344, "epoch": 0.0444187123668365, "frac_reward_zero_std": 1.0, "grad_norm": 0.00046418776037171483, "kl": 0.0007251384085975587, "learning_rate": 9.911255210745716e-07, "loss": 0.0, "num_tokens": 26448486.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 959, "step_time": 17.64775961264968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 373.5625, "completions/mean_terminated_length": 373.5625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.28814607486128807, "epoch": 0.0444650301065308, "frac_reward_zero_std": 0.0, "grad_norm": 0.05286726728081703, "kl": 0.001133385201683268, "learning_rate": 9.911162575266327e-07, "loss": 0.3167, "num_tokens": 26479039.0, "reward": 0.4375, "reward_std": 0.5123475193977356, "rewards/reward_func/mean": 0.4375, "rewards/reward_func/std": 0.5123475790023804, "step": 960, "step_time": 43.64147626236081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 166.3125, "completions/mean_terminated_length": 166.3125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.1643184870481491, "epoch": 0.044511347846225104, "frac_reward_zero_std": 0.0, "grad_norm": 0.10498344153165817, "kl": 0.004419477394549176, "learning_rate": 9.911069939786938e-07, "loss": -0.0452, "num_tokens": 26506388.0, "reward": 0.8449504375457764, "reward_std": 0.21580785512924194, "rewards/reward_func/mean": 0.8449504375457764, "rewards/reward_func/std": 0.21580785512924194, "step": 961, "step_time": 18.347329638898373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 150.0, "completions/mean_terminated_length": 150.0, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.14045149832963943, "epoch": 0.044557665585919407, "frac_reward_zero_std": 0.0, "grad_norm": 0.08984079211950302, "kl": 0.000682404643157497, "learning_rate": 9.91097730430755e-07, "loss": -0.0536, "num_tokens": 26540084.0, "reward": 0.887914776802063, "reward_std": 0.1020917147397995, "rewards/reward_func/mean": 0.887914776802063, "rewards/reward_func/std": 0.1020917147397995, "step": 962, "step_time": 19.268025774508715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 150.9375, "completions/mean_terminated_length": 150.9375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3451591953635216, "epoch": 0.04460398332561371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008214266272261739, "kl": 0.001269234111532569, "learning_rate": 9.91088466882816e-07, "loss": 0.0001, "num_tokens": 26576579.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 963, "step_time": 19.809270162135363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 147.75, "completions/mean_terminated_length": 147.75, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.33301592618227005, "epoch": 0.04465030106530801, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036126698832958937, "kl": 0.0016686512681189924, "learning_rate": 9.910792033348772e-07, "loss": 0.0001, "num_tokens": 26605919.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 964, "step_time": 18.184857320040464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 206.9375, "completions/mean_terminated_length": 206.9375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.165915597230196, "epoch": 0.044696618805002315, "frac_reward_zero_std": 1.0, "grad_norm": 0.00042020107503049076, "kl": 0.0006393089715857059, "learning_rate": 9.910699397869383e-07, "loss": 0.0, "num_tokens": 26643710.0, "reward": 0.9111884832382202, "reward_std": 0.0, "rewards/reward_func/mean": 0.9111884832382202, "rewards/reward_func/std": 0.0, "step": 965, "step_time": 23.48654007539153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 178.1875, "completions/mean_terminated_length": 178.1875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.39551738649606705, "epoch": 0.04474293654469662, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008176004630513489, "kl": 0.0015144433709792793, "learning_rate": 9.910606762389995e-07, "loss": 0.0001, "num_tokens": 26691393.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 966, "step_time": 24.9347990937531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 148.9375, "completions/mean_terminated_length": 148.9375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.35475438833236694, "epoch": 0.04478925428439092, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008755417657084763, "kl": 0.00114651262992993, "learning_rate": 9.910514126910606e-07, "loss": 0.0001, "num_tokens": 26713344.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 967, "step_time": 15.805172581225634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 144.0, "completions/mean_terminated_length": 144.0, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.28755972534418106, "epoch": 0.044835572024085224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008892468176782131, "kl": 0.0012161753402324393, "learning_rate": 9.910421491431217e-07, "loss": 0.0001, "num_tokens": 26746592.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 968, "step_time": 18.469312489032745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 226.25, "completions/mean_terminated_length": 226.25, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.3481305167078972, "epoch": 0.04488188976377953, "frac_reward_zero_std": 0.0, "grad_norm": 0.08232767134904861, "kl": 0.001309857121668756, "learning_rate": 9.91032885595183e-07, "loss": -0.151, "num_tokens": 26784116.0, "reward": 0.0625, "reward_std": 0.25, "rewards/reward_func/mean": 0.0625, "rewards/reward_func/std": 0.25, "step": 969, "step_time": 32.54627714306116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 136.4375, "completions/mean_terminated_length": 136.4375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.22975606471300125, "epoch": 0.04492820750347383, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014065310824662447, "kl": 0.0013225088478066027, "learning_rate": 9.910236220472442e-07, "loss": 0.0001, "num_tokens": 26803723.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 970, "step_time": 15.287191644310951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 202.8125, "completions/mean_terminated_length": 202.8125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.3717750906944275, "epoch": 0.04497452524316813, "frac_reward_zero_std": 0.0, "grad_norm": 0.07482772320508957, "kl": 0.0012675386969931424, "learning_rate": 9.91014358499305e-07, "loss": 0.0792, "num_tokens": 26836296.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 971, "step_time": 24.93474406003952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 153.0625, "completions/mean_terminated_length": 153.0625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4077810049057007, "epoch": 0.045020842982862436, "frac_reward_zero_std": 1.0, "grad_norm": 0.000945600273553282, "kl": 0.0015428498154506087, "learning_rate": 9.910050949513662e-07, "loss": 0.0001, "num_tokens": 26890249.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 972, "step_time": 24.346562299877405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 219.4375, "completions/mean_terminated_length": 219.4375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.35865750908851624, "epoch": 0.04506716072255674, "frac_reward_zero_std": 0.0, "grad_norm": 0.07685404270887375, "kl": 0.0010975960321957245, "learning_rate": 9.909958314034275e-07, "loss": -0.029, "num_tokens": 26918000.0, "reward": 0.3125, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.3125, "rewards/reward_func/std": 0.4787135720252991, "step": 973, "step_time": 22.07788737118244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.16718735173344612, "epoch": 0.04511347846225104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011189704528078437, "kl": 0.0008590102370362729, "learning_rate": 9.909865678554887e-07, "loss": 0.0, "num_tokens": 26946600.0, "reward": 0.6041615009307861, "reward_std": 0.0, "rewards/reward_func/mean": 0.6041615009307861, "rewards/reward_func/std": 0.0, "step": 974, "step_time": 20.742748513817787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 129.0, "completions/mean_terminated_length": 129.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2968504950404167, "epoch": 0.045159796201945344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015855777310207486, "kl": 0.0014514246431645006, "learning_rate": 9.909773043075498e-07, "loss": 0.0001, "num_tokens": 26969752.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 975, "step_time": 14.812765996903181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 188.8125, "completions/mean_terminated_length": 188.8125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.22856667265295982, "epoch": 0.04520611394163965, "frac_reward_zero_std": 1.0, "grad_norm": 0.001736610196530819, "kl": 0.0008091626805253327, "learning_rate": 9.90968040759611e-07, "loss": 0.0, "num_tokens": 26994629.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 976, "step_time": 20.00553661584854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 212.8125, "completions/mean_terminated_length": 212.8125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.2676115930080414, "epoch": 0.04525243168133395, "frac_reward_zero_std": 0.0, "grad_norm": 0.08182648569345474, "kl": 0.0008092873758869246, "learning_rate": 9.90958777211672e-07, "loss": -0.0264, "num_tokens": 27032546.0, "reward": 0.6875, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.6875, "rewards/reward_func/std": 0.4787135720252991, "step": 977, "step_time": 26.288400877267122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.1920037753880024, "epoch": 0.04529874942102825, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012227949919179082, "kl": 0.0010539426148170605, "learning_rate": 9.909495136637332e-07, "loss": 0.0001, "num_tokens": 27052122.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 978, "step_time": 13.783868838101625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 214.5, "completions/mean_terminated_length": 214.5, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.16859596595168114, "epoch": 0.045345067160722556, "frac_reward_zero_std": 1.0, "grad_norm": 0.000766526150982827, "kl": 0.0008085589506663382, "learning_rate": 9.909402501157943e-07, "loss": 0.0, "num_tokens": 27080802.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 979, "step_time": 21.164541829377413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 250.4375, "completions/mean_terminated_length": 250.4375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.36518095433712006, "epoch": 0.04539138490041686, "frac_reward_zero_std": 0.0, "grad_norm": 0.061376895755529404, "kl": 0.0011561915162019432, "learning_rate": 9.909309865678554e-07, "loss": 0.0109, "num_tokens": 27109481.0, "reward": 0.1178591400384903, "reward_std": 0.3220524787902832, "rewards/reward_func/mean": 0.1178591400384903, "rewards/reward_func/std": 0.3220525085926056, "step": 980, "step_time": 27.487699549645185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.27842877805233, "epoch": 0.04543770264011116, "frac_reward_zero_std": 1.0, "grad_norm": 0.001050975057296455, "kl": 0.001031560663250275, "learning_rate": 9.909217230199165e-07, "loss": 0.0001, "num_tokens": 27130609.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 981, "step_time": 13.546759389340878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 137.3125, "completions/mean_terminated_length": 137.3125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2758451849222183, "epoch": 0.045484020379805465, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010525949764996767, "kl": 0.0012523937621153891, "learning_rate": 9.909124594719779e-07, "loss": 0.0001, "num_tokens": 27152678.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 982, "step_time": 15.912566743791103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 188.25, "completions/mean_terminated_length": 188.25, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.4020918607711792, "epoch": 0.04553033811949977, "frac_reward_zero_std": 1.0, "grad_norm": 0.003653065301477909, "kl": 0.0019453027343843132, "learning_rate": 9.909031959240388e-07, "loss": 0.0001, "num_tokens": 27175626.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 983, "step_time": 21.362744972109795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 160.75, "completions/mean_terminated_length": 160.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3147260621190071, "epoch": 0.04557665585919407, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007648139144293964, "kl": 0.0009988356614485383, "learning_rate": 9.908939323761e-07, "loss": 0.0001, "num_tokens": 27198742.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 984, "step_time": 16.57590225711465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 126.375, "completions/mean_terminated_length": 126.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2475137710571289, "epoch": 0.045622973598888374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005296228919178247, "kl": 0.0008576173277106136, "learning_rate": 9.90884668828161e-07, "loss": 0.0, "num_tokens": 27219948.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 985, "step_time": 14.385677341371775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 120.5, "completions/mean_terminated_length": 120.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3114180639386177, "epoch": 0.04566929133858268, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008898326195776463, "kl": 0.00121376384049654, "learning_rate": 9.908754052802224e-07, "loss": 0.0001, "num_tokens": 27243668.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 986, "step_time": 14.067319616675377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 156.1875, "completions/mean_terminated_length": 156.1875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.28713928908109665, "epoch": 0.04571560907827698, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008185781189240515, "kl": 0.001233345188666135, "learning_rate": 9.908661417322835e-07, "loss": 0.0001, "num_tokens": 27263687.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 987, "step_time": 17.29108925536275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 152.3125, "completions/mean_terminated_length": 152.3125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3779827654361725, "epoch": 0.04576192681797128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010614685015752912, "kl": 0.001530019799247384, "learning_rate": 9.908568781843446e-07, "loss": 0.0001, "num_tokens": 27284460.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 988, "step_time": 15.335717637091875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 172.5, "completions/mean_terminated_length": 172.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3747268542647362, "epoch": 0.045808244557665585, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008985947351902723, "kl": 0.0016214477946050465, "learning_rate": 9.908476146364057e-07, "loss": 0.0001, "num_tokens": 27340980.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 989, "step_time": 26.592960093170404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 167.125, "completions/mean_terminated_length": 167.125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.19721996411681175, "epoch": 0.04585456229735989, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005840666126459837, "kl": 0.0008120789570966735, "learning_rate": 9.908383510884669e-07, "loss": 0.0, "num_tokens": 27364886.0, "reward": 0.8611735105514526, "reward_std": 0.0, "rewards/reward_func/mean": 0.8611735105514526, "rewards/reward_func/std": 0.0, "step": 990, "step_time": 18.10546052083373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 183.125, "completions/mean_terminated_length": 183.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.3560015484690666, "epoch": 0.04590088003705419, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007833911222405732, "kl": 0.0010475474409759045, "learning_rate": 9.90829087540528e-07, "loss": 0.0001, "num_tokens": 27396920.0, "reward": 0.2741396427154541, "reward_std": 0.0, "rewards/reward_func/mean": 0.2741396427154541, "rewards/reward_func/std": 0.0, "step": 991, "step_time": 21.2666631154716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 167.375, "completions/mean_terminated_length": 167.375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.31283534318208694, "epoch": 0.045947197776748494, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009348664898425341, "kl": 0.0009910411317832768, "learning_rate": 9.908198239925891e-07, "loss": 0.0001, "num_tokens": 27418654.0, "reward": 1.5776187467508862e-08, "reward_std": 4.2069832062452406e-09, "rewards/reward_func/mean": 1.5776187467508862e-08, "rewards/reward_func/std": 4.2069832062452406e-09, "step": 992, "step_time": 21.00212061777711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 190.0625, "completions/mean_terminated_length": 190.0625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.21668926626443863, "epoch": 0.0459935155164428, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012792462948709726, "kl": 0.0011507587332744151, "learning_rate": 9.908105604446502e-07, "loss": 0.0001, "num_tokens": 27453263.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 993, "step_time": 21.143105305731297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 179.9375, "completions/mean_terminated_length": 179.9375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.21674254164099693, "epoch": 0.0460398332561371, "frac_reward_zero_std": 0.0, "grad_norm": 0.08749733120203018, "kl": 0.0011520619154907763, "learning_rate": 9.908012968967114e-07, "loss": -0.0055, "num_tokens": 27479342.0, "reward": 0.956403374671936, "reward_std": 0.01162576675415039, "rewards/reward_func/mean": 0.956403374671936, "rewards/reward_func/std": 0.011625767685472965, "step": 994, "step_time": 19.21072856336832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 199.9375, "completions/mean_terminated_length": 199.9375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.3687305301427841, "epoch": 0.0460861509958314, "frac_reward_zero_std": 1.0, "grad_norm": 0.000500780763104558, "kl": 0.0010712180228438228, "learning_rate": 9.907920333487725e-07, "loss": 0.0001, "num_tokens": 27503437.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 995, "step_time": 22.20991675555706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 186.1875, "completions/mean_terminated_length": 186.1875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.45539455860853195, "epoch": 0.046132468735525706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007872936548665166, "kl": 0.0014461501850746572, "learning_rate": 9.907827698008336e-07, "loss": 0.0001, "num_tokens": 27524672.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 996, "step_time": 19.799541417509317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 210.9375, "completions/mean_terminated_length": 210.9375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.4072110280394554, "epoch": 0.04617878647522001, "frac_reward_zero_std": 0.0, "grad_norm": 0.06839299947023392, "kl": 0.002012406912399456, "learning_rate": 9.907735062528947e-07, "loss": -0.2488, "num_tokens": 27551599.0, "reward": 0.05903397873044014, "reward_std": 0.2361346185207367, "rewards/reward_func/mean": 0.05903397873044014, "rewards/reward_func/std": 0.2361346185207367, "step": 997, "step_time": 33.95599554479122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 320.3125, "completions/mean_terminated_length": 320.3125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "entropy": 0.2276455983519554, "epoch": 0.04622510421491431, "frac_reward_zero_std": 0.0, "grad_norm": 0.05795934796333313, "kl": 0.0009760240791365504, "learning_rate": 9.907642427049559e-07, "loss": -0.0316, "num_tokens": 27592628.0, "reward": 0.6055164337158203, "reward_std": 0.21161341667175293, "rewards/reward_func/mean": 0.6055164337158203, "rewards/reward_func/std": 0.21161341667175293, "step": 998, "step_time": 33.63652973622084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 173.875, "completions/mean_terminated_length": 173.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.33441710472106934, "epoch": 0.046271421954608614, "frac_reward_zero_std": 0.0, "grad_norm": 0.10504307597875595, "kl": 0.001504827494500205, "learning_rate": 9.907549791570172e-07, "loss": 0.1282, "num_tokens": 27625506.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 999, "step_time": 25.208656802773476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 175.4375, "completions/mean_terminated_length": 175.4375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.188066765666008, "epoch": 0.04631773969430292, "frac_reward_zero_std": 0.0, "grad_norm": 0.06686665862798691, "kl": 0.0010400941246189177, "learning_rate": 9.907457156090783e-07, "loss": 0.0279, "num_tokens": 27650217.0, "reward": 0.9611176252365112, "reward_std": 0.0151781365275383, "rewards/reward_func/mean": 0.9611176252365112, "rewards/reward_func/std": 0.01517812255769968, "step": 1000, "step_time": 18.592824559658766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 128.0, "completions/mean_terminated_length": 128.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.15011244639754295, "epoch": 0.04636405743399722, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007342693861573935, "kl": 0.000607321817369666, "learning_rate": 9.907364520611392e-07, "loss": 0.0, "num_tokens": 27679433.0, "reward": 0.05351965129375458, "reward_std": 0.0, "rewards/reward_func/mean": 0.05351965129375458, "rewards/reward_func/std": 0.0, "step": 1001, "step_time": 15.300740394741297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 143.375, "completions/mean_terminated_length": 143.375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.14754782989621162, "epoch": 0.04641037517369152, "frac_reward_zero_std": 1.0, "grad_norm": 0.000899335544090718, "kl": 0.0007484348898287863, "learning_rate": 9.907271885132004e-07, "loss": 0.0, "num_tokens": 27699823.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1002, "step_time": 15.886766765266657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 189.5, "completions/mean_terminated_length": 189.5, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.26406119018793106, "epoch": 0.046456692913385826, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013387624640017748, "kl": 0.0012153905990999192, "learning_rate": 9.907179249652617e-07, "loss": 0.0001, "num_tokens": 27725191.0, "reward": 0.8919567465782166, "reward_std": 0.0, "rewards/reward_func/mean": 0.8919567465782166, "rewards/reward_func/std": 0.0, "step": 1003, "step_time": 21.744496561586857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 200.4375, "completions/mean_terminated_length": 200.4375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.4516618251800537, "epoch": 0.04650301065308013, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018766354769468307, "kl": 0.0014909605670254678, "learning_rate": 9.907086614173228e-07, "loss": 0.0001, "num_tokens": 27753150.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1004, "step_time": 20.778580099344254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 185.1875, "completions/mean_terminated_length": 185.1875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.1956096552312374, "epoch": 0.04654932839277443, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005770224379375577, "kl": 0.0007550330483354628, "learning_rate": 9.90699397869384e-07, "loss": 0.0, "num_tokens": 27778449.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1005, "step_time": 20.553741309791803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 150.5, "completions/mean_terminated_length": 150.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2911641448736191, "epoch": 0.046595646132468735, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016610927414149046, "kl": 0.001227592452778481, "learning_rate": 9.90690134321445e-07, "loss": 0.0001, "num_tokens": 27801849.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1006, "step_time": 16.10859252884984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 114.75, "completions/mean_terminated_length": 114.75, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3084492161870003, "epoch": 0.04664196387216304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014078120002523065, "kl": 0.0016218001546803862, "learning_rate": 9.906808707735062e-07, "loss": 0.0001, "num_tokens": 27821525.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1007, "step_time": 13.805916965007782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 132.875, "completions/mean_terminated_length": 132.875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.37004653364419937, "epoch": 0.04668828161185734, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010392072144895792, "kl": 0.0016200426907744259, "learning_rate": 9.906716072255673e-07, "loss": 0.0001, "num_tokens": 27844563.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1008, "step_time": 16.00742544233799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 126.5, "completions/mean_terminated_length": 126.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2602906823158264, "epoch": 0.046734599351551644, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007567908032797277, "kl": 0.0009213285811711103, "learning_rate": 9.906623436776285e-07, "loss": 0.0, "num_tokens": 27865195.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1009, "step_time": 13.58338475972414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 261.5, "completions/mean_terminated_length": 261.5, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.26458117365837097, "epoch": 0.04678091709124595, "frac_reward_zero_std": 0.0, "grad_norm": 0.054729245603084564, "kl": 0.0008141873258864507, "learning_rate": 9.906530801296896e-07, "loss": -0.0347, "num_tokens": 27899251.0, "reward": 0.5853970050811768, "reward_std": 0.007496384438127279, "rewards/reward_func/mean": 0.5853970050811768, "rewards/reward_func/std": 0.007496391888707876, "step": 1010, "step_time": 26.771377734839916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 191.5, "completions/mean_terminated_length": 191.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.2727834992110729, "epoch": 0.04682723483094025, "frac_reward_zero_std": 0.0, "grad_norm": 0.07664250582456589, "kl": 0.0009567018860252574, "learning_rate": 9.906438165817507e-07, "loss": -0.0267, "num_tokens": 27931115.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 1011, "step_time": 21.531744547188282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 144.5625, "completions/mean_terminated_length": 144.5625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.24290068447589874, "epoch": 0.04687355257063455, "frac_reward_zero_std": 1.0, "grad_norm": 0.000837481755297631, "kl": 0.0010146870190510526, "learning_rate": 9.90634553033812e-07, "loss": 0.0001, "num_tokens": 27951620.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1012, "step_time": 15.088049869984388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 209.75, "completions/mean_terminated_length": 209.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.3999406322836876, "epoch": 0.046919870310328855, "frac_reward_zero_std": 0.0, "grad_norm": 0.07669055461883545, "kl": 0.001167811220511794, "learning_rate": 9.906252894858732e-07, "loss": -0.2254, "num_tokens": 27991568.0, "reward": 0.04979052022099495, "reward_std": 0.1991620808839798, "rewards/reward_func/mean": 0.04979052022099495, "rewards/reward_func/std": 0.199162095785141, "step": 1013, "step_time": 35.830657087266445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 172.4375, "completions/mean_terminated_length": 172.4375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.36374586820602417, "epoch": 0.04696618805002316, "frac_reward_zero_std": 1.0, "grad_norm": 0.00498656602576375, "kl": 0.0029439253848977387, "learning_rate": 9.90616025937934e-07, "loss": 0.0001, "num_tokens": 28052807.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1014, "step_time": 28.638108514249325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 140.8125, "completions/mean_terminated_length": 140.8125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3514583930373192, "epoch": 0.04701250578971746, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012915709521621466, "kl": 0.0011993466614512727, "learning_rate": 9.906067623899952e-07, "loss": 0.0001, "num_tokens": 28078228.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1015, "step_time": 16.573318760842085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 116.875, "completions/mean_terminated_length": 116.875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3068011477589607, "epoch": 0.047058823529411764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025617980863898993, "kl": 0.0018665404641069472, "learning_rate": 9.905974988420565e-07, "loss": 0.0001, "num_tokens": 28099010.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1016, "step_time": 14.626772541552782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 121.6875, "completions/mean_terminated_length": 121.6875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.27892550826072693, "epoch": 0.04710514126910607, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015984615311026573, "kl": 0.0014860667288303375, "learning_rate": 9.905882352941177e-07, "loss": 0.0001, "num_tokens": 28123037.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1017, "step_time": 15.03841832652688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 129.25, "completions/mean_terminated_length": 129.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2556873820722103, "epoch": 0.04715145900880037, "frac_reward_zero_std": 1.0, "grad_norm": 0.001144444802775979, "kl": 0.0011821803927887231, "learning_rate": 9.905789717461788e-07, "loss": 0.0001, "num_tokens": 28142769.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1018, "step_time": 13.272651929408312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 179.1875, "completions/mean_terminated_length": 179.1875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.32374632358551025, "epoch": 0.04719777674849467, "frac_reward_zero_std": 0.0, "grad_norm": 0.07296735793352127, "kl": 0.0012415140372468159, "learning_rate": 9.9056970819824e-07, "loss": 0.0116, "num_tokens": 28171908.0, "reward": 0.858630895614624, "reward_std": 0.23279428482055664, "rewards/reward_func/mean": 0.858630895614624, "rewards/reward_func/std": 0.23279428482055664, "step": 1019, "step_time": 22.157922506332397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 184.0625, "completions/mean_terminated_length": 184.0625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.3561431095004082, "epoch": 0.047244094488188976, "frac_reward_zero_std": 0.0, "grad_norm": 0.14152806997299194, "kl": 0.0031695798970758915, "learning_rate": 9.90560444650301e-07, "loss": -0.0217, "num_tokens": 28217685.0, "reward": 0.3959382176399231, "reward_std": 0.46706902980804443, "rewards/reward_func/mean": 0.3959382176399231, "rewards/reward_func/std": 0.4670690596103668, "step": 1020, "step_time": 25.170372180640697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 176.0, "completions/mean_terminated_length": 176.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.37993478775024414, "epoch": 0.04729041222788328, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018244581297039986, "kl": 0.0018740384257398546, "learning_rate": 9.905511811023622e-07, "loss": 0.0001, "num_tokens": 28241669.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1021, "step_time": 20.334804717451334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 131.25, "completions/mean_terminated_length": 131.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2585686296224594, "epoch": 0.04733672996757758, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021120314486324787, "kl": 0.0018503092287573963, "learning_rate": 9.905419175544233e-07, "loss": 0.0001, "num_tokens": 28264361.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1022, "step_time": 14.707961484789848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 167.6875, "completions/mean_terminated_length": 167.6875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.27028487622737885, "epoch": 0.047383047707271884, "frac_reward_zero_std": 0.0, "grad_norm": 0.09869460761547089, "kl": 0.0010344291804358363, "learning_rate": 9.905326540064844e-07, "loss": 0.013, "num_tokens": 28286164.0, "reward": 0.9014118909835815, "reward_std": 0.02629014663398266, "rewards/reward_func/mean": 0.9014118909835815, "rewards/reward_func/std": 0.026290163397789, "step": 1023, "step_time": 17.112298902124166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 193.9375, "completions/mean_terminated_length": 193.9375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.20764760300517082, "epoch": 0.04742936544696619, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011982873547822237, "kl": 0.0012532433029264212, "learning_rate": 9.905233904585455e-07, "loss": 0.0001, "num_tokens": 28340467.0, "reward": 0.11362193524837494, "reward_std": 0.0, "rewards/reward_func/mean": 0.11362193524837494, "rewards/reward_func/std": 0.0, "step": 1024, "step_time": 27.57717900723219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 133.625, "completions/mean_terminated_length": 133.625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.30567528307437897, "epoch": 0.04747568318666049, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009536018478684127, "kl": 0.0010554982582107186, "learning_rate": 9.905141269106067e-07, "loss": 0.0001, "num_tokens": 28363021.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1025, "step_time": 14.877538722008467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 166.1875, "completions/mean_terminated_length": 166.1875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.23596811294555664, "epoch": 0.04752200092635479, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008968862821348011, "kl": 0.0010560166410868987, "learning_rate": 9.905048633626678e-07, "loss": 0.0001, "num_tokens": 28384080.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 1026, "step_time": 17.550704695284367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 194.3125, "completions/mean_terminated_length": 194.3125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.18419504538178444, "epoch": 0.047568318666049096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006631119758822024, "kl": 0.0007572467438876629, "learning_rate": 9.90495599814729e-07, "loss": 0.0, "num_tokens": 28412421.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1027, "step_time": 20.707427095621824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 230.375, "completions/mean_terminated_length": 230.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.3973093628883362, "epoch": 0.0476146364057434, "frac_reward_zero_std": 0.0, "grad_norm": 0.07777100056409836, "kl": 0.0015067631611600518, "learning_rate": 9.9048633626679e-07, "loss": 0.0644, "num_tokens": 28438235.0, "reward": 0.04411235451698303, "reward_std": 0.026303526014089584, "rewards/reward_func/mean": 0.04411235451698303, "rewards/reward_func/std": 0.026303526014089584, "step": 1028, "step_time": 25.296633563935757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 129.9375, "completions/mean_terminated_length": 129.9375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3399771451950073, "epoch": 0.0476609541454377, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014560186536982656, "kl": 0.0016204967396333814, "learning_rate": 9.904770727188514e-07, "loss": 0.0001, "num_tokens": 28458394.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1029, "step_time": 14.084137599915266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 184.8125, "completions/mean_terminated_length": 184.8125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.2464139610528946, "epoch": 0.047707271885132005, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008261942421086133, "kl": 0.0009879043354885653, "learning_rate": 9.904678091709125e-07, "loss": 0.0, "num_tokens": 28479399.0, "reward": 0.6376281380653381, "reward_std": 0.0, "rewards/reward_func/mean": 0.6376281380653381, "rewards/reward_func/std": 0.0, "step": 1030, "step_time": 18.75266282632947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 176.6875, "completions/mean_terminated_length": 176.6875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.3958509489893913, "epoch": 0.04775358962482631, "frac_reward_zero_std": 0.0, "grad_norm": 0.09384248405694962, "kl": 0.0012421664432622492, "learning_rate": 9.904585456229736e-07, "loss": -0.0399, "num_tokens": 28505778.0, "reward": 0.011110535822808743, "reward_std": 0.04444214329123497, "rewards/reward_func/mean": 0.011110535822808743, "rewards/reward_func/std": 0.04444214701652527, "step": 1031, "step_time": 21.056372981518507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 125.0625, "completions/mean_terminated_length": 125.0625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.29521749913692474, "epoch": 0.04779990736452061, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011238530278205872, "kl": 0.0012431179638952017, "learning_rate": 9.904492820750345e-07, "loss": 0.0001, "num_tokens": 28528259.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1032, "step_time": 14.588222738355398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 114.5625, "completions/mean_terminated_length": 114.5625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.28873495757579803, "epoch": 0.047846225104214914, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013598831137642264, "kl": 0.001216363481944427, "learning_rate": 9.904400185270959e-07, "loss": 0.0001, "num_tokens": 28553868.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1033, "step_time": 13.451530616730452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 201.0, "completions/mean_terminated_length": 201.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.30545124411582947, "epoch": 0.04789254284390922, "frac_reward_zero_std": 0.0, "grad_norm": 0.08109713345766068, "kl": 0.0015163577045314014, "learning_rate": 9.90430754979157e-07, "loss": -0.0362, "num_tokens": 28577468.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 1034, "step_time": 21.60637979581952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 288.5625, "completions/mean_terminated_length": 288.5625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.23269639909267426, "epoch": 0.04793886058360352, "frac_reward_zero_std": 0.0, "grad_norm": 0.076987624168396, "kl": 0.0011784299713326618, "learning_rate": 9.904214914312181e-07, "loss": -0.1046, "num_tokens": 28617637.0, "reward": 0.6312336921691895, "reward_std": 0.4019744098186493, "rewards/reward_func/mean": 0.6312336921691895, "rewards/reward_func/std": 0.4019744396209717, "step": 1035, "step_time": 33.45261598005891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 142.1875, "completions/mean_terminated_length": 142.1875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.20818743482232094, "epoch": 0.04798517832329782, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011204793117940426, "kl": 0.0012418660335242748, "learning_rate": 9.904122278832793e-07, "loss": 0.0001, "num_tokens": 28638248.0, "reward": 0.780767560005188, "reward_std": 0.0, "rewards/reward_func/mean": 0.780767560005188, "rewards/reward_func/std": 0.0, "step": 1036, "step_time": 14.784468349069357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 127.0625, "completions/mean_terminated_length": 127.0625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3053184598684311, "epoch": 0.048031496062992125, "frac_reward_zero_std": 1.0, "grad_norm": 0.002906875452026725, "kl": 0.0019339821301400661, "learning_rate": 9.904029643353404e-07, "loss": 0.0001, "num_tokens": 28668953.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1037, "step_time": 16.426546167582273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 130.6875, "completions/mean_terminated_length": 130.6875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.34770015627145767, "epoch": 0.04807781380268643, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013411898398771882, "kl": 0.0017382468504365534, "learning_rate": 9.903937007874015e-07, "loss": 0.0001, "num_tokens": 28689332.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1038, "step_time": 14.812857542186975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 113.8125, "completions/mean_terminated_length": 113.8125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2688029557466507, "epoch": 0.04812413154238073, "frac_reward_zero_std": 1.0, "grad_norm": 0.001557044917717576, "kl": 0.001222449413035065, "learning_rate": 9.903844372394626e-07, "loss": 0.0001, "num_tokens": 28712033.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1039, "step_time": 14.10106448084116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 167.8125, "completions/mean_terminated_length": 167.8125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4481164366006851, "epoch": 0.048170449282075034, "frac_reward_zero_std": 1.0, "grad_norm": 0.001968338154256344, "kl": 0.0017540619592182338, "learning_rate": 9.903751736915238e-07, "loss": 0.0001, "num_tokens": 28734542.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1040, "step_time": 17.662689447402954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 141.4375, "completions/mean_terminated_length": 141.4375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2544493079185486, "epoch": 0.04821676702176934, "frac_reward_zero_std": 1.0, "grad_norm": 0.005187811329960823, "kl": 0.0017500853282399476, "learning_rate": 9.903659101435849e-07, "loss": 0.0001, "num_tokens": 28758149.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1041, "step_time": 15.444901376962662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 211.25, "completions/mean_terminated_length": 211.25, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.24108736217021942, "epoch": 0.04826308476146364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018843008438125253, "kl": 0.0013402788899838924, "learning_rate": 9.90356646595646e-07, "loss": 0.0001, "num_tokens": 28786537.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1042, "step_time": 22.423270910978317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 111.375, "completions/mean_terminated_length": 111.375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3065830022096634, "epoch": 0.04830940250115794, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014831394655629992, "kl": 0.0015106158389244229, "learning_rate": 9.903473830477073e-07, "loss": 0.0001, "num_tokens": 28806847.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1043, "step_time": 13.123977195471525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 129.625, "completions/mean_terminated_length": 129.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.29124607890844345, "epoch": 0.048355720240852246, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007524856482632458, "kl": 0.0010806052305269986, "learning_rate": 9.903381194997685e-07, "loss": 0.0001, "num_tokens": 28829545.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1044, "step_time": 14.699601989239454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 121.5, "completions/mean_terminated_length": 121.5, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2185620404779911, "epoch": 0.04840203798054655, "frac_reward_zero_std": 1.0, "grad_norm": 0.001890899264253676, "kl": 0.0011860131926368922, "learning_rate": 9.903288559518294e-07, "loss": 0.0001, "num_tokens": 28849185.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1045, "step_time": 13.41104994341731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 209.6875, "completions/mean_terminated_length": 209.6875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3234374299645424, "epoch": 0.04844835572024085, "frac_reward_zero_std": 0.0, "grad_norm": 0.0852591022849083, "kl": 0.001257849100511521, "learning_rate": 9.903195924038907e-07, "loss": -0.1827, "num_tokens": 28888236.0, "reward": 0.25, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.44721361994743347, "step": 1046, "step_time": 28.974656738340855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 178.75, "completions/mean_terminated_length": 178.75, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.377143032848835, "epoch": 0.048494673459935154, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013827767688781023, "kl": 0.0014107885072007775, "learning_rate": 9.903103288559518e-07, "loss": 0.0001, "num_tokens": 28913528.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1047, "step_time": 20.787018537521362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 188.1875, "completions/mean_terminated_length": 188.1875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.3820813000202179, "epoch": 0.04854099119962946, "frac_reward_zero_std": 0.0, "grad_norm": 0.09522230923175812, "kl": 0.0011572672810871154, "learning_rate": 9.90301065308013e-07, "loss": 0.0279, "num_tokens": 28942011.0, "reward": 0.7721847891807556, "reward_std": 0.3014300763607025, "rewards/reward_func/mean": 0.7721847891807556, "rewards/reward_func/std": 0.3014300763607025, "step": 1048, "step_time": 24.79846828058362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 156.1875, "completions/mean_terminated_length": 156.1875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2822834402322769, "epoch": 0.04858730893932376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010766360210254788, "kl": 0.0011587667395360768, "learning_rate": 9.90291801760074e-07, "loss": 0.0001, "num_tokens": 28974830.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1049, "step_time": 19.522984847426414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 121.3125, "completions/mean_terminated_length": 121.3125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3349555507302284, "epoch": 0.04863362667901806, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018369510071352124, "kl": 0.0017125543090514839, "learning_rate": 9.902825382121352e-07, "loss": 0.0001, "num_tokens": 28995667.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1050, "step_time": 12.765341181308031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 132.75, "completions/mean_terminated_length": 132.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2551494538784027, "epoch": 0.048679944418712366, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015950956149026752, "kl": 0.0009169582335744053, "learning_rate": 9.902732746641963e-07, "loss": 0.0, "num_tokens": 29024271.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1051, "step_time": 15.948854491114616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 228.0625, "completions/mean_terminated_length": 228.0625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.40965036302804947, "epoch": 0.04872626215840667, "frac_reward_zero_std": 0.0, "grad_norm": 0.06601638346910477, "kl": 0.0010745679610408843, "learning_rate": 9.902640111162575e-07, "loss": -0.0452, "num_tokens": 29062816.0, "reward": 0.22259801626205444, "reward_std": 0.1388283222913742, "rewards/reward_func/mean": 0.22259801626205444, "rewards/reward_func/std": 0.1388283371925354, "step": 1052, "step_time": 26.456897154450417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.17067742720246315, "epoch": 0.04877257989810097, "frac_reward_zero_std": 0.0, "grad_norm": 0.08437284082174301, "kl": 0.001013905493891798, "learning_rate": 9.902547475683186e-07, "loss": -0.0323, "num_tokens": 29083904.0, "reward": 0.1774260252714157, "reward_std": 0.00568058155477047, "rewards/reward_func/mean": 0.1774260252714157, "rewards/reward_func/std": 0.005680582020431757, "step": 1053, "step_time": 16.477448847144842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 146.125, "completions/mean_terminated_length": 146.125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3493001386523247, "epoch": 0.048818897637795275, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017707743681967258, "kl": 0.002003938687266782, "learning_rate": 9.902454840203797e-07, "loss": 0.0001, "num_tokens": 29106786.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1054, "step_time": 16.477712485939264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 371.75, "completions/mean_terminated_length": 371.75, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "entropy": 0.16637060418725014, "epoch": 0.04886521537748958, "frac_reward_zero_std": 0.0, "grad_norm": 0.03795616701245308, "kl": 0.0005525052256416529, "learning_rate": 9.902362204724408e-07, "loss": -0.0019, "num_tokens": 29135694.0, "reward": 0.9831075072288513, "reward_std": 0.017591135576367378, "rewards/reward_func/mean": 0.9831075072288513, "rewards/reward_func/std": 0.017591137439012527, "step": 1055, "step_time": 33.35343899577856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.31635092198848724, "epoch": 0.04891153311718388, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012206325773149729, "kl": 0.001716562604997307, "learning_rate": 9.902269569245022e-07, "loss": 0.0001, "num_tokens": 29163638.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1056, "step_time": 15.829910147935152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 147.3125, "completions/mean_terminated_length": 147.3125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3439546301960945, "epoch": 0.048957850856878184, "frac_reward_zero_std": 0.0, "grad_norm": 0.14148610830307007, "kl": 0.0021906031761318445, "learning_rate": 9.90217693376563e-07, "loss": 0.0021, "num_tokens": 29188555.0, "reward": 0.04598493129014969, "reward_std": 0.12565475702285767, "rewards/reward_func/mean": 0.04598493129014969, "rewards/reward_func/std": 0.12565475702285767, "step": 1057, "step_time": 16.40659347549081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.3805572837591171, "epoch": 0.04900416859657249, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009291348978877068, "kl": 0.0014806788240093738, "learning_rate": 9.902084298286242e-07, "loss": 0.0001, "num_tokens": 29211077.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1058, "step_time": 17.243373408913612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 167.9375, "completions/mean_terminated_length": 167.9375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.3104632496833801, "epoch": 0.04905048633626679, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008697710582055151, "kl": 0.0010451382404426113, "learning_rate": 9.901991662806855e-07, "loss": 0.0001, "num_tokens": 29233732.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1059, "step_time": 17.982558369636536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 135.0, "completions/mean_terminated_length": 135.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.341328501701355, "epoch": 0.04909680407596109, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008639628649689257, "kl": 0.001301385651458986, "learning_rate": 9.901899027327467e-07, "loss": 0.0001, "num_tokens": 29269716.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1060, "step_time": 18.138447511941195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 135.25, "completions/mean_terminated_length": 135.25, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2975976839661598, "epoch": 0.049143121815655395, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008323938236571848, "kl": 0.001154248500824906, "learning_rate": 9.901806391848078e-07, "loss": 0.0001, "num_tokens": 29293592.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1061, "step_time": 16.84081495180726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 191.0625, "completions/mean_terminated_length": 191.0625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.21305131912231445, "epoch": 0.0491894395553497, "frac_reward_zero_std": 0.0, "grad_norm": 0.07748810946941376, "kl": 0.0010197410738328472, "learning_rate": 9.90171375636869e-07, "loss": 0.0379, "num_tokens": 29318441.0, "reward": 0.9566360712051392, "reward_std": 0.039497580379247665, "rewards/reward_func/mean": 0.9566360712051392, "rewards/reward_func/std": 0.03949758782982826, "step": 1062, "step_time": 19.56187452748418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.1875189207494259, "epoch": 0.049235757295044, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009874739916995168, "kl": 0.0007780540472595021, "learning_rate": 9.9016211208893e-07, "loss": 0.0, "num_tokens": 29341595.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1063, "step_time": 19.514124918729067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 110.4375, "completions/mean_terminated_length": 110.4375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.28170251101255417, "epoch": 0.049282075034738304, "frac_reward_zero_std": 1.0, "grad_norm": 0.002179292729124427, "kl": 0.0021450460189953446, "learning_rate": 9.901528485409912e-07, "loss": 0.0001, "num_tokens": 29361986.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1064, "step_time": 13.229779623448849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 193.75, "completions/mean_terminated_length": 193.75, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.25674911588430405, "epoch": 0.04932839277443261, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008665647474117577, "kl": 0.0011798219056800008, "learning_rate": 9.901435849930523e-07, "loss": 0.0001, "num_tokens": 29394046.0, "reward": 0.5623413324356079, "reward_std": 0.0, "rewards/reward_func/mean": 0.5623413324356079, "rewards/reward_func/std": 0.0, "step": 1065, "step_time": 21.710856899619102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 150.6875, "completions/mean_terminated_length": 150.6875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2926267385482788, "epoch": 0.04937471051412691, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008478787494823337, "kl": 0.001008578998153098, "learning_rate": 9.901343214451134e-07, "loss": 0.0001, "num_tokens": 29414441.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1066, "step_time": 16.394194394350052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 149.5625, "completions/mean_terminated_length": 149.5625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3093181997537613, "epoch": 0.04942102825382121, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009809837210923433, "kl": 0.0011437874200055376, "learning_rate": 9.901250578971745e-07, "loss": 0.0001, "num_tokens": 29449090.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1067, "step_time": 20.499243050813675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 132.4375, "completions/mean_terminated_length": 132.4375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.2840966731309891, "epoch": 0.049467345993515516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014046216383576393, "kl": 0.00144971240661107, "learning_rate": 9.901157943492357e-07, "loss": 0.0001, "num_tokens": 29481625.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1068, "step_time": 16.62565889954567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 191.625, "completions/mean_terminated_length": 191.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.41206422448158264, "epoch": 0.04951366373320982, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012185772648081183, "kl": 0.0015064417966641486, "learning_rate": 9.901065308012968e-07, "loss": 0.0001, "num_tokens": 29510867.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1069, "step_time": 21.32372997328639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 132.5, "completions/mean_terminated_length": 132.5, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2584761790931225, "epoch": 0.04955998147290412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008137408876791596, "kl": 0.0009865954925771803, "learning_rate": 9.90097267253358e-07, "loss": 0.0, "num_tokens": 29533003.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1070, "step_time": 14.583113599568605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 143.375, "completions/mean_terminated_length": 143.375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3181304261088371, "epoch": 0.049606299212598425, "frac_reward_zero_std": 1.0, "grad_norm": 0.000891090021468699, "kl": 0.00127486334531568, "learning_rate": 9.90088003705419e-07, "loss": 0.0001, "num_tokens": 29555345.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1071, "step_time": 15.791466876864433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 184.0625, "completions/mean_terminated_length": 184.0625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.2723410539329052, "epoch": 0.04965261695229273, "frac_reward_zero_std": 0.0, "grad_norm": 0.08994110673666, "kl": 0.0010682634310796857, "learning_rate": 9.900787401574802e-07, "loss": -0.092, "num_tokens": 29581058.0, "reward": 0.8752042055130005, "reward_std": 0.33016297221183777, "rewards/reward_func/mean": 0.8752042055130005, "rewards/reward_func/std": 0.33016300201416016, "step": 1072, "step_time": 21.679789248853922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 141.3125, "completions/mean_terminated_length": 141.3125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3139200508594513, "epoch": 0.04969893469198703, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014281471958383918, "kl": 0.0011057298397645354, "learning_rate": 9.900694766095415e-07, "loss": 0.0001, "num_tokens": 29608423.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1073, "step_time": 16.696265920996666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 323.75, "completions/mean_terminated_length": 323.75, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "entropy": 0.2060050554573536, "epoch": 0.04974525243168133, "frac_reward_zero_std": 0.0, "grad_norm": 0.058326784521341324, "kl": 0.0007712448859820142, "learning_rate": 9.900602130616026e-07, "loss": -0.032, "num_tokens": 29636355.0, "reward": 0.8223556280136108, "reward_std": 0.014419873245060444, "rewards/reward_func/mean": 0.8223556280136108, "rewards/reward_func/std": 0.014419869519770145, "step": 1074, "step_time": 29.50220875069499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 169.875, "completions/mean_terminated_length": 169.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3809701278805733, "epoch": 0.049791570171375636, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016130228759720922, "kl": 0.001457754144212231, "learning_rate": 9.900509495136635e-07, "loss": 0.0001, "num_tokens": 29658593.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1075, "step_time": 19.350709948688745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 149.75, "completions/mean_terminated_length": 149.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.33714165538549423, "epoch": 0.04983788791106994, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016034268774092197, "kl": 0.0013555600889958441, "learning_rate": 9.900416859657249e-07, "loss": 0.0001, "num_tokens": 29683661.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1076, "step_time": 16.41102172806859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 156.375, "completions/mean_terminated_length": 156.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.31672170758247375, "epoch": 0.04988420565076424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008252878906205297, "kl": 0.0012895289110019803, "learning_rate": 9.90032422417786e-07, "loss": 0.0001, "num_tokens": 29705955.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1077, "step_time": 16.75003569200635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 247.3125, "completions/mean_terminated_length": 247.3125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.2997872605919838, "epoch": 0.049930523390458545, "frac_reward_zero_std": 0.0, "grad_norm": 0.07149261981248856, "kl": 0.0010818528244271874, "learning_rate": 9.900231588698471e-07, "loss": 0.024, "num_tokens": 29744312.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 1078, "step_time": 28.95836164802313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 133.5625, "completions/mean_terminated_length": 133.5625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.2362428493797779, "epoch": 0.04997684113015285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010859938338398933, "kl": 0.0012356432271189988, "learning_rate": 9.900138953219083e-07, "loss": 0.0001, "num_tokens": 29763953.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1079, "step_time": 13.911317389458418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 172.375, "completions/mean_terminated_length": 172.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3578645661473274, "epoch": 0.05002315886984715, "frac_reward_zero_std": 0.0, "grad_norm": 0.09514566510915756, "kl": 0.0016648909368086606, "learning_rate": 9.900046317739694e-07, "loss": 0.1251, "num_tokens": 29784487.0, "reward": 0.6875, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.6875, "rewards/reward_func/std": 0.4787135720252991, "step": 1080, "step_time": 19.114783979952335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 161.8125, "completions/mean_terminated_length": 161.8125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3330864980816841, "epoch": 0.050069476609541454, "frac_reward_zero_std": 0.0, "grad_norm": 0.09150286763906479, "kl": 0.0010565592092461884, "learning_rate": 9.899953682260305e-07, "loss": -0.0914, "num_tokens": 29804804.0, "reward": 0.0625, "reward_std": 0.25, "rewards/reward_func/mean": 0.0625, "rewards/reward_func/std": 0.25, "step": 1081, "step_time": 20.501433834433556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 249.6875, "completions/mean_terminated_length": 249.6875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.37190791219472885, "epoch": 0.05011579434923576, "frac_reward_zero_std": 0.0, "grad_norm": 0.07680151611566544, "kl": 0.0011202323366887867, "learning_rate": 9.899861046780916e-07, "loss": -0.1132, "num_tokens": 29828783.0, "reward": 0.5919939875602722, "reward_std": 0.4850125014781952, "rewards/reward_func/mean": 0.5919939875602722, "rewards/reward_func/std": 0.4850125312805176, "step": 1082, "step_time": 24.543287433683872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 200.9375, "completions/mean_terminated_length": 200.9375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.19050583988428116, "epoch": 0.05016211208893006, "frac_reward_zero_std": 0.0, "grad_norm": 0.0795620009303093, "kl": 0.0007946142286527902, "learning_rate": 9.899768411301528e-07, "loss": 0.058, "num_tokens": 29852190.0, "reward": 0.70708829164505, "reward_std": 0.010605335235595703, "rewards/reward_func/mean": 0.70708829164505, "rewards/reward_func/std": 0.010605335235595703, "step": 1083, "step_time": 22.166945844888687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 155.4375, "completions/mean_terminated_length": 155.4375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.19904791936278343, "epoch": 0.05020842982862436, "frac_reward_zero_std": 0.0, "grad_norm": 0.10177665203809738, "kl": 0.0013613034825539216, "learning_rate": 9.899675775822139e-07, "loss": 0.0314, "num_tokens": 29874261.0, "reward": 0.8440724611282349, "reward_std": 0.05945207178592682, "rewards/reward_func/mean": 0.8440724611282349, "rewards/reward_func/std": 0.05945207178592682, "step": 1084, "step_time": 16.072620674967766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 243.1875, "completions/mean_terminated_length": 243.1875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.2188308946788311, "epoch": 0.050254747568318665, "frac_reward_zero_std": 0.0, "grad_norm": 0.08357269316911697, "kl": 0.0009986345248762518, "learning_rate": 9.89958314034275e-07, "loss": -0.0593, "num_tokens": 29898312.0, "reward": 0.5324225425720215, "reward_std": 0.12115535885095596, "rewards/reward_func/mean": 0.5324225425720215, "rewards/reward_func/std": 0.12115536630153656, "step": 1085, "step_time": 25.0636737793684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 171.8125, "completions/mean_terminated_length": 171.8125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.21330950036644936, "epoch": 0.05030106530801297, "frac_reward_zero_std": 0.0, "grad_norm": 0.08567479997873306, "kl": 0.0011195143160875887, "learning_rate": 9.899490504863363e-07, "loss": 0.0049, "num_tokens": 29926453.0, "reward": 0.9535844326019287, "reward_std": 0.023028584197163582, "rewards/reward_func/mean": 0.9535844326019287, "rewards/reward_func/std": 0.023028582334518433, "step": 1086, "step_time": 19.37694550305605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 124.875, "completions/mean_terminated_length": 124.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.33746711909770966, "epoch": 0.05034738304770727, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013701335992664099, "kl": 0.001653151965001598, "learning_rate": 9.899397869383975e-07, "loss": 0.0001, "num_tokens": 29946179.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1087, "step_time": 14.556724030524492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.2678183689713478, "epoch": 0.050393700787401574, "frac_reward_zero_std": 0.0, "grad_norm": 0.07312487810850143, "kl": 0.0011790625285357237, "learning_rate": 9.899305233904584e-07, "loss": -0.0268, "num_tokens": 29968917.0, "reward": 0.9347172975540161, "reward_std": 0.025483757257461548, "rewards/reward_func/mean": 0.9347172975540161, "rewards/reward_func/std": 0.025483759120106697, "step": 1088, "step_time": 18.529357075691223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 189.9375, "completions/mean_terminated_length": 189.9375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.2221939153969288, "epoch": 0.05044001852709588, "frac_reward_zero_std": 0.0, "grad_norm": 0.09114961326122284, "kl": 0.0011734712170436978, "learning_rate": 9.899212598425197e-07, "loss": -0.0636, "num_tokens": 29991316.0, "reward": 0.6862379312515259, "reward_std": 0.4361012578010559, "rewards/reward_func/mean": 0.6862379312515259, "rewards/reward_func/std": 0.4361012578010559, "step": 1089, "step_time": 21.20943710207939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 163.4375, "completions/mean_terminated_length": 163.4375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.38223615288734436, "epoch": 0.05048633626679018, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023525089491158724, "kl": 0.002194980625063181, "learning_rate": 9.899119962945808e-07, "loss": 0.0001, "num_tokens": 30044427.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1090, "step_time": 23.972115632146597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 146.375, "completions/mean_terminated_length": 146.375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2931435704231262, "epoch": 0.05053265400648448, "frac_reward_zero_std": 1.0, "grad_norm": 0.00117598962970078, "kl": 0.0012452219962142408, "learning_rate": 9.89902732746642e-07, "loss": 0.0001, "num_tokens": 30068017.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1091, "step_time": 16.37996008247137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.8125, "completions/mean_terminated_length": 123.8125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.27399730682373047, "epoch": 0.050578971746178786, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009294325718656182, "kl": 0.0011050984903704375, "learning_rate": 9.89893469198703e-07, "loss": 0.0001, "num_tokens": 30089918.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1092, "step_time": 13.417639058083296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 200.3125, "completions/mean_terminated_length": 200.3125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.2586069777607918, "epoch": 0.05062528948587309, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011547214817255735, "kl": 0.0011535890225786716, "learning_rate": 9.898842056507642e-07, "loss": 0.0001, "num_tokens": 30116259.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1093, "step_time": 21.74313473328948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 172.25, "completions/mean_terminated_length": 172.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.38282404094934464, "epoch": 0.05067160722556739, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014855681220069528, "kl": 0.0015927240310702473, "learning_rate": 9.898749421028253e-07, "loss": 0.0001, "num_tokens": 30141159.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1094, "step_time": 18.784075524657965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 124.9375, "completions/mean_terminated_length": 124.9375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2513014040887356, "epoch": 0.050717924965261695, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014614450046792626, "kl": 0.0014210605586413294, "learning_rate": 9.898656785548865e-07, "loss": 0.0001, "num_tokens": 30160550.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1095, "step_time": 14.311907079070807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 247.6875, "completions/mean_terminated_length": 247.6875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.31304802745580673, "epoch": 0.050764242704956, "frac_reward_zero_std": 0.0, "grad_norm": 0.07821127772331238, "kl": 0.0014542262069880962, "learning_rate": 9.898564150069476e-07, "loss": -0.0658, "num_tokens": 30198913.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.44721361994743347, "step": 1096, "step_time": 27.757250882685184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 173.625, "completions/mean_terminated_length": 173.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.33365585654973984, "epoch": 0.0508105604446503, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007472095894627273, "kl": 0.0012242514931131154, "learning_rate": 9.898471514590087e-07, "loss": 0.0001, "num_tokens": 30230907.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1097, "step_time": 20.173478361219168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 161.9375, "completions/mean_terminated_length": 161.9375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.40793028473854065, "epoch": 0.0508568781843446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021767800208181143, "kl": 0.001700811495538801, "learning_rate": 9.898378879110698e-07, "loss": 0.0001, "num_tokens": 30258490.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1098, "step_time": 20.168668530881405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 178.875, "completions/mean_terminated_length": 178.875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.25799454003572464, "epoch": 0.050903195924038906, "frac_reward_zero_std": 0.0, "grad_norm": 0.09027945250272751, "kl": 0.0012264355609659106, "learning_rate": 9.898286243631312e-07, "loss": -0.0359, "num_tokens": 30283272.0, "reward": 0.8811430931091309, "reward_std": 0.05903739482164383, "rewards/reward_func/mean": 0.8811430931091309, "rewards/reward_func/std": 0.05903739109635353, "step": 1099, "step_time": 18.734569620341063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 132.9375, "completions/mean_terminated_length": 132.9375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3116462081670761, "epoch": 0.05094951366373321, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014807460829615593, "kl": 0.001511095353635028, "learning_rate": 9.89819360815192e-07, "loss": 0.0001, "num_tokens": 30306567.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1100, "step_time": 14.597326949238777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 182.3125, "completions/mean_terminated_length": 182.3125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.2798741087317467, "epoch": 0.05099583140342751, "frac_reward_zero_std": 0.0, "grad_norm": 0.07083644717931747, "kl": 0.0011947660677833483, "learning_rate": 9.898100972672532e-07, "loss": -0.0113, "num_tokens": 30328124.0, "reward": 0.8233011960983276, "reward_std": 0.21954698860645294, "rewards/reward_func/mean": 0.8233011960983276, "rewards/reward_func/std": 0.21954698860645294, "step": 1101, "step_time": 20.294959507882595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 208.1875, "completions/mean_terminated_length": 208.1875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.2013857662677765, "epoch": 0.051042149143121815, "frac_reward_zero_std": 0.0, "grad_norm": 0.0598393976688385, "kl": 0.0010075128375319764, "learning_rate": 9.898008337193143e-07, "loss": 0.0009, "num_tokens": 30354879.0, "reward": 0.8500427007675171, "reward_std": 0.05378911271691322, "rewards/reward_func/mean": 0.8500427007675171, "rewards/reward_func/std": 0.053789105266332626, "step": 1102, "step_time": 21.27879510819912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 245.625, "completions/mean_terminated_length": 245.625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.4276796504855156, "epoch": 0.05108846688281612, "frac_reward_zero_std": 0.0, "grad_norm": 0.07888320088386536, "kl": 0.0011481109831947833, "learning_rate": 9.897915701713757e-07, "loss": 0.089, "num_tokens": 30379417.0, "reward": 0.375, "reward_std": 0.5, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5, "step": 1103, "step_time": 28.08647546172142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 203.625, "completions/mean_terminated_length": 203.625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.3153912052512169, "epoch": 0.05113478462251042, "frac_reward_zero_std": 0.0, "grad_norm": 0.1048893928527832, "kl": 0.0014552755455952138, "learning_rate": 9.897823066234368e-07, "loss": 0.0111, "num_tokens": 30408147.0, "reward": 0.3920607566833496, "reward_std": 0.29425516724586487, "rewards/reward_func/mean": 0.3920607566833496, "rewards/reward_func/std": 0.29425519704818726, "step": 1104, "step_time": 24.419979251921177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 229.375, "completions/mean_terminated_length": 229.375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.3451824113726616, "epoch": 0.051181102362204724, "frac_reward_zero_std": 0.0, "grad_norm": 0.0886145755648613, "kl": 0.001503424602560699, "learning_rate": 9.89773043075498e-07, "loss": 0.0201, "num_tokens": 30446601.0, "reward": 0.9942312240600586, "reward_std": 0.012402480468153954, "rewards/reward_func/mean": 0.9942312240600586, "rewards/reward_func/std": 0.01240248791873455, "step": 1105, "step_time": 25.77444277703762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 164.875, "completions/mean_terminated_length": 164.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.24910329654812813, "epoch": 0.05122742010189903, "frac_reward_zero_std": 1.0, "grad_norm": 0.001084607094526291, "kl": 0.0010699391859816387, "learning_rate": 9.89763779527559e-07, "loss": 0.0001, "num_tokens": 30468519.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1106, "step_time": 18.155639626085758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 120.6875, "completions/mean_terminated_length": 120.6875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.25526338815689087, "epoch": 0.05127373784159333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016850433312356472, "kl": 0.001595150592038408, "learning_rate": 9.897545159796202e-07, "loss": 0.0001, "num_tokens": 30488162.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1107, "step_time": 12.975851442664862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 192.8125, "completions/mean_terminated_length": 192.8125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.3439238592982292, "epoch": 0.05132005558128763, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012190868146717548, "kl": 0.0012358661915641278, "learning_rate": 9.897452524316813e-07, "loss": 0.0001, "num_tokens": 30520335.0, "reward": 0.8751733303070068, "reward_std": 0.0, "rewards/reward_func/mean": 0.8751733303070068, "rewards/reward_func/std": 0.0, "step": 1108, "step_time": 23.448437191545963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 173.75, "completions/mean_terminated_length": 173.75, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.22079867124557495, "epoch": 0.051366373320981935, "frac_reward_zero_std": 0.0, "grad_norm": 0.06497100740671158, "kl": 0.0009250911243725568, "learning_rate": 9.897359888837424e-07, "loss": -0.0125, "num_tokens": 30541467.0, "reward": 0.8230994939804077, "reward_std": 0.09352880716323853, "rewards/reward_func/mean": 0.8230994939804077, "rewards/reward_func/std": 0.09352879971265793, "step": 1109, "step_time": 18.13630971312523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 119.125, "completions/mean_terminated_length": 119.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2911180630326271, "epoch": 0.05141269106067624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010315573308616877, "kl": 0.0011704214848577976, "learning_rate": 9.897267253358036e-07, "loss": 0.0001, "num_tokens": 30561229.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1110, "step_time": 12.498386699706316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 210.875, "completions/mean_terminated_length": 210.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.19843004271388054, "epoch": 0.05145900880037054, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004728315689135343, "kl": 0.0006877919950056821, "learning_rate": 9.897174617878647e-07, "loss": 0.0, "num_tokens": 30593211.0, "reward": 0.9574533700942993, "reward_std": 0.0, "rewards/reward_func/mean": 0.9574533700942993, "rewards/reward_func/std": 0.0, "step": 1111, "step_time": 23.56514134258032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 159.4375, "completions/mean_terminated_length": 159.4375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.38042762130498886, "epoch": 0.051505326540064844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010632815537974238, "kl": 0.0014865275588817894, "learning_rate": 9.897081982399258e-07, "loss": 0.0001, "num_tokens": 30627218.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1112, "step_time": 19.564482927322388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 145.1875, "completions/mean_terminated_length": 145.1875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.1334095038473606, "epoch": 0.05155164427975915, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005674534477293491, "kl": 0.0005581184232141823, "learning_rate": 9.89698934691987e-07, "loss": 0.0, "num_tokens": 30657221.0, "reward": 0.8574039340019226, "reward_std": 0.0, "rewards/reward_func/mean": 0.8574039340019226, "rewards/reward_func/std": 0.0, "step": 1113, "step_time": 17.10247927904129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 179.0, "completions/mean_terminated_length": 179.0, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.35668937116861343, "epoch": 0.05159796201945345, "frac_reward_zero_std": 0.0, "grad_norm": 0.13557200133800507, "kl": 0.0018954368715640157, "learning_rate": 9.89689671144048e-07, "loss": 0.0713, "num_tokens": 30685237.0, "reward": 0.862541675567627, "reward_std": 0.23001109063625336, "rewards/reward_func/mean": 0.862541675567627, "rewards/reward_func/std": 0.23001112043857574, "step": 1114, "step_time": 20.952950689941645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 167.625, "completions/mean_terminated_length": 167.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.19039323180913925, "epoch": 0.05164427975914775, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006320044631138444, "kl": 0.0007345816120505333, "learning_rate": 9.896804075961092e-07, "loss": 0.0, "num_tokens": 30706863.0, "reward": 0.9131007194519043, "reward_std": 0.0, "rewards/reward_func/mean": 0.9131007194519043, "rewards/reward_func/std": 0.0, "step": 1115, "step_time": 17.534554477781057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 126.9375, "completions/mean_terminated_length": 126.9375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3573169931769371, "epoch": 0.051690597498842056, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010215246584266424, "kl": 0.001329431717749685, "learning_rate": 9.896711440481705e-07, "loss": 0.0001, "num_tokens": 30726974.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1116, "step_time": 14.324847247451544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 153.875, "completions/mean_terminated_length": 153.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4384104534983635, "epoch": 0.05173691523853636, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011730300029739738, "kl": 0.001636853179661557, "learning_rate": 9.896618805002316e-07, "loss": 0.0001, "num_tokens": 30757676.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1117, "step_time": 18.594801753759384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 165.875, "completions/mean_terminated_length": 165.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.17398259416222572, "epoch": 0.05178323297823066, "frac_reward_zero_std": 1.0, "grad_norm": 0.00389515096321702, "kl": 0.0020509135792963207, "learning_rate": 9.896526169522926e-07, "loss": 0.0001, "num_tokens": 30803322.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 1118, "step_time": 23.118459444493055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 125.75, "completions/mean_terminated_length": 125.75, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2766589969396591, "epoch": 0.051829550717924965, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018308153375983238, "kl": 0.0017926291620824486, "learning_rate": 9.896433534043539e-07, "loss": 0.0001, "num_tokens": 30832822.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1119, "step_time": 14.803225938230753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 125.0, "completions/mean_terminated_length": 125.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.28757575154304504, "epoch": 0.05187586845761927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011737167369574308, "kl": 0.0012856294924858958, "learning_rate": 9.89634089856415e-07, "loss": 0.0001, "num_tokens": 30853286.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1120, "step_time": 13.634919803589582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.23788436874747276, "epoch": 0.05192218619731357, "frac_reward_zero_std": 0.0, "grad_norm": 0.07899540662765503, "kl": 0.0010609175369609147, "learning_rate": 9.896248263084761e-07, "loss": -0.0196, "num_tokens": 30887120.0, "reward": 0.7089771628379822, "reward_std": 0.30815425515174866, "rewards/reward_func/mean": 0.7089771628379822, "rewards/reward_func/std": 0.30815425515174866, "step": 1121, "step_time": 23.72310246527195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 133.875, "completions/mean_terminated_length": 133.875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.23498577252030373, "epoch": 0.05196850393700787, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009531736723147333, "kl": 0.0010055435268441215, "learning_rate": 9.896155627605373e-07, "loss": 0.0001, "num_tokens": 30906974.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1122, "step_time": 14.71127225831151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.31246964633464813, "epoch": 0.052014821676702176, "frac_reward_zero_std": 0.0, "grad_norm": 0.07301823794841766, "kl": 0.0012709646834991872, "learning_rate": 9.896062992125984e-07, "loss": -0.0116, "num_tokens": 30942368.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 1123, "step_time": 27.333131555467844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 233.0, "completions/mean_terminated_length": 233.0, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.29382146522402763, "epoch": 0.05206113941639648, "frac_reward_zero_std": 0.0, "grad_norm": 0.07238540798425674, "kl": 0.0013140554656274617, "learning_rate": 9.895970356646595e-07, "loss": -0.0442, "num_tokens": 30973616.0, "reward": 0.18292942643165588, "reward_std": 0.07961179316043854, "rewards/reward_func/mean": 0.18292942643165588, "rewards/reward_func/std": 0.07961180061101913, "step": 1124, "step_time": 25.977344941347837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 131.3125, "completions/mean_terminated_length": 131.3125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2118467167019844, "epoch": 0.05210745715609078, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007855461444705725, "kl": 0.0009936098067555577, "learning_rate": 9.895877721167206e-07, "loss": 0.0, "num_tokens": 30993173.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1125, "step_time": 13.984401155263186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.43601953983306885, "epoch": 0.052153774895785085, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009769224561750889, "kl": 0.001453546603443101, "learning_rate": 9.895785085687818e-07, "loss": 0.0001, "num_tokens": 31016733.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1126, "step_time": 19.12609263509512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 138.375, "completions/mean_terminated_length": 138.375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.31714512407779694, "epoch": 0.05220009263547939, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007875919691286981, "kl": 0.0011351430439390242, "learning_rate": 9.895692450208429e-07, "loss": 0.0001, "num_tokens": 31039059.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1127, "step_time": 14.93588675931096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 134.875, "completions/mean_terminated_length": 134.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2508166506886482, "epoch": 0.05224641037517369, "frac_reward_zero_std": 1.0, "grad_norm": 0.006805529352277517, "kl": 0.0027232636348344386, "learning_rate": 9.89559981472904e-07, "loss": 0.0001, "num_tokens": 31058641.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1128, "step_time": 14.223127357661724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 190.5625, "completions/mean_terminated_length": 190.5625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.3678119406104088, "epoch": 0.052292728114867994, "frac_reward_zero_std": 0.0, "grad_norm": 0.17108699679374695, "kl": 0.007267465902259573, "learning_rate": 9.895507179249653e-07, "loss": 0.0606, "num_tokens": 31082202.0, "reward": 0.40776169300079346, "reward_std": 0.4784238338470459, "rewards/reward_func/mean": 0.40776169300079346, "rewards/reward_func/std": 0.4784238338470459, "step": 1129, "step_time": 21.02282042056322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 128.1875, "completions/mean_terminated_length": 128.1875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2689135894179344, "epoch": 0.0523390458545623, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010215247748419642, "kl": 0.0011080264812335372, "learning_rate": 9.895414543770265e-07, "loss": 0.0001, "num_tokens": 31102413.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1130, "step_time": 13.169312495738268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 198.9375, "completions/mean_terminated_length": 198.9375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.37807053327560425, "epoch": 0.0523853635942566, "frac_reward_zero_std": 0.0, "grad_norm": 0.14330777525901794, "kl": 0.0016479375190101564, "learning_rate": 9.895321908290874e-07, "loss": 0.0268, "num_tokens": 31131404.0, "reward": 0.28276169300079346, "reward_std": 0.43315792083740234, "rewards/reward_func/mean": 0.28276169300079346, "rewards/reward_func/std": 0.43315792083740234, "step": 1131, "step_time": 22.18549221381545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 195.5, "completions/mean_terminated_length": 195.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.2906291112303734, "epoch": 0.0524316813339509, "frac_reward_zero_std": 0.0, "grad_norm": 0.09652494639158249, "kl": 0.0015081569727044553, "learning_rate": 9.895229272811485e-07, "loss": -0.0877, "num_tokens": 31153652.0, "reward": 0.5340820550918579, "reward_std": 0.3908340036869049, "rewards/reward_func/mean": 0.5340820550918579, "rewards/reward_func/std": 0.3908340334892273, "step": 1132, "step_time": 20.599905110895634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 199.1875, "completions/mean_terminated_length": 199.1875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.3890107646584511, "epoch": 0.052477999073645205, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014516408555209637, "kl": 0.0018202054779976606, "learning_rate": 9.895136637332098e-07, "loss": 0.0001, "num_tokens": 31183367.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1133, "step_time": 24.62832786515355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 170.5625, "completions/mean_terminated_length": 170.5625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.199891347438097, "epoch": 0.05252431681333951, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007594860508106649, "kl": 0.0009866192995104939, "learning_rate": 9.89504400185271e-07, "loss": 0.0, "num_tokens": 31220416.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1134, "step_time": 20.979295525699854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 223.875, "completions/mean_terminated_length": 223.875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.15802664309740067, "epoch": 0.05257063455303381, "frac_reward_zero_std": 0.0, "grad_norm": 0.05633833259344101, "kl": 0.0006402080762200058, "learning_rate": 9.89495136637332e-07, "loss": 0.3036, "num_tokens": 31245774.0, "reward": 0.8136550188064575, "reward_std": 0.333344042301178, "rewards/reward_func/mean": 0.8136550188064575, "rewards/reward_func/std": 0.333344042301178, "step": 1135, "step_time": 29.257002096623182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 161.6875, "completions/mean_terminated_length": 161.6875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.202426265925169, "epoch": 0.052616952292728114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014544824371114373, "kl": 0.001244504179339856, "learning_rate": 9.894858730893932e-07, "loss": 0.0001, "num_tokens": 31270857.0, "reward": 0.48153844475746155, "reward_std": 0.0, "rewards/reward_func/mean": 0.48153844475746155, "rewards/reward_func/std": 0.0, "step": 1136, "step_time": 17.424078673124313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 190.875, "completions/mean_terminated_length": 190.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.23381546139717102, "epoch": 0.05266327003242242, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005960245616734028, "kl": 0.0009035322000272572, "learning_rate": 9.894766095414543e-07, "loss": 0.0, "num_tokens": 31306791.0, "reward": 0.8657099008560181, "reward_std": 0.0, "rewards/reward_func/mean": 0.8657099008560181, "rewards/reward_func/std": 0.0, "step": 1137, "step_time": 22.284610524773598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 172.5625, "completions/mean_terminated_length": 172.5625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.2746679037809372, "epoch": 0.05270958777211672, "frac_reward_zero_std": 1.0, "grad_norm": 0.000632460811175406, "kl": 0.0009402802097611129, "learning_rate": 9.894673459935155e-07, "loss": 0.0, "num_tokens": 31332064.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1138, "step_time": 17.917193945497274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 197.625, "completions/mean_terminated_length": 197.625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.31648997217416763, "epoch": 0.05275590551181102, "frac_reward_zero_std": 0.0, "grad_norm": 0.0961669459939003, "kl": 0.0015563028864562511, "learning_rate": 9.894580824455766e-07, "loss": 0.0091, "num_tokens": 31361258.0, "reward": 0.5844430327415466, "reward_std": 0.19984734058380127, "rewards/reward_func/mean": 0.5844430327415466, "rewards/reward_func/std": 0.19984734058380127, "step": 1139, "step_time": 22.793893687427044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 151.0, "completions/mean_terminated_length": 151.0, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.41440724581480026, "epoch": 0.052802223251505326, "frac_reward_zero_std": 1.0, "grad_norm": 0.001131863216869533, "kl": 0.0013655726215802133, "learning_rate": 9.894488188976377e-07, "loss": 0.0001, "num_tokens": 31405834.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1140, "step_time": 21.52340066432953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 188.6875, "completions/mean_terminated_length": 188.6875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.4362773597240448, "epoch": 0.05284854099119963, "frac_reward_zero_std": 0.0, "grad_norm": 0.0015412763459607959, "kl": 0.002005956834182143, "learning_rate": 9.894395553496988e-07, "loss": 0.0001, "num_tokens": 31433925.0, "reward": 3.123703251617371e-08, "reward_std": 1.2494813006469485e-07, "rewards/reward_func/mean": 3.123703251617371e-08, "rewards/reward_func/std": 1.2494813006469485e-07, "step": 1141, "step_time": 21.189816020429134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 206.0625, "completions/mean_terminated_length": 206.0625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.33258073776960373, "epoch": 0.05289485873089393, "frac_reward_zero_std": 0.0, "grad_norm": 0.061463385820388794, "kl": 0.0018322475953027606, "learning_rate": 9.8943029180176e-07, "loss": -0.0428, "num_tokens": 31466278.0, "reward": 0.9463515877723694, "reward_std": 0.09895078837871552, "rewards/reward_func/mean": 0.9463515877723694, "rewards/reward_func/std": 0.09895079582929611, "step": 1142, "step_time": 22.81053288653493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 191.0625, "completions/mean_terminated_length": 191.0625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.24916226789355278, "epoch": 0.052941176470588235, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006731266039423645, "kl": 0.0008510870393365622, "learning_rate": 9.89421028253821e-07, "loss": 0.0, "num_tokens": 31487991.0, "reward": 0.6411803960800171, "reward_std": 0.0, "rewards/reward_func/mean": 0.6411803960800171, "rewards/reward_func/std": 0.0, "step": 1143, "step_time": 19.348943434655666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 185.6875, "completions/mean_terminated_length": 185.6875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.21044782549142838, "epoch": 0.05298749421028254, "frac_reward_zero_std": 0.0, "grad_norm": 0.08757368475198746, "kl": 0.0008568099146941677, "learning_rate": 9.894117647058822e-07, "loss": -0.0052, "num_tokens": 31511490.0, "reward": 0.9388407468795776, "reward_std": 0.019452007487416267, "rewards/reward_func/mean": 0.9388407468795776, "rewards/reward_func/std": 0.019452018663287163, "step": 1144, "step_time": 20.02642299607396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.22547230869531631, "epoch": 0.05303381194997684, "frac_reward_zero_std": 0.0, "grad_norm": 0.08268888294696808, "kl": 0.0016655647195875645, "learning_rate": 9.894025011579433e-07, "loss": 0.0064, "num_tokens": 31533286.0, "reward": 0.877037763595581, "reward_std": 0.23387674987316132, "rewards/reward_func/mean": 0.877037763595581, "rewards/reward_func/std": 0.2338767647743225, "step": 1145, "step_time": 17.61340966448188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 163.6875, "completions/mean_terminated_length": 163.6875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.40419651567935944, "epoch": 0.05308012968967114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009964038617908955, "kl": 0.00159594661090523, "learning_rate": 9.893932376100047e-07, "loss": 0.0001, "num_tokens": 31554401.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1146, "step_time": 17.55034012719989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 157.0, "completions/mean_terminated_length": 157.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3126239702105522, "epoch": 0.053126447429365446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022115272004157305, "kl": 0.0015431630599778146, "learning_rate": 9.893839740620658e-07, "loss": 0.0001, "num_tokens": 31591521.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1147, "step_time": 19.716587338596582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 168.1875, "completions/mean_terminated_length": 168.1875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.1232264544814825, "epoch": 0.05317276516905975, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009384130826219916, "kl": 0.0005930302359047346, "learning_rate": 9.89374710514127e-07, "loss": 0.0, "num_tokens": 31614532.0, "reward": 0.910879909992218, "reward_std": 0.0, "rewards/reward_func/mean": 0.910879909992218, "rewards/reward_func/std": 0.0, "step": 1148, "step_time": 16.905188091099262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 294.625, "completions/mean_terminated_length": 294.625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "entropy": 0.15040616318583488, "epoch": 0.05321908290875405, "frac_reward_zero_std": 0.0, "grad_norm": 0.07871180027723312, "kl": 0.0007263211882673204, "learning_rate": 9.89365446966188e-07, "loss": 0.0078, "num_tokens": 31642126.0, "reward": 0.7928786277770996, "reward_std": 0.11949107050895691, "rewards/reward_func/mean": 0.7928786277770996, "rewards/reward_func/std": 0.11949107050895691, "step": 1149, "step_time": 27.762278582900763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 200.3125, "completions/mean_terminated_length": 200.3125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.29706262052059174, "epoch": 0.053265400648448355, "frac_reward_zero_std": 0.0, "grad_norm": 0.10142429172992706, "kl": 0.0018482427985873073, "learning_rate": 9.893561834182492e-07, "loss": 0.0923, "num_tokens": 31665299.0, "reward": 0.18208497762680054, "reward_std": 0.12976574897766113, "rewards/reward_func/mean": 0.18208497762680054, "rewards/reward_func/std": 0.12976574897766113, "step": 1150, "step_time": 29.01048344746232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 211.625, "completions/mean_terminated_length": 211.625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.37995871901512146, "epoch": 0.05331171838814266, "frac_reward_zero_std": 1.0, "grad_norm": 0.00652031134814024, "kl": 0.0019164951227139682, "learning_rate": 9.893469198703103e-07, "loss": 0.0001, "num_tokens": 31697565.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1151, "step_time": 26.846133541315794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 185.5625, "completions/mean_terminated_length": 185.5625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.38745927810668945, "epoch": 0.05335803612783696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008911601034924388, "kl": 0.0012029419594909996, "learning_rate": 9.893376563223714e-07, "loss": 0.0001, "num_tokens": 31722310.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1152, "step_time": 21.08556577935815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 203.0, "completions/mean_terminated_length": 203.0, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.4134936258196831, "epoch": 0.053404353867531264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010713719530031085, "kl": 0.00138545356458053, "learning_rate": 9.893283927744326e-07, "loss": 0.0001, "num_tokens": 31749766.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1153, "step_time": 22.26845372840762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 160.75, "completions/mean_terminated_length": 160.75, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.14946546405553818, "epoch": 0.05345067160722557, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008816413464955986, "kl": 0.0009540772443870082, "learning_rate": 9.893191292264937e-07, "loss": 0.0, "num_tokens": 31778002.0, "reward": 0.9534969329833984, "reward_std": 0.0, "rewards/reward_func/mean": 0.9534969329833984, "rewards/reward_func/std": 0.0, "step": 1154, "step_time": 17.70001668483019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 161.25, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4476577043533325, "epoch": 0.05349698934691987, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009187766700051725, "kl": 0.0016943172086030245, "learning_rate": 9.893098656785548e-07, "loss": 0.0001, "num_tokens": 31820182.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1155, "step_time": 23.56076095253229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 151.6875, "completions/mean_terminated_length": 151.6875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3805708736181259, "epoch": 0.05354330708661417, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012020737631246448, "kl": 0.0014396813930943608, "learning_rate": 9.89300602130616e-07, "loss": 0.0001, "num_tokens": 31841009.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1156, "step_time": 16.54355525225401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 183.5625, "completions/mean_terminated_length": 183.5625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.37823300063610077, "epoch": 0.053589624826308475, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013947762781754136, "kl": 0.001362399838399142, "learning_rate": 9.89291338582677e-07, "loss": 0.0001, "num_tokens": 31869370.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1157, "step_time": 23.252543453127146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 127.0, "completions/mean_terminated_length": 127.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3380546420812607, "epoch": 0.05363594256600278, "frac_reward_zero_std": 1.0, "grad_norm": 0.002495003864169121, "kl": 0.0019408497610129416, "learning_rate": 9.892820750347382e-07, "loss": 0.0001, "num_tokens": 31901066.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1158, "step_time": 17.884928941726685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 128.5625, "completions/mean_terminated_length": 128.5625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2847992070019245, "epoch": 0.05368226030569708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015233656158670783, "kl": 0.0013830008392687887, "learning_rate": 9.892728114867995e-07, "loss": 0.0001, "num_tokens": 31920691.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1159, "step_time": 15.752903632819653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 199.125, "completions/mean_terminated_length": 199.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.23879999667406082, "epoch": 0.053728578045391384, "frac_reward_zero_std": 0.0, "grad_norm": 0.12035561352968216, "kl": 0.0010685274610295892, "learning_rate": 9.892635479388606e-07, "loss": -0.0452, "num_tokens": 31953173.0, "reward": 0.5953280925750732, "reward_std": 0.23239226639270782, "rewards/reward_func/mean": 0.5953280925750732, "rewards/reward_func/std": 0.23239228129386902, "step": 1160, "step_time": 22.353445518761873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 165.875, "completions/mean_terminated_length": 165.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.1503465175628662, "epoch": 0.05377489578508569, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005322801880538464, "kl": 0.0007290776702575386, "learning_rate": 9.892542843909216e-07, "loss": 0.0, "num_tokens": 31990947.0, "reward": 0.9534969329833984, "reward_std": 0.0, "rewards/reward_func/mean": 0.9534969329833984, "rewards/reward_func/std": 0.0, "step": 1161, "step_time": 21.50071908161044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 192.0625, "completions/mean_terminated_length": 192.0625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.4274817705154419, "epoch": 0.05382121352477999, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010776615235954523, "kl": 0.0014149823691695929, "learning_rate": 9.892450208429827e-07, "loss": 0.0001, "num_tokens": 32022468.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1162, "step_time": 22.83984173834324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 144.875, "completions/mean_terminated_length": 144.875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.13758426159620285, "epoch": 0.05386753126447429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005651933024637401, "kl": 0.0006001776200719178, "learning_rate": 9.89235757295044e-07, "loss": 0.0, "num_tokens": 32043794.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1163, "step_time": 17.197531394660473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 158.625, "completions/mean_terminated_length": 158.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3581901490688324, "epoch": 0.053913849004168596, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011381652439013124, "kl": 0.0012314816704019904, "learning_rate": 9.892264937471051e-07, "loss": 0.0001, "num_tokens": 32072748.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1164, "step_time": 18.88974129408598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 143.4375, "completions/mean_terminated_length": 143.4375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2144453153014183, "epoch": 0.0539601667438629, "frac_reward_zero_std": 0.0, "grad_norm": 0.14213523268699646, "kl": 0.0011701754992827773, "learning_rate": 9.892172301991663e-07, "loss": -0.0588, "num_tokens": 32092979.0, "reward": 0.9465265274047852, "reward_std": 0.0712980255484581, "rewards/reward_func/mean": 0.9465265274047852, "rewards/reward_func/std": 0.0712980329990387, "step": 1165, "step_time": 15.757043085992336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 177.8125, "completions/mean_terminated_length": 177.8125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.3314700797200203, "epoch": 0.0540064844835572, "frac_reward_zero_std": 0.0, "grad_norm": 0.13920459151268005, "kl": 0.001862278499174863, "learning_rate": 9.892079666512274e-07, "loss": -0.0647, "num_tokens": 32114688.0, "reward": 0.375, "reward_std": 0.5, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5, "step": 1166, "step_time": 19.252146907150745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 116.375, "completions/mean_terminated_length": 116.375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2919832020998001, "epoch": 0.054052802223251505, "frac_reward_zero_std": 1.0, "grad_norm": 0.001161219784989953, "kl": 0.0014462788676610216, "learning_rate": 9.891987031032885e-07, "loss": 0.0001, "num_tokens": 32134950.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1167, "step_time": 12.909465335309505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 111.625, "completions/mean_terminated_length": 111.625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2536385953426361, "epoch": 0.05409911996294581, "frac_reward_zero_std": 1.0, "grad_norm": 0.01523127406835556, "kl": 0.0036446138692554086, "learning_rate": 9.891894395553496e-07, "loss": 0.0002, "num_tokens": 32154224.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1168, "step_time": 12.699834518134594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 141.75, "completions/mean_terminated_length": 141.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3179233968257904, "epoch": 0.05414543770264011, "frac_reward_zero_std": 1.0, "grad_norm": 0.001962218666449189, "kl": 0.001700406864983961, "learning_rate": 9.891801760074108e-07, "loss": 0.0001, "num_tokens": 32177292.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1169, "step_time": 15.205468282103539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 122.9375, "completions/mean_terminated_length": 122.9375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.26094047352671623, "epoch": 0.05419175544233441, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013599522644653916, "kl": 0.0013964295794721693, "learning_rate": 9.891709124594719e-07, "loss": 0.0001, "num_tokens": 32197835.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1170, "step_time": 13.122005488723516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 129.0, "completions/mean_terminated_length": 129.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.36509843170642853, "epoch": 0.054238073182028716, "frac_reward_zero_std": 1.0, "grad_norm": 0.004078730009496212, "kl": 0.002764565113466233, "learning_rate": 9.89161648911533e-07, "loss": 0.0001, "num_tokens": 32234411.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1171, "step_time": 18.717920504510403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 141.9375, "completions/mean_terminated_length": 141.9375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.35093528777360916, "epoch": 0.05428439092172302, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012465318432077765, "kl": 0.001481231243815273, "learning_rate": 9.891523853635941e-07, "loss": 0.0001, "num_tokens": 32259306.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1172, "step_time": 15.76487848162651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 219.1875, "completions/mean_terminated_length": 219.1875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.19795706868171692, "epoch": 0.05433070866141732, "frac_reward_zero_std": 0.0, "grad_norm": 0.07324954867362976, "kl": 0.0010112493037013337, "learning_rate": 9.891431218156555e-07, "loss": -0.0311, "num_tokens": 32297421.0, "reward": 0.8825951814651489, "reward_std": 0.10946193337440491, "rewards/reward_func/mean": 0.8825951814651489, "rewards/reward_func/std": 0.10946192592382431, "step": 1173, "step_time": 24.80542005226016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 189.6875, "completions/mean_terminated_length": 189.6875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.419601745903492, "epoch": 0.054377026401111625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008468477753922343, "kl": 0.0015795796643942595, "learning_rate": 9.891338582677164e-07, "loss": 0.0001, "num_tokens": 32334888.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1174, "step_time": 23.496656615287066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 155.1875, "completions/mean_terminated_length": 155.1875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.20617348700761795, "epoch": 0.05442334414080593, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014567967737093568, "kl": 0.001166216330602765, "learning_rate": 9.891245947197775e-07, "loss": 0.0001, "num_tokens": 32361995.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1175, "step_time": 17.300379287451506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 238.625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.4495939090847969, "epoch": 0.05446966188050023, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009323477279394865, "kl": 0.0013735440734308213, "learning_rate": 9.891153311718389e-07, "loss": 0.0001, "num_tokens": 32389189.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1176, "step_time": 27.233849480748177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 138.875, "completions/mean_terminated_length": 138.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3000299036502838, "epoch": 0.054515979620194534, "frac_reward_zero_std": 1.0, "grad_norm": 0.003468952374532819, "kl": 0.002486981393303722, "learning_rate": 9.891060676239e-07, "loss": 0.0001, "num_tokens": 32411219.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1177, "step_time": 16.326974477618933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 195.875, "completions/mean_terminated_length": 195.875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.42353833466768265, "epoch": 0.05456229735988884, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015964413760229945, "kl": 0.0015052240050863475, "learning_rate": 9.89096804075961e-07, "loss": 0.0001, "num_tokens": 32435569.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1178, "step_time": 20.23856322094798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 195.375, "completions/mean_terminated_length": 195.375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.31634391844272614, "epoch": 0.05460861509958314, "frac_reward_zero_std": 0.0, "grad_norm": 0.11076902598142624, "kl": 0.0015534661069978029, "learning_rate": 9.890875405280222e-07, "loss": -0.0165, "num_tokens": 32471479.0, "reward": 0.9502800703048706, "reward_std": 0.07616504281759262, "rewards/reward_func/mean": 0.9502800703048706, "rewards/reward_func/std": 0.07616503536701202, "step": 1179, "step_time": 22.191253505647182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 155.125, "completions/mean_terminated_length": 155.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.43323180079460144, "epoch": 0.05465493283927744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010547670535743237, "kl": 0.0017853601893875748, "learning_rate": 9.890782769800834e-07, "loss": 0.0001, "num_tokens": 32501641.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1180, "step_time": 18.018289901316166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 175.0625, "completions/mean_terminated_length": 175.0625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3106352239847183, "epoch": 0.054701250578971745, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018181770574301481, "kl": 0.0016223404090851545, "learning_rate": 9.890690134321445e-07, "loss": 0.0001, "num_tokens": 32527018.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1181, "step_time": 18.92172461748123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 168.5, "completions/mean_terminated_length": 168.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.39867446571588516, "epoch": 0.05474756831866605, "frac_reward_zero_std": 1.0, "grad_norm": 0.001653126673772931, "kl": 0.001793490257114172, "learning_rate": 9.890597498842056e-07, "loss": 0.0001, "num_tokens": 32573906.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1182, "step_time": 23.76753769442439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.0, "completions/mean_terminated_length": 121.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2648680992424488, "epoch": 0.05479388605836035, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011571758659556508, "kl": 0.0013324348838068545, "learning_rate": 9.890504863362667e-07, "loss": 0.0001, "num_tokens": 32593970.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1183, "step_time": 12.634309638291597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 202.5625, "completions/mean_terminated_length": 202.5625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.2193916030228138, "epoch": 0.054840203798054654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024302343372255564, "kl": 0.001229857763973996, "learning_rate": 9.890412227883279e-07, "loss": 0.0001, "num_tokens": 32629547.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1184, "step_time": 23.20802417770028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 169.4375, "completions/mean_terminated_length": 169.4375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.34451137483119965, "epoch": 0.05488652153774896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021194086875766516, "kl": 0.0012443136802176014, "learning_rate": 9.89031959240389e-07, "loss": 0.0001, "num_tokens": 32650642.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1185, "step_time": 19.13880906626582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 139.625, "completions/mean_terminated_length": 139.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2950673848390579, "epoch": 0.05493283927744326, "frac_reward_zero_std": 1.0, "grad_norm": 0.001744111068546772, "kl": 0.0014879111549817026, "learning_rate": 9.8902269569245e-07, "loss": 0.0001, "num_tokens": 32670204.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1186, "step_time": 16.21994575858116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 127.6875, "completions/mean_terminated_length": 127.6875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3094567731022835, "epoch": 0.05497915701713756, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009803370339795947, "kl": 0.0011915433715330437, "learning_rate": 9.890134321445112e-07, "loss": 0.0001, "num_tokens": 32692039.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1187, "step_time": 13.96587160229683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.1543944925069809, "epoch": 0.055025474756831866, "frac_reward_zero_std": 1.0, "grad_norm": 0.006705935113132, "kl": 0.0017742169729899615, "learning_rate": 9.890041685965724e-07, "loss": 0.0001, "num_tokens": 32727337.0, "reward": 0.9343348145484924, "reward_std": 0.0, "rewards/reward_func/mean": 0.9343348145484924, "rewards/reward_func/std": 0.0, "step": 1188, "step_time": 21.417829602956772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 127.0625, "completions/mean_terminated_length": 127.0625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2924730107188225, "epoch": 0.05507179249652617, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020096004009246826, "kl": 0.001471274415962398, "learning_rate": 9.889949050486337e-07, "loss": 0.0001, "num_tokens": 32749546.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1189, "step_time": 14.448669698089361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 136.375, "completions/mean_terminated_length": 136.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.31685884296894073, "epoch": 0.05511811023622047, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013721493305638433, "kl": 0.00156321088434197, "learning_rate": 9.889856415006948e-07, "loss": 0.0001, "num_tokens": 32779216.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1190, "step_time": 16.542190439999104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 144.6875, "completions/mean_terminated_length": 144.6875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.20903745666146278, "epoch": 0.055164427975914775, "frac_reward_zero_std": 1.0, "grad_norm": 0.001146889291703701, "kl": 0.0010980116057908162, "learning_rate": 9.88976377952756e-07, "loss": 0.0001, "num_tokens": 32803179.0, "reward": 0.904837429523468, "reward_std": 0.0, "rewards/reward_func/mean": 0.904837429523468, "rewards/reward_func/std": 0.0, "step": 1191, "step_time": 15.837680958211422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 136.1875, "completions/mean_terminated_length": 136.1875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.28850478678941727, "epoch": 0.05521074571560908, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017243772745132446, "kl": 0.0012638849730137736, "learning_rate": 9.889671144048169e-07, "loss": 0.0001, "num_tokens": 32822974.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1192, "step_time": 14.529932048171759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 145.6875, "completions/mean_terminated_length": 145.6875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3776225745677948, "epoch": 0.05525706345530338, "frac_reward_zero_std": 1.0, "grad_norm": 0.002197980647906661, "kl": 0.002267822274006903, "learning_rate": 9.889578508568782e-07, "loss": 0.0001, "num_tokens": 32879657.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1193, "step_time": 25.480321776121855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 126.4375, "completions/mean_terminated_length": 126.4375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3518354445695877, "epoch": 0.05530338119499768, "frac_reward_zero_std": 1.0, "grad_norm": 0.002201296156272292, "kl": 0.0016532180598005652, "learning_rate": 9.889485873089393e-07, "loss": 0.0001, "num_tokens": 32905280.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1194, "step_time": 15.141155265271664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 215.125, "completions/mean_terminated_length": 215.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.25013238564133644, "epoch": 0.055349698934691986, "frac_reward_zero_std": 0.0, "grad_norm": 0.06812913715839386, "kl": 0.0009467493218835443, "learning_rate": 9.889393237610004e-07, "loss": -0.0446, "num_tokens": 32938930.0, "reward": 0.8982654809951782, "reward_std": 0.005985993891954422, "rewards/reward_func/mean": 0.8982654809951782, "rewards/reward_func/std": 0.005985994357615709, "step": 1195, "step_time": 23.7807700894773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 193.125, "completions/mean_terminated_length": 193.125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.36999668926000595, "epoch": 0.05539601667438629, "frac_reward_zero_std": 1.0, "grad_norm": 0.000533161626663059, "kl": 0.0010291463549947366, "learning_rate": 9.889300602130616e-07, "loss": 0.0001, "num_tokens": 32968164.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1196, "step_time": 20.96423965319991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 120.5, "completions/mean_terminated_length": 120.5, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.23959479480981827, "epoch": 0.05544233441408059, "frac_reward_zero_std": 1.0, "grad_norm": 0.003938731271773577, "kl": 0.001805234351195395, "learning_rate": 9.889207966651227e-07, "loss": 0.0001, "num_tokens": 32988748.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1197, "step_time": 13.279081158339977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 163.5, "completions/mean_terminated_length": 163.5, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4478013291954994, "epoch": 0.055488652153774895, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016535435570403934, "kl": 0.0017541599518153816, "learning_rate": 9.889115331171838e-07, "loss": 0.0001, "num_tokens": 33033604.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1198, "step_time": 23.09657371416688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 173.125, "completions/mean_terminated_length": 173.125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.35072455555200577, "epoch": 0.0555349698934692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016030474798753858, "kl": 0.001472942967666313, "learning_rate": 9.88902269569245e-07, "loss": 0.0001, "num_tokens": 33053974.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1199, "step_time": 19.738952070474625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 105.125, "completions/mean_terminated_length": 105.125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.2769193649291992, "epoch": 0.0555812876331635, "frac_reward_zero_std": 1.0, "grad_norm": 0.002373273018747568, "kl": 0.0016173394105862826, "learning_rate": 9.88893006021306e-07, "loss": 0.0001, "num_tokens": 33075656.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1200, "step_time": 12.320656727999449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 186.0, "completions/mean_terminated_length": 186.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.34736139327287674, "epoch": 0.055627605372857804, "frac_reward_zero_std": 0.0, "grad_norm": 0.1093740463256836, "kl": 0.0011903214908670634, "learning_rate": 9.888837424733672e-07, "loss": -0.0134, "num_tokens": 33099016.0, "reward": 0.856032133102417, "reward_std": 0.22827444970607758, "rewards/reward_func/mean": 0.856032133102417, "rewards/reward_func/std": 0.22827443480491638, "step": 1201, "step_time": 20.020185366272926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 222.125, "completions/mean_terminated_length": 222.125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.37158964574337006, "epoch": 0.05567392311255211, "frac_reward_zero_std": 0.0, "grad_norm": 0.09585659950971603, "kl": 0.001849939435487613, "learning_rate": 9.888744789254283e-07, "loss": -0.0273, "num_tokens": 33137322.0, "reward": 0.14222227036952972, "reward_std": 0.12954173982143402, "rewards/reward_func/mean": 0.14222227036952972, "rewards/reward_func/std": 0.12954175472259521, "step": 1202, "step_time": 27.632737696170807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 157.1875, "completions/mean_terminated_length": 157.1875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.2809433713555336, "epoch": 0.05572024085224641, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031857306603342295, "kl": 0.0016545486578252167, "learning_rate": 9.888652153774896e-07, "loss": 0.0001, "num_tokens": 33173853.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1203, "step_time": 20.077237356454134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 106.375, "completions/mean_terminated_length": 106.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.28098398447036743, "epoch": 0.05576655859194071, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024516689591109753, "kl": 0.001706851733615622, "learning_rate": 9.888559518295508e-07, "loss": 0.0001, "num_tokens": 33193779.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1204, "step_time": 12.8146958835423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.41700856387615204, "epoch": 0.055812876331635015, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019709744956344366, "kl": 0.0015683734090998769, "learning_rate": 9.888466882816117e-07, "loss": 0.0001, "num_tokens": 33217503.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1205, "step_time": 18.581989858299494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 127.9375, "completions/mean_terminated_length": 127.9375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2673864811658859, "epoch": 0.05585919407132932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016961885849013925, "kl": 0.0012344536662567407, "learning_rate": 9.88837424733673e-07, "loss": 0.0001, "num_tokens": 33237262.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1206, "step_time": 13.16928181797266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.3380234017968178, "epoch": 0.05590551181102362, "frac_reward_zero_std": 0.0, "grad_norm": 0.0912645012140274, "kl": 0.0015399760741274804, "learning_rate": 9.888281611857341e-07, "loss": -0.0836, "num_tokens": 33263304.0, "reward": 0.4474213719367981, "reward_std": 0.49688494205474854, "rewards/reward_func/mean": 0.4474213719367981, "rewards/reward_func/std": 0.4968849718570709, "step": 1207, "step_time": 23.88822455331683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 143.5625, "completions/mean_terminated_length": 143.5625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.2060914896428585, "epoch": 0.055951829550717924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038170551415532827, "kl": 0.001533936767373234, "learning_rate": 9.888188976377953e-07, "loss": 0.0001, "num_tokens": 33283761.0, "reward": 0.7420884966850281, "reward_std": 0.0, "rewards/reward_func/mean": 0.7420884966850281, "rewards/reward_func/std": 0.0, "step": 1208, "step_time": 14.947349477559328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 127.5, "completions/mean_terminated_length": 127.5, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2600145637989044, "epoch": 0.05599814729041223, "frac_reward_zero_std": 1.0, "grad_norm": 0.0047537581995129585, "kl": 0.001497937657404691, "learning_rate": 9.888096340898564e-07, "loss": 0.0001, "num_tokens": 33304233.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1209, "step_time": 13.457011204212904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 212.0625, "completions/mean_terminated_length": 212.0625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.40265387296676636, "epoch": 0.05604446503010653, "frac_reward_zero_std": 1.0, "grad_norm": 0.001125058624893427, "kl": 0.001514648989541456, "learning_rate": 9.888003705419175e-07, "loss": 0.0001, "num_tokens": 33335530.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1210, "step_time": 24.818960841745138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 174.1875, "completions/mean_terminated_length": 174.1875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.391539990901947, "epoch": 0.05609078276980083, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011921962723135948, "kl": 0.0015644820232409984, "learning_rate": 9.887911069939786e-07, "loss": 0.0001, "num_tokens": 33370685.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1211, "step_time": 20.84784833714366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 164.3125, "completions/mean_terminated_length": 164.3125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.35753049701452255, "epoch": 0.056137100509495136, "frac_reward_zero_std": 1.0, "grad_norm": 0.001809712266549468, "kl": 0.0012707883870461956, "learning_rate": 9.887818434460398e-07, "loss": 0.0001, "num_tokens": 33404258.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1212, "step_time": 19.671447813510895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 187.625, "completions/mean_terminated_length": 187.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.22351354733109474, "epoch": 0.05618341824918944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008563286392018199, "kl": 0.0010690609051380306, "learning_rate": 9.88772579898101e-07, "loss": 0.0001, "num_tokens": 33439500.0, "reward": 0.8089976906776428, "reward_std": 0.0, "rewards/reward_func/mean": 0.8089976906776428, "rewards/reward_func/std": 0.0, "step": 1213, "step_time": 22.688255954533815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 126.4375, "completions/mean_terminated_length": 126.4375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.22193920984864235, "epoch": 0.05622973598888374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006390352500602603, "kl": 0.0008537031535524875, "learning_rate": 9.88763316350162e-07, "loss": 0.0, "num_tokens": 33465971.0, "reward": 0.1533549726009369, "reward_std": 0.0, "rewards/reward_func/mean": 0.1533549726009369, "rewards/reward_func/std": 0.0, "step": 1214, "step_time": 15.766603652387857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 208.0625, "completions/mean_terminated_length": 208.0625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.26157864928245544, "epoch": 0.056276053728578045, "frac_reward_zero_std": 0.0, "grad_norm": 0.07187444716691971, "kl": 0.0011387507256586105, "learning_rate": 9.887540528022231e-07, "loss": -0.0041, "num_tokens": 33501764.0, "reward": 0.8787410259246826, "reward_std": 0.029620526358485222, "rewards/reward_func/mean": 0.8787410259246826, "rewards/reward_func/std": 0.02962053380906582, "step": 1215, "step_time": 22.862708177417517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 130.6875, "completions/mean_terminated_length": 130.6875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2815399691462517, "epoch": 0.05632237146827235, "frac_reward_zero_std": 1.0, "grad_norm": 0.015339228324592113, "kl": 0.0035644937597680837, "learning_rate": 9.887447892542845e-07, "loss": 0.0002, "num_tokens": 33524079.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1216, "step_time": 16.1451876796782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 172.3125, "completions/mean_terminated_length": 172.3125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.37928324192762375, "epoch": 0.05636868920796665, "frac_reward_zero_std": 1.0, "grad_norm": 0.00085402064723894, "kl": 0.0013867141969967633, "learning_rate": 9.887355257063454e-07, "loss": 0.0001, "num_tokens": 33550820.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1217, "step_time": 18.440692875534296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 141.125, "completions/mean_terminated_length": 141.125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.2438240870833397, "epoch": 0.05641500694766095, "frac_reward_zero_std": 1.0, "grad_norm": 0.002116305520758033, "kl": 0.0013620586250908673, "learning_rate": 9.887262621584065e-07, "loss": 0.0001, "num_tokens": 33570518.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1218, "step_time": 15.654694091528654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 121.3125, "completions/mean_terminated_length": 121.3125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3376108705997467, "epoch": 0.056461324687355256, "frac_reward_zero_std": 1.0, "grad_norm": 0.010410158894956112, "kl": 0.0020358639303594828, "learning_rate": 9.887169986104679e-07, "loss": 0.0001, "num_tokens": 33594411.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1219, "step_time": 14.020819757133722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 167.75, "completions/mean_terminated_length": 167.75, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.4610692411661148, "epoch": 0.05650764242704956, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010968097485601902, "kl": 0.0017872247844934464, "learning_rate": 9.88707735062529e-07, "loss": 0.0001, "num_tokens": 33643911.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1220, "step_time": 24.36411403864622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 164.0625, "completions/mean_terminated_length": 164.0625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.42141059041023254, "epoch": 0.05655396016674386, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008742156787775457, "kl": 0.0014788057305850089, "learning_rate": 9.886984715145901e-07, "loss": 0.0001, "num_tokens": 33686088.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1221, "step_time": 21.88529008999467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 144.375, "completions/mean_terminated_length": 144.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.411409430205822, "epoch": 0.056600277906438165, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018694042228162289, "kl": 0.0017541041306685656, "learning_rate": 9.886892079666512e-07, "loss": 0.0001, "num_tokens": 33709326.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1222, "step_time": 16.851913671940565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 149.625, "completions/mean_terminated_length": 149.625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.460119292140007, "epoch": 0.05664659564613247, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016819029115140438, "kl": 0.0019150854786857963, "learning_rate": 9.886799444187124e-07, "loss": 0.0001, "num_tokens": 33752936.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1223, "step_time": 25.403372287750244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 142.75, "completions/mean_terminated_length": 142.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.35317961871623993, "epoch": 0.05669291338582677, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015910225920379162, "kl": 0.0015965857601258904, "learning_rate": 9.886706808707735e-07, "loss": 0.0001, "num_tokens": 33774436.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1224, "step_time": 15.178873017430305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 140.125, "completions/mean_terminated_length": 140.125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.31188593804836273, "epoch": 0.056739231125521074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028630243614315987, "kl": 0.0015592757263220847, "learning_rate": 9.886614173228346e-07, "loss": 0.0001, "num_tokens": 33795334.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1225, "step_time": 16.386144682765007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 126.1875, "completions/mean_terminated_length": 126.1875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.283845491707325, "epoch": 0.05678554886521538, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033356109634041786, "kl": 0.0016911396523937583, "learning_rate": 9.886521537748957e-07, "loss": 0.0001, "num_tokens": 33819289.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1226, "step_time": 14.298079270869493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 159.75, "completions/mean_terminated_length": 159.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.3166779801249504, "epoch": 0.05683186660490968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013046388048678637, "kl": 0.0015917142445687205, "learning_rate": 9.886428902269569e-07, "loss": 0.0001, "num_tokens": 33846245.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1227, "step_time": 18.61289867013693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.21056405827403069, "epoch": 0.05687818434460398, "frac_reward_zero_std": 0.0, "grad_norm": 0.07488119602203369, "kl": 0.0014644028851762414, "learning_rate": 9.88633626679018e-07, "loss": 0.0307, "num_tokens": 33884749.0, "reward": 0.6096057891845703, "reward_std": 0.024398334324359894, "rewards/reward_func/mean": 0.6096057891845703, "rewards/reward_func/std": 0.02439834736287594, "step": 1228, "step_time": 25.749790344387293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 169.5625, "completions/mean_terminated_length": 169.5625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.38020822405815125, "epoch": 0.056924502084298285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016944898525252938, "kl": 0.001812828064430505, "learning_rate": 9.886243631310791e-07, "loss": 0.0001, "num_tokens": 33911494.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1229, "step_time": 19.604694467037916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 242.8125, "completions/mean_terminated_length": 242.8125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.27098196744918823, "epoch": 0.05697081982399259, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004719934950117022, "kl": 0.000758229085477069, "learning_rate": 9.886150995831402e-07, "loss": 0.0, "num_tokens": 33943395.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1230, "step_time": 26.482955258339643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 129.6875, "completions/mean_terminated_length": 129.6875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2782093659043312, "epoch": 0.05701713756368689, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008192937239073217, "kl": 0.0009703707764856517, "learning_rate": 9.886058360352014e-07, "loss": 0.0, "num_tokens": 33967374.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1231, "step_time": 15.097816903144121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 195.875, "completions/mean_terminated_length": 195.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.40940943360328674, "epoch": 0.057063455303381194, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016419700114056468, "kl": 0.002018047438468784, "learning_rate": 9.885965724872625e-07, "loss": 0.0001, "num_tokens": 34000716.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1232, "step_time": 23.017534095793962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 128.9375, "completions/mean_terminated_length": 128.9375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2268138900399208, "epoch": 0.0571097730430755, "frac_reward_zero_std": 1.0, "grad_norm": 0.001921424176543951, "kl": 0.0010678358376026154, "learning_rate": 9.885873089393238e-07, "loss": 0.0001, "num_tokens": 34020619.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1233, "step_time": 13.981297962367535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 141.5625, "completions/mean_terminated_length": 141.5625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3581574782729149, "epoch": 0.0571560907827698, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008119179983623326, "kl": 0.001180766790639609, "learning_rate": 9.88578045391385e-07, "loss": 0.0001, "num_tokens": 34041972.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1234, "step_time": 15.030056152492762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 322.0, "completions/mean_terminated_length": 322.0, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "entropy": 0.245905090123415, "epoch": 0.0572024085224641, "frac_reward_zero_std": 0.0, "grad_norm": 0.12357914447784424, "kl": 0.0013075890019536018, "learning_rate": 9.885687818434459e-07, "loss": -0.0504, "num_tokens": 34078116.0, "reward": 0.9262726306915283, "reward_std": 0.24700602889060974, "rewards/reward_func/mean": 0.9262726306915283, "rewards/reward_func/std": 0.24700602889060974, "step": 1235, "step_time": 32.29124540835619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 130.625, "completions/mean_terminated_length": 130.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.35061635822057724, "epoch": 0.057248726262158406, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010514730820432305, "kl": 0.0012754994095303118, "learning_rate": 9.885595182955072e-07, "loss": 0.0001, "num_tokens": 34099454.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1236, "step_time": 14.005634594708681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 155.9375, "completions/mean_terminated_length": 155.9375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.32259558141231537, "epoch": 0.05729504400185271, "frac_reward_zero_std": 0.0, "grad_norm": 0.08124643564224243, "kl": 0.0022128417913336307, "learning_rate": 9.885502547475683e-07, "loss": -0.0096, "num_tokens": 34120189.0, "reward": 0.058713316917419434, "reward_std": 0.23485326766967773, "rewards/reward_func/mean": 0.058713316917419434, "rewards/reward_func/std": 0.23485328257083893, "step": 1237, "step_time": 16.244460076093674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 149.875, "completions/mean_terminated_length": 149.875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.24214792996644974, "epoch": 0.05734136174154701, "frac_reward_zero_std": 0.0, "grad_norm": 0.07267358899116516, "kl": 0.0013592233299277723, "learning_rate": 9.885409911996294e-07, "loss": 0.0104, "num_tokens": 34144075.0, "reward": 0.8419013023376465, "reward_std": 0.23843029141426086, "rewards/reward_func/mean": 0.8419013023376465, "rewards/reward_func/std": 0.23843029141426086, "step": 1238, "step_time": 15.847119845449924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 126.375, "completions/mean_terminated_length": 126.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2643110007047653, "epoch": 0.057387679481241315, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016059990739449859, "kl": 0.0014530277985613793, "learning_rate": 9.885317276516906e-07, "loss": 0.0001, "num_tokens": 34164225.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1239, "step_time": 13.541017275303602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 138.8125, "completions/mean_terminated_length": 138.8125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3196500539779663, "epoch": 0.05743399722093562, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009587425156496465, "kl": 0.0013537496561184525, "learning_rate": 9.885224641037517e-07, "loss": 0.0001, "num_tokens": 34185982.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1240, "step_time": 14.922610383480787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 170.0, "completions/mean_terminated_length": 170.0, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.4293387234210968, "epoch": 0.05748031496062992, "frac_reward_zero_std": 1.0, "grad_norm": 0.001246821484528482, "kl": 0.001713241741526872, "learning_rate": 9.885132005558128e-07, "loss": 0.0001, "num_tokens": 34237198.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1241, "step_time": 25.724534645676613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 182.5, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.38223453611135483, "epoch": 0.05752663270032422, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012479997240006924, "kl": 0.001559621945489198, "learning_rate": 9.88503937007874e-07, "loss": 0.0001, "num_tokens": 34263798.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1242, "step_time": 19.806608445942402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 207.1875, "completions/mean_terminated_length": 207.1875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.26252343505620956, "epoch": 0.057572950440018526, "frac_reward_zero_std": 0.0, "grad_norm": 0.08640774339437485, "kl": 0.0012831655621994287, "learning_rate": 9.88494673459935e-07, "loss": -0.0306, "num_tokens": 34301817.0, "reward": 0.8629425764083862, "reward_std": 0.3384329676628113, "rewards/reward_func/mean": 0.8629425764083862, "rewards/reward_func/std": 0.3384329676628113, "step": 1243, "step_time": 24.460368610918522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 200.6875, "completions/mean_terminated_length": 200.6875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.42235323786735535, "epoch": 0.05761926817971283, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008627405040897429, "kl": 0.0012830222549382597, "learning_rate": 9.884854099119962e-07, "loss": 0.0001, "num_tokens": 34324436.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1244, "step_time": 20.84190797433257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 175.4375, "completions/mean_terminated_length": 175.4375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.15327590703964233, "epoch": 0.05766558591940713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006670938455499709, "kl": 0.0006721837999066338, "learning_rate": 9.884761463640573e-07, "loss": 0.0, "num_tokens": 34369067.0, "reward": 0.9459594488143921, "reward_std": 0.0, "rewards/reward_func/mean": 0.9459594488143921, "rewards/reward_func/std": 0.0, "step": 1245, "step_time": 23.57464948296547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.24939283728599548, "epoch": 0.057711903659101435, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009782507549971342, "kl": 0.0010339748696424067, "learning_rate": 9.884668828161187e-07, "loss": 0.0001, "num_tokens": 34390675.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1246, "step_time": 15.103761825710535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 108.875, "completions/mean_terminated_length": 108.875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.26170995458960533, "epoch": 0.05775822139879574, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014053724007681012, "kl": 0.0011575254320632666, "learning_rate": 9.884576192681798e-07, "loss": 0.0001, "num_tokens": 34410673.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1247, "step_time": 12.882812615484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 223.0, "completions/mean_terminated_length": 223.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.16441050171852112, "epoch": 0.05780453913849004, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004503272648435086, "kl": 0.0005819772122777067, "learning_rate": 9.884483557202407e-07, "loss": 0.0, "num_tokens": 34435201.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1248, "step_time": 21.09315648302436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.36547720432281494, "epoch": 0.057850856878184344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015186961973086, "kl": 0.001675381965469569, "learning_rate": 9.88439092172302e-07, "loss": 0.0001, "num_tokens": 34468457.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1249, "step_time": 20.333508122712374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 263.8125, "completions/mean_terminated_length": 263.8125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.39689671248197556, "epoch": 0.05789717461787865, "frac_reward_zero_std": 0.0, "grad_norm": 0.06250195950269699, "kl": 0.0014216976414900273, "learning_rate": 9.884298286243632e-07, "loss": -0.0612, "num_tokens": 34496134.0, "reward": 0.5, "reward_std": 0.5163977742195129, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5163977742195129, "step": 1250, "step_time": 34.22820543497801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 241.6875, "completions/mean_terminated_length": 241.6875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.46069467067718506, "epoch": 0.05794349235757295, "frac_reward_zero_std": 0.0, "grad_norm": 0.05643896013498306, "kl": 0.0015777037187945098, "learning_rate": 9.884205650764243e-07, "loss": 0.0617, "num_tokens": 34535009.0, "reward": 0.00031467806547880173, "reward_std": 0.00018763775005936623, "rewards/reward_func/mean": 0.00031467806547880173, "rewards/reward_func/std": 0.00018763776461128145, "step": 1251, "step_time": 45.43677279353142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 179.0, "completions/mean_terminated_length": 179.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.20259954035282135, "epoch": 0.05798981009726725, "frac_reward_zero_std": 0.0, "grad_norm": 0.09602506458759308, "kl": 0.0007747110503260046, "learning_rate": 9.884113015284854e-07, "loss": 0.0285, "num_tokens": 34565969.0, "reward": 0.9419240951538086, "reward_std": 0.018595725297927856, "rewards/reward_func/mean": 0.9419240951538086, "rewards/reward_func/std": 0.018595723435282707, "step": 1252, "step_time": 20.653414957225323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 164.875, "completions/mean_terminated_length": 164.875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4027465134859085, "epoch": 0.058036127836961555, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010446161031723022, "kl": 0.0013756774715147913, "learning_rate": 9.884020379805465e-07, "loss": 0.0001, "num_tokens": 34614959.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1253, "step_time": 23.52247118204832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 163.9375, "completions/mean_terminated_length": 163.9375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.22748686373233795, "epoch": 0.05808244557665586, "frac_reward_zero_std": 0.0, "grad_norm": 0.07628460973501205, "kl": 0.0014305194199550897, "learning_rate": 9.883927744326077e-07, "loss": -0.0661, "num_tokens": 34636654.0, "reward": 0.9226803779602051, "reward_std": 0.03836125135421753, "rewards/reward_func/mean": 0.9226803779602051, "rewards/reward_func/std": 0.038361258804798126, "step": 1254, "step_time": 18.081256940960884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 138.5625, "completions/mean_terminated_length": 138.5625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3049306273460388, "epoch": 0.05812876331635016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011182550806552172, "kl": 0.001401193527271971, "learning_rate": 9.883835108846688e-07, "loss": 0.0001, "num_tokens": 34658519.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1255, "step_time": 15.809340998530388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.2618870660662651, "epoch": 0.058175081056044464, "frac_reward_zero_std": 0.0, "grad_norm": 0.07970226556062698, "kl": 0.0016272406501229852, "learning_rate": 9.8837424733673e-07, "loss": -0.1783, "num_tokens": 34695281.0, "reward": 0.5085287094116211, "reward_std": 0.42029711604118347, "rewards/reward_func/mean": 0.5085287094116211, "rewards/reward_func/std": 0.42029711604118347, "step": 1256, "step_time": 31.63137638568878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 142.0625, "completions/mean_terminated_length": 142.0625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3319794610142708, "epoch": 0.05822139879573877, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012234391178935766, "kl": 0.001421384746208787, "learning_rate": 9.88364983788791e-07, "loss": 0.0001, "num_tokens": 34725746.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1257, "step_time": 17.646457955241203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 128.75, "completions/mean_terminated_length": 128.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2638702392578125, "epoch": 0.05826771653543307, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016106648836284876, "kl": 0.0017507765733171254, "learning_rate": 9.883557202408522e-07, "loss": 0.0001, "num_tokens": 34761598.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1258, "step_time": 17.709375075995922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 114.875, "completions/mean_terminated_length": 114.875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.26288048177957535, "epoch": 0.05831403427512737, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014916558284312487, "kl": 0.0014745851221960038, "learning_rate": 9.883464566929135e-07, "loss": 0.0001, "num_tokens": 34781132.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1259, "step_time": 13.982169389724731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 192.5625, "completions/mean_terminated_length": 192.5625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.4385821372270584, "epoch": 0.058360352014821676, "frac_reward_zero_std": 1.0, "grad_norm": 0.002485638251528144, "kl": 0.0021656985045410693, "learning_rate": 9.883371931449744e-07, "loss": 0.0001, "num_tokens": 34807061.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1260, "step_time": 20.581169545650482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 163.9375, "completions/mean_terminated_length": 163.9375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.21002722531557083, "epoch": 0.05840666975451598, "frac_reward_zero_std": 0.0, "grad_norm": 0.1248169094324112, "kl": 0.001338052112259902, "learning_rate": 9.883279295970355e-07, "loss": -0.1039, "num_tokens": 34830820.0, "reward": 0.2829582095146179, "reward_std": 0.2080743908882141, "rewards/reward_func/mean": 0.2829582095146179, "rewards/reward_func/std": 0.2080743908882141, "step": 1261, "step_time": 19.391571924090385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 181.5625, "completions/mean_terminated_length": 181.5625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.40674079954624176, "epoch": 0.05845298749421028, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011472152546048164, "kl": 0.0013039868790656328, "learning_rate": 9.883186660490967e-07, "loss": 0.0001, "num_tokens": 34871181.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1262, "step_time": 24.397856388241053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 193.5625, "completions/mean_terminated_length": 193.5625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.24332686886191368, "epoch": 0.058499305233904585, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013939500786364079, "kl": 0.0011751919810194522, "learning_rate": 9.88309402501158e-07, "loss": 0.0001, "num_tokens": 34893286.0, "reward": 0.7295533418655396, "reward_std": 0.0, "rewards/reward_func/mean": 0.7295533418655396, "rewards/reward_func/std": 0.0, "step": 1263, "step_time": 20.54952061548829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 128.375, "completions/mean_terminated_length": 128.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.29128848016262054, "epoch": 0.05854562297359889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011062759440392256, "kl": 0.0012307801225688308, "learning_rate": 9.883001389532191e-07, "loss": 0.0001, "num_tokens": 34914300.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1264, "step_time": 15.922392208129168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 228.375, "completions/mean_terminated_length": 228.375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.15263668820261955, "epoch": 0.05859194071329319, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011531964410096407, "kl": 0.0007745838302071206, "learning_rate": 9.882908754052802e-07, "loss": 0.0, "num_tokens": 34939058.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1265, "step_time": 22.780731935054064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 189.1875, "completions/mean_terminated_length": 189.1875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.21471868455410004, "epoch": 0.05863825845298749, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006897761486470699, "kl": 0.0007938119524624199, "learning_rate": 9.882816118573414e-07, "loss": 0.0, "num_tokens": 34981989.0, "reward": 0.11362193524837494, "reward_std": 0.0, "rewards/reward_func/mean": 0.11362193524837494, "rewards/reward_func/std": 0.0, "step": 1266, "step_time": 24.1014525257051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 168.125, "completions/mean_terminated_length": 168.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.2848397567868233, "epoch": 0.058684576192681796, "frac_reward_zero_std": 0.0, "grad_norm": 0.11768696457147598, "kl": 0.0012865164317190647, "learning_rate": 9.882723483094025e-07, "loss": -0.0419, "num_tokens": 35004679.0, "reward": 0.2808825373649597, "reward_std": 0.018500691279768944, "rewards/reward_func/mean": 0.2808825373649597, "rewards/reward_func/std": 0.018500693142414093, "step": 1267, "step_time": 19.55908903107047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 127.0, "completions/mean_terminated_length": 127.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2190609648823738, "epoch": 0.0587308939323761, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008628237410448492, "kl": 0.0010214103240286931, "learning_rate": 9.882630847614636e-07, "loss": 0.0001, "num_tokens": 35024327.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1268, "step_time": 13.416114680469036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 256.4375, "completions/mean_terminated_length": 256.4375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.1592147834599018, "epoch": 0.0587772116720704, "frac_reward_zero_std": 0.0, "grad_norm": 0.057634301483631134, "kl": 0.001346321965684183, "learning_rate": 9.882538212135247e-07, "loss": 0.0037, "num_tokens": 35056590.0, "reward": 0.8999221324920654, "reward_std": 0.16143766045570374, "rewards/reward_func/mean": 0.8999221324920654, "rewards/reward_func/std": 0.16143766045570374, "step": 1269, "step_time": 30.925078090280294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 206.875, "completions/mean_terminated_length": 206.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.32576390355825424, "epoch": 0.058823529411764705, "frac_reward_zero_std": 0.0, "grad_norm": 0.07088025659322739, "kl": 0.0012108280207030475, "learning_rate": 9.882445576655859e-07, "loss": -0.0892, "num_tokens": 35078380.0, "reward": 0.375, "reward_std": 0.5, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5, "step": 1270, "step_time": 21.85751686245203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 117.6875, "completions/mean_terminated_length": 117.6875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.28740938007831573, "epoch": 0.05886984715145901, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008541119168512523, "kl": 0.0011367111001163721, "learning_rate": 9.88235294117647e-07, "loss": 0.0001, "num_tokens": 35101287.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1271, "step_time": 13.903431259095669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 125.75, "completions/mean_terminated_length": 125.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.322317399084568, "epoch": 0.05891616489115331, "frac_reward_zero_std": 1.0, "grad_norm": 0.003119006287306547, "kl": 0.0015256724145729095, "learning_rate": 9.882260305697081e-07, "loss": 0.0001, "num_tokens": 35121555.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1272, "step_time": 13.342403680086136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 165.25, "completions/mean_terminated_length": 165.25, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.37049461901187897, "epoch": 0.058962482630847614, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012520358432084322, "kl": 0.0014613436069339514, "learning_rate": 9.882167670217692e-07, "loss": 0.0001, "num_tokens": 35152551.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1273, "step_time": 19.7287641428411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 169.375, "completions/mean_terminated_length": 169.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3411256819963455, "epoch": 0.05900880037054192, "frac_reward_zero_std": 0.0, "grad_norm": 0.10679585486650467, "kl": 0.001722003478789702, "learning_rate": 9.882075034738304e-07, "loss": -0.0156, "num_tokens": 35174957.0, "reward": 0.0625, "reward_std": 0.25, "rewards/reward_func/mean": 0.0625, "rewards/reward_func/std": 0.25, "step": 1274, "step_time": 17.63904182612896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 178.0, "completions/mean_terminated_length": 178.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.23396510630846024, "epoch": 0.05905511811023622, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011064562713727355, "kl": 0.0009488572250120342, "learning_rate": 9.881982399258915e-07, "loss": 0.0, "num_tokens": 35195965.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1275, "step_time": 17.236826792359352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.205683171749115, "epoch": 0.05910143584993052, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011295033618807793, "kl": 0.001086074553313665, "learning_rate": 9.881889763779528e-07, "loss": 0.0001, "num_tokens": 35227211.0, "reward": 0.8824968934059143, "reward_std": 0.0, "rewards/reward_func/mean": 0.8824968934059143, "rewards/reward_func/std": 0.0, "step": 1276, "step_time": 18.04539056122303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 118.1875, "completions/mean_terminated_length": 118.1875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.26655275374650955, "epoch": 0.059147753589624825, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017749374965205789, "kl": 0.0015684491663705558, "learning_rate": 9.88179712830014e-07, "loss": 0.0001, "num_tokens": 35250622.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1277, "step_time": 13.596657756716013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 154.3125, "completions/mean_terminated_length": 154.3125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3734418898820877, "epoch": 0.05919407132931913, "frac_reward_zero_std": 1.0, "grad_norm": 0.002765932120382786, "kl": 0.002765625366009772, "learning_rate": 9.881704492820749e-07, "loss": 0.0001, "num_tokens": 35274563.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1278, "step_time": 17.611996166408062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 147.8125, "completions/mean_terminated_length": 147.8125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3976045846939087, "epoch": 0.05924038906901343, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010244401637464762, "kl": 0.0012972986733075231, "learning_rate": 9.88161185734136e-07, "loss": 0.0001, "num_tokens": 35309248.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1279, "step_time": 19.944551046937704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 132.25, "completions/mean_terminated_length": 132.25, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3238281235098839, "epoch": 0.059286706808707734, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025384421460330486, "kl": 0.0015252811426762491, "learning_rate": 9.881519221861973e-07, "loss": 0.0001, "num_tokens": 35333140.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1280, "step_time": 16.57121830061078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 224.75, "completions/mean_terminated_length": 224.75, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.14412657544016838, "epoch": 0.05933302454840204, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005944859585724771, "kl": 0.0006109927489887923, "learning_rate": 9.881426586382584e-07, "loss": 0.0, "num_tokens": 35363456.0, "reward": 0.9622687101364136, "reward_std": 0.0, "rewards/reward_func/mean": 0.9622687101364136, "rewards/reward_func/std": 0.0, "step": 1281, "step_time": 23.807335074990988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 253.875, "completions/mean_terminated_length": 253.875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.18086369708180428, "epoch": 0.05937934228809634, "frac_reward_zero_std": 0.0, "grad_norm": 0.0869535431265831, "kl": 0.0009457567939534783, "learning_rate": 9.881333950903196e-07, "loss": -0.0278, "num_tokens": 35387310.0, "reward": 0.9387601017951965, "reward_std": 0.048821549862623215, "rewards/reward_func/mean": 0.9387601017951965, "rewards/reward_func/std": 0.048821575939655304, "step": 1282, "step_time": 25.28862490877509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 231.6875, "completions/mean_terminated_length": 231.6875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.14501139521598816, "epoch": 0.05942566002779064, "frac_reward_zero_std": 1.0, "grad_norm": 0.00144374358933419, "kl": 0.000982460391242057, "learning_rate": 9.881241315423807e-07, "loss": 0.0, "num_tokens": 35411833.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1283, "step_time": 22.92037371918559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 238.5, "completions/mean_terminated_length": 238.5, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.3042704053223133, "epoch": 0.059471977767484946, "frac_reward_zero_std": 0.0, "grad_norm": 0.06681200116872787, "kl": 0.001692799385637045, "learning_rate": 9.881148679944418e-07, "loss": -0.0392, "num_tokens": 35449601.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.44721361994743347, "step": 1284, "step_time": 26.501106817275286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 126.4375, "completions/mean_terminated_length": 126.4375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2799791172146797, "epoch": 0.05951829550717925, "frac_reward_zero_std": 1.0, "grad_norm": 0.001297675189562142, "kl": 0.0012755271745845675, "learning_rate": 9.88105604446503e-07, "loss": 0.0001, "num_tokens": 35469704.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1285, "step_time": 15.396445531398058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 115.9375, "completions/mean_terminated_length": 115.9375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.23795905709266663, "epoch": 0.05956461324687355, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011905039427801967, "kl": 0.00119230174459517, "learning_rate": 9.88096340898564e-07, "loss": 0.0001, "num_tokens": 35489623.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1286, "step_time": 12.979618959128857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 136.625, "completions/mean_terminated_length": 136.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.26678039133548737, "epoch": 0.059610930986567855, "frac_reward_zero_std": 1.0, "grad_norm": 0.001481172046624124, "kl": 0.001021220610709861, "learning_rate": 9.880870773506252e-07, "loss": 0.0001, "num_tokens": 35509633.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1287, "step_time": 15.040730103850365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 130.8125, "completions/mean_terminated_length": 130.8125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.21279476955533028, "epoch": 0.05965724872626216, "frac_reward_zero_std": 1.0, "grad_norm": 0.001137681887485087, "kl": 0.0009014675742946565, "learning_rate": 9.880778138026863e-07, "loss": 0.0, "num_tokens": 35531342.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1288, "step_time": 14.034538641571999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 126.75, "completions/mean_terminated_length": 126.75, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2391769178211689, "epoch": 0.05970356646595646, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014369251439347863, "kl": 0.0011704412463586777, "learning_rate": 9.880685502547477e-07, "loss": 0.0001, "num_tokens": 35550986.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1289, "step_time": 13.299801394343376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 214.625, "completions/mean_terminated_length": 214.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.1921832300722599, "epoch": 0.05974988420565076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008338350453414023, "kl": 0.0008639673033030704, "learning_rate": 9.880592867068088e-07, "loss": 0.0, "num_tokens": 35576340.0, "reward": 0.7221074104309082, "reward_std": 0.0, "rewards/reward_func/mean": 0.7221074104309082, "rewards/reward_func/std": 0.0, "step": 1290, "step_time": 21.38058177381754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 124.1875, "completions/mean_terminated_length": 124.1875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.25062207877635956, "epoch": 0.059796201945345066, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013031045673415065, "kl": 0.0013300326390890405, "learning_rate": 9.880500231588697e-07, "loss": 0.0001, "num_tokens": 35596167.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1291, "step_time": 14.002082046121359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 158.5, "completions/mean_terminated_length": 158.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3573547676205635, "epoch": 0.05984251968503937, "frac_reward_zero_std": 1.0, "grad_norm": 0.002497861860319972, "kl": 0.0017638968129176646, "learning_rate": 9.880407596109308e-07, "loss": 0.0001, "num_tokens": 35616959.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1292, "step_time": 18.8273093290627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 190.5625, "completions/mean_terminated_length": 190.5625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.4123840406537056, "epoch": 0.05988883742473367, "frac_reward_zero_std": 1.0, "grad_norm": 0.001903548021800816, "kl": 0.001797056320356205, "learning_rate": 9.880314960629922e-07, "loss": 0.0001, "num_tokens": 35654136.0, "reward": 0.780767560005188, "reward_std": 0.0, "rewards/reward_func/mean": 0.780767560005188, "rewards/reward_func/std": 0.0, "step": 1293, "step_time": 24.41575490310788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 131.375, "completions/mean_terminated_length": 131.375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2838471829891205, "epoch": 0.059935155164427975, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008910238975659013, "kl": 0.0011368074774509296, "learning_rate": 9.880222325150533e-07, "loss": 0.0001, "num_tokens": 35676110.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1294, "step_time": 14.732032056897879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 206.1875, "completions/mean_terminated_length": 206.1875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.25893597304821014, "epoch": 0.05998147290412228, "frac_reward_zero_std": 0.0, "grad_norm": 0.07036730647087097, "kl": 0.0012522713805083185, "learning_rate": 9.880129689671144e-07, "loss": -0.0349, "num_tokens": 35698065.0, "reward": 0.8293574452400208, "reward_std": 0.301551878452301, "rewards/reward_func/mean": 0.8293574452400208, "rewards/reward_func/std": 0.3015519082546234, "step": 1295, "step_time": 22.613749779760838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 142.3125, "completions/mean_terminated_length": 142.3125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3176952674984932, "epoch": 0.06002779064381658, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009578858152963221, "kl": 0.0009685322729637846, "learning_rate": 9.880037054191755e-07, "loss": 0.0, "num_tokens": 35731030.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1296, "step_time": 19.185499880462885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 210.625, "completions/mean_terminated_length": 210.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.41428501158952713, "epoch": 0.060074108383510884, "frac_reward_zero_std": 1.0, "grad_norm": 0.001212472328916192, "kl": 0.0012661851360462606, "learning_rate": 9.879944418712367e-07, "loss": 0.0001, "num_tokens": 35768928.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1297, "step_time": 27.58428728580475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 196.1875, "completions/mean_terminated_length": 196.1875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.18155839666724205, "epoch": 0.06012042612320519, "frac_reward_zero_std": 0.0, "grad_norm": 0.11216012388467789, "kl": 0.0015206912066787481, "learning_rate": 9.879851783232978e-07, "loss": 0.0224, "num_tokens": 35792579.0, "reward": 0.9409475922584534, "reward_std": 0.06039387732744217, "rewards/reward_func/mean": 0.9409475922584534, "rewards/reward_func/std": 0.06039387360215187, "step": 1298, "step_time": 21.5261762291193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 203.125, "completions/mean_terminated_length": 203.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.25570859387516975, "epoch": 0.06016674386289949, "frac_reward_zero_std": 0.0, "grad_norm": 0.10892330855131149, "kl": 0.0020363611401990056, "learning_rate": 9.87975914775359e-07, "loss": -0.0393, "num_tokens": 35830133.0, "reward": 0.7691745162010193, "reward_std": 0.16072623431682587, "rewards/reward_func/mean": 0.7691745162010193, "rewards/reward_func/std": 0.16072624921798706, "step": 1299, "step_time": 23.691280510276556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 0.22463082149624825, "epoch": 0.06021306160259379, "frac_reward_zero_std": 0.0, "grad_norm": 0.05471609905362129, "kl": 0.001213410694617778, "learning_rate": 9.8796665122742e-07, "loss": 0.0071, "num_tokens": 35859425.0, "reward": 0.98279869556427, "reward_std": 0.004587030503898859, "rewards/reward_func/mean": 0.98279869556427, "rewards/reward_func/std": 0.004587024915963411, "step": 1300, "step_time": 26.6886116117239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 163.375, "completions/mean_terminated_length": 163.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3098113238811493, "epoch": 0.060259379342288096, "frac_reward_zero_std": 0.0, "grad_norm": 0.09868577122688293, "kl": 0.001806522865081206, "learning_rate": 9.879573876794812e-07, "loss": -0.0978, "num_tokens": 35880999.0, "reward": 0.19428689777851105, "reward_std": 0.3475509583950043, "rewards/reward_func/mean": 0.19428689777851105, "rewards/reward_func/std": 0.34755098819732666, "step": 1301, "step_time": 17.72952525690198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 141.8125, "completions/mean_terminated_length": 141.8125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3010546714067459, "epoch": 0.0603056970819824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011427297722548246, "kl": 0.0011611956870183349, "learning_rate": 9.879481241315423e-07, "loss": 0.0001, "num_tokens": 35902020.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1302, "step_time": 15.329928517341614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 145.1875, "completions/mean_terminated_length": 145.1875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3341658264398575, "epoch": 0.0603520148216767, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009805286535993218, "kl": 0.0010981812374666333, "learning_rate": 9.879388605836034e-07, "loss": 0.0001, "num_tokens": 35937991.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1303, "step_time": 19.20741555839777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 228.25, "completions/mean_terminated_length": 228.25, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.2592865750193596, "epoch": 0.060398332561371004, "frac_reward_zero_std": 1.0, "grad_norm": 0.001082071801647544, "kl": 0.001120303408242762, "learning_rate": 9.879295970356645e-07, "loss": 0.0001, "num_tokens": 35963131.0, "reward": 0.8243306875228882, "reward_std": 0.0, "rewards/reward_func/mean": 0.8243306875228882, "rewards/reward_func/std": 0.0, "step": 1304, "step_time": 22.70706956088543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 166.9375, "completions/mean_terminated_length": 166.9375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.19587522745132446, "epoch": 0.06044465030106531, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007689369958825409, "kl": 0.0009125641372520477, "learning_rate": 9.879203334877257e-07, "loss": 0.0, "num_tokens": 35997546.0, "reward": 0.9428731203079224, "reward_std": 0.0, "rewards/reward_func/mean": 0.9428731203079224, "rewards/reward_func/std": 0.0, "step": 1305, "step_time": 19.82010806724429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 226.3125, "completions/mean_terminated_length": 226.3125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.2953353226184845, "epoch": 0.06049096804075961, "frac_reward_zero_std": 1.0, "grad_norm": 0.001081167720258236, "kl": 0.0009990074031520635, "learning_rate": 9.87911069939787e-07, "loss": 0.0001, "num_tokens": 36024911.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1306, "step_time": 23.03389771655202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 160.4375, "completions/mean_terminated_length": 160.4375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.20224759727716446, "epoch": 0.06053728578045391, "frac_reward_zero_std": 0.0, "grad_norm": 0.1349053829908371, "kl": 0.001746593916323036, "learning_rate": 9.879018063918481e-07, "loss": -0.0521, "num_tokens": 36047238.0, "reward": 0.6257256269454956, "reward_std": 0.10125773400068283, "rewards/reward_func/mean": 0.6257256269454956, "rewards/reward_func/std": 0.10125772655010223, "step": 1307, "step_time": 17.689881186932325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 179.5625, "completions/mean_terminated_length": 179.5625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.39270056784152985, "epoch": 0.060583603520148216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011465889401733875, "kl": 0.0012371263874229044, "learning_rate": 9.878925428439092e-07, "loss": 0.0001, "num_tokens": 36076975.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1308, "step_time": 20.26331490278244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 172.0, "completions/mean_terminated_length": 172.0, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.4041343480348587, "epoch": 0.06062992125984252, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011341717327013612, "kl": 0.001408853247994557, "learning_rate": 9.878832792959702e-07, "loss": 0.0001, "num_tokens": 36102255.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1309, "step_time": 20.550407517701387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 133.0625, "completions/mean_terminated_length": 133.0625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.22065860033035278, "epoch": 0.06067623899953682, "frac_reward_zero_std": 1.0, "grad_norm": 0.000924820953514427, "kl": 0.0011635018163360655, "learning_rate": 9.878740157480315e-07, "loss": 0.0001, "num_tokens": 36122208.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1310, "step_time": 14.049900580197573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 225.8125, "completions/mean_terminated_length": 225.8125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.25286129862070084, "epoch": 0.060722556739231125, "frac_reward_zero_std": 1.0, "grad_norm": 0.001199951395392418, "kl": 0.0012331777979852632, "learning_rate": 9.878647522000926e-07, "loss": 0.0001, "num_tokens": 36160285.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1311, "step_time": 25.68111901730299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 129.1875, "completions/mean_terminated_length": 129.1875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.28861740976572037, "epoch": 0.06076887447892543, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007949213613756001, "kl": 0.0009198406187351793, "learning_rate": 9.878554886521537e-07, "loss": 0.0, "num_tokens": 36188192.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1312, "step_time": 15.98707052692771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 130.125, "completions/mean_terminated_length": 130.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.32708143442869186, "epoch": 0.06081519221861973, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011910056928172708, "kl": 0.0015137761947698891, "learning_rate": 9.878462251042149e-07, "loss": 0.0001, "num_tokens": 36209762.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1313, "step_time": 15.773848168551922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 137.0625, "completions/mean_terminated_length": 137.0625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.30693046003580093, "epoch": 0.06086150995831403, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016096553299576044, "kl": 0.0012392305507091805, "learning_rate": 9.87836961556276e-07, "loss": 0.0001, "num_tokens": 36239667.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1314, "step_time": 17.582204215228558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 186.125, "completions/mean_terminated_length": 186.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.44022227823734283, "epoch": 0.060907827698008336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009159357869066298, "kl": 0.0014115108933765441, "learning_rate": 9.878276980083371e-07, "loss": 0.0001, "num_tokens": 36264693.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1315, "step_time": 20.20029328763485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 231.0, "completions/mean_terminated_length": 231.0, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.40278957784175873, "epoch": 0.06095414543770264, "frac_reward_zero_std": 0.0, "grad_norm": 0.09605013579130173, "kl": 0.0016143825778272003, "learning_rate": 9.878184344603982e-07, "loss": -0.1135, "num_tokens": 36287765.0, "reward": 0.09924251586198807, "reward_std": 0.26821577548980713, "rewards/reward_func/mean": 0.09924251586198807, "rewards/reward_func/std": 0.2682158052921295, "step": 1316, "step_time": 27.463576547801495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 199.5, "completions/mean_terminated_length": 199.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.20220749825239182, "epoch": 0.06100046317739694, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005912294727750123, "kl": 0.0007472722791135311, "learning_rate": 9.878091709124594e-07, "loss": 0.0, "num_tokens": 36311837.0, "reward": 0.9534969329833984, "reward_std": 0.0, "rewards/reward_func/mean": 0.9534969329833984, "rewards/reward_func/std": 0.0, "step": 1317, "step_time": 24.03943707793951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 171.9375, "completions/mean_terminated_length": 171.9375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.22347812727093697, "epoch": 0.061046780917091245, "frac_reward_zero_std": 0.0, "grad_norm": 0.08405770361423492, "kl": 0.0007646936719538644, "learning_rate": 9.877999073645205e-07, "loss": -0.0184, "num_tokens": 36340172.0, "reward": 0.8599967956542969, "reward_std": 0.08930132538080215, "rewards/reward_func/mean": 0.8599967956542969, "rewards/reward_func/std": 0.08930133283138275, "step": 1318, "step_time": 19.7347002774477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 173.875, "completions/mean_terminated_length": 173.875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.30847054719924927, "epoch": 0.06109309865678555, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024909113999456167, "kl": 0.0015048130590002984, "learning_rate": 9.877906438165818e-07, "loss": 0.0001, "num_tokens": 36360890.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1319, "step_time": 19.199145317077637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 117.1875, "completions/mean_terminated_length": 117.1875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2440360188484192, "epoch": 0.06113941639647985, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012768898159265518, "kl": 0.0012907666387036443, "learning_rate": 9.87781380268643e-07, "loss": 0.0001, "num_tokens": 36384269.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1320, "step_time": 13.399408016353846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 126.5, "completions/mean_terminated_length": 126.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3299025222659111, "epoch": 0.061185734136174154, "frac_reward_zero_std": 1.0, "grad_norm": 0.002699008211493492, "kl": 0.0015059859433677047, "learning_rate": 9.877721167207039e-07, "loss": 0.0001, "num_tokens": 36418437.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1321, "step_time": 18.462588392198086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 228.0, "completions/mean_terminated_length": 228.0, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.25493139028549194, "epoch": 0.06123205187586846, "frac_reward_zero_std": 0.0, "grad_norm": 0.10637736320495605, "kl": 0.0016697085520718247, "learning_rate": 9.87762853172765e-07, "loss": -0.0492, "num_tokens": 36443637.0, "reward": 0.6062300205230713, "reward_std": 0.02733754739165306, "rewards/reward_func/mean": 0.6062300205230713, "rewards/reward_func/std": 0.027337554842233658, "step": 1322, "step_time": 24.628826271742582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 164.9375, "completions/mean_terminated_length": 164.9375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3845409229397774, "epoch": 0.06127836961556276, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013907892862334847, "kl": 0.001733310054987669, "learning_rate": 9.877535896248263e-07, "loss": 0.0001, "num_tokens": 36480532.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1323, "step_time": 20.58817472308874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 126.3125, "completions/mean_terminated_length": 126.3125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.34343016892671585, "epoch": 0.06132468735525706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020387242548167706, "kl": 0.0015359602111857384, "learning_rate": 9.877443260768875e-07, "loss": 0.0001, "num_tokens": 36501257.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1324, "step_time": 13.899054154753685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 168.1875, "completions/mean_terminated_length": 168.1875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.329635426402092, "epoch": 0.061371005094951366, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009411834180355072, "kl": 0.0012850109487771988, "learning_rate": 9.877350625289486e-07, "loss": 0.0001, "num_tokens": 36523564.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1325, "step_time": 18.96903756260872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 173.625, "completions/mean_terminated_length": 173.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.379032664000988, "epoch": 0.06141732283464567, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011547692120075226, "kl": 0.001265781669644639, "learning_rate": 9.877257989810097e-07, "loss": 0.0001, "num_tokens": 36546198.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1326, "step_time": 19.25311341881752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 116.125, "completions/mean_terminated_length": 116.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.30368292331695557, "epoch": 0.06146364057433997, "frac_reward_zero_std": 1.0, "grad_norm": 0.000730156316421926, "kl": 0.0010984012042172253, "learning_rate": 9.877165354330708e-07, "loss": 0.0001, "num_tokens": 36567672.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1327, "step_time": 13.52169605344534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2857901155948639, "epoch": 0.061509958314034274, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018799130339175463, "kl": 0.001398787455400452, "learning_rate": 9.87707271885132e-07, "loss": 0.0001, "num_tokens": 36588944.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1328, "step_time": 15.058954171836376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 145.5625, "completions/mean_terminated_length": 145.5625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.32882945239543915, "epoch": 0.06155627605372858, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016379902372136712, "kl": 0.0013487855030689389, "learning_rate": 9.87698008337193e-07, "loss": 0.0001, "num_tokens": 36622473.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1329, "step_time": 18.894803293049335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 177.4375, "completions/mean_terminated_length": 177.4375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4255825951695442, "epoch": 0.06160259379342288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012570770923048258, "kl": 0.002042806096142158, "learning_rate": 9.876887447892542e-07, "loss": 0.0001, "num_tokens": 36673488.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1330, "step_time": 26.708086907863617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 205.25, "completions/mean_terminated_length": 205.25, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.45978182554244995, "epoch": 0.06164891153311718, "frac_reward_zero_std": 0.0, "grad_norm": 0.09499292820692062, "kl": 0.0015033484087325633, "learning_rate": 9.876794812413153e-07, "loss": 0.0735, "num_tokens": 36695524.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 1331, "step_time": 23.335980210453272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 198.1875, "completions/mean_terminated_length": 198.1875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.2236262559890747, "epoch": 0.061695229272811486, "frac_reward_zero_std": 0.0, "grad_norm": 0.07165606319904327, "kl": 0.001192555035231635, "learning_rate": 9.876702176933765e-07, "loss": -0.0652, "num_tokens": 36718039.0, "reward": 0.4348215162754059, "reward_std": 0.18556353449821472, "rewards/reward_func/mean": 0.4348215162754059, "rewards/reward_func/std": 0.18556353449821472, "step": 1332, "step_time": 21.416344843804836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.298039510846138, "epoch": 0.06174154701250579, "frac_reward_zero_std": 1.0, "grad_norm": 0.001903527183458209, "kl": 0.0014055129431653768, "learning_rate": 9.876609541454378e-07, "loss": 0.0001, "num_tokens": 36739675.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1333, "step_time": 15.189780503511429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 181.3125, "completions/mean_terminated_length": 181.3125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.29572784155607224, "epoch": 0.06178786475220009, "frac_reward_zero_std": 0.0, "grad_norm": 0.08701220899820328, "kl": 0.0014234512636903673, "learning_rate": 9.876516905974987e-07, "loss": 0.0053, "num_tokens": 36763840.0, "reward": 0.009960266761481762, "reward_std": 0.009204850532114506, "rewards/reward_func/mean": 0.009960266761481762, "rewards/reward_func/std": 0.00920485146343708, "step": 1334, "step_time": 21.215655487030745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 178.75, "completions/mean_terminated_length": 178.75, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.20869983360171318, "epoch": 0.061834182491894395, "frac_reward_zero_std": 0.0, "grad_norm": 0.06351931393146515, "kl": 0.0010749039647635072, "learning_rate": 9.876424270495598e-07, "loss": -0.0059, "num_tokens": 36801148.0, "reward": 0.9219220876693726, "reward_std": 0.02082076109945774, "rewards/reward_func/mean": 0.9219220876693726, "rewards/reward_func/std": 0.02082076668739319, "step": 1335, "step_time": 21.6966012082994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 125.1875, "completions/mean_terminated_length": 125.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3079006224870682, "epoch": 0.0618805002315887, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007926784455776215, "kl": 0.0009599182958481833, "learning_rate": 9.876331635016212e-07, "loss": 0.0, "num_tokens": 36822591.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1336, "step_time": 14.76633208990097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 224.1875, "completions/mean_terminated_length": 224.1875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.2204713374376297, "epoch": 0.061926817971283, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004801633767783642, "kl": 0.000696543458616361, "learning_rate": 9.876238999536823e-07, "loss": 0.0, "num_tokens": 36854370.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 1337, "step_time": 24.375063110142946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 235.75, "completions/mean_terminated_length": 235.75, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.4921463578939438, "epoch": 0.0619731357109773, "frac_reward_zero_std": 0.0, "grad_norm": 0.07507622241973877, "kl": 0.0015840266714803874, "learning_rate": 9.876146364057434e-07, "loss": 0.1382, "num_tokens": 36886046.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.44721361994743347, "step": 1338, "step_time": 28.953611817210913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 159.375, "completions/mean_terminated_length": 159.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.2921140268445015, "epoch": 0.062019453450671606, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016031945124268532, "kl": 0.0011878642835654318, "learning_rate": 9.876053728578045e-07, "loss": 0.0001, "num_tokens": 36909492.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1339, "step_time": 17.372348058968782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 133.5, "completions/mean_terminated_length": 133.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.31634822487831116, "epoch": 0.06206577119036591, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011940133990719914, "kl": 0.001183247019071132, "learning_rate": 9.875961093098657e-07, "loss": 0.0001, "num_tokens": 36931660.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1340, "step_time": 14.516185022890568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 115.0, "completions/mean_terminated_length": 115.0, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2151433601975441, "epoch": 0.06211208893006021, "frac_reward_zero_std": 1.0, "grad_norm": 0.002757185371592641, "kl": 0.0011798940249718726, "learning_rate": 9.875868457619268e-07, "loss": 0.0001, "num_tokens": 36951132.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1341, "step_time": 13.34553236886859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 287.5625, "completions/mean_terminated_length": 287.5625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.3032371699810028, "epoch": 0.062158406669754515, "frac_reward_zero_std": 0.0, "grad_norm": 0.05955071002244949, "kl": 0.0010868890822166577, "learning_rate": 9.87577582213988e-07, "loss": -0.0221, "num_tokens": 36990805.0, "reward": 0.43345510959625244, "reward_std": 0.11558802425861359, "rewards/reward_func/mean": 0.43345510959625244, "rewards/reward_func/std": 0.11558802425861359, "step": 1342, "step_time": 32.559711404144764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 196.8125, "completions/mean_terminated_length": 196.8125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.22023657709360123, "epoch": 0.06220472440944882, "frac_reward_zero_std": 0.0, "grad_norm": 0.06820940971374512, "kl": 0.0012514598201960325, "learning_rate": 9.87568318666049e-07, "loss": -0.0263, "num_tokens": 37015586.0, "reward": 0.9275899529457092, "reward_std": 0.035925447940826416, "rewards/reward_func/mean": 0.9275899529457092, "rewards/reward_func/std": 0.03592545539140701, "step": 1343, "step_time": 20.035180181264877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 205.375, "completions/mean_terminated_length": 205.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.1878780871629715, "epoch": 0.06225104214914312, "frac_reward_zero_std": 0.0, "grad_norm": 0.06910555064678192, "kl": 0.0008483437850372866, "learning_rate": 9.875590551181102e-07, "loss": 0.0081, "num_tokens": 37041144.0, "reward": 0.9754081964492798, "reward_std": 0.09836733341217041, "rewards/reward_func/mean": 0.9754081964492798, "rewards/reward_func/std": 0.09836734086275101, "step": 1344, "step_time": 20.38943938910961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 178.25, "completions/mean_terminated_length": 178.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.22444728761911392, "epoch": 0.062297359888837424, "frac_reward_zero_std": 1.0, "grad_norm": 0.001037672394886613, "kl": 0.0008089821494650096, "learning_rate": 9.875497915701713e-07, "loss": 0.0, "num_tokens": 37065548.0, "reward": 0.9487294554710388, "reward_std": 0.0, "rewards/reward_func/mean": 0.9487294554710388, "rewards/reward_func/std": 0.0, "step": 1345, "step_time": 18.363298401236534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 112.5625, "completions/mean_terminated_length": 112.5625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.297483429312706, "epoch": 0.06234367762853173, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014063261914998293, "kl": 0.0014439236838370562, "learning_rate": 9.875405280222324e-07, "loss": 0.0001, "num_tokens": 37086277.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1346, "step_time": 12.818291902542114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 169.5625, "completions/mean_terminated_length": 169.5625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.20191585645079613, "epoch": 0.06238999536822603, "frac_reward_zero_std": 1.0, "grad_norm": 0.004159863572567701, "kl": 0.0018049118225462735, "learning_rate": 9.875312644742935e-07, "loss": 0.0001, "num_tokens": 37108366.0, "reward": 0.8205257058143616, "reward_std": 0.0, "rewards/reward_func/mean": 0.8205257058143616, "rewards/reward_func/std": 0.0, "step": 1347, "step_time": 17.730277586728334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 180.6875, "completions/mean_terminated_length": 180.6875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.21539785712957382, "epoch": 0.06243631310792033, "frac_reward_zero_std": 0.0, "grad_norm": 0.09756869822740555, "kl": 0.0016226680018007755, "learning_rate": 9.875220009263547e-07, "loss": 0.0179, "num_tokens": 37131433.0, "reward": 0.9275797009468079, "reward_std": 0.04178958758711815, "rewards/reward_func/mean": 0.9275797009468079, "rewards/reward_func/std": 0.041789598762989044, "step": 1348, "step_time": 18.883345417678356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 275.0625, "completions/mean_terminated_length": 275.0625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.31261318176984787, "epoch": 0.062482630847614636, "frac_reward_zero_std": 0.0, "grad_norm": 0.06316092610359192, "kl": 0.0017653298273216933, "learning_rate": 9.875127373784158e-07, "loss": -0.1469, "num_tokens": 37169402.0, "reward": 0.4761143624782562, "reward_std": 0.3884202241897583, "rewards/reward_func/mean": 0.4761143624782562, "rewards/reward_func/std": 0.3884202241897583, "step": 1349, "step_time": 36.37331370264292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 138.9375, "completions/mean_terminated_length": 138.9375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3901263177394867, "epoch": 0.06252894858730894, "frac_reward_zero_std": 1.0, "grad_norm": 0.005902454722672701, "kl": 0.002515608735848218, "learning_rate": 9.875034738304771e-07, "loss": 0.0001, "num_tokens": 37191545.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1350, "step_time": 15.089044328778982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 217.125, "completions/mean_terminated_length": 217.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.2958965077996254, "epoch": 0.06257526632700325, "frac_reward_zero_std": 0.0, "grad_norm": 0.08561558276414871, "kl": 0.0015522035246249288, "learning_rate": 9.874942102825383e-07, "loss": -0.0959, "num_tokens": 37213915.0, "reward": 0.40241771936416626, "reward_std": 0.3842158317565918, "rewards/reward_func/mean": 0.40241771936416626, "rewards/reward_func/std": 0.3842158317565918, "step": 1351, "step_time": 22.39298490062356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 160.75, "completions/mean_terminated_length": 160.75, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.3948901891708374, "epoch": 0.06262158406669754, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008058904786594212, "kl": 0.0014029656304046512, "learning_rate": 9.874849467345992e-07, "loss": 0.0001, "num_tokens": 37248631.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1352, "step_time": 20.79153921827674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.2936493381857872, "epoch": 0.06266790180639185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019970559515058994, "kl": 0.0013638751115649939, "learning_rate": 9.874756831866605e-07, "loss": 0.0001, "num_tokens": 37270063.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1353, "step_time": 16.7253421805799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 135.1875, "completions/mean_terminated_length": 135.1875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3047530725598335, "epoch": 0.06271421954608615, "frac_reward_zero_std": 1.0, "grad_norm": 0.002005633432418108, "kl": 0.0013498670450644568, "learning_rate": 9.874664196387216e-07, "loss": 0.0001, "num_tokens": 37297634.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1354, "step_time": 16.681302469223738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 145.625, "completions/mean_terminated_length": 145.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.3298976346850395, "epoch": 0.06276053728578046, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014601984294131398, "kl": 0.0017668300424702466, "learning_rate": 9.874571560907828e-07, "loss": 0.0001, "num_tokens": 37328252.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1355, "step_time": 17.33435459434986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 176.5, "completions/mean_terminated_length": 176.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.14845091477036476, "epoch": 0.06280685502547476, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007003385107964277, "kl": 0.0007584914710605517, "learning_rate": 9.874478925428439e-07, "loss": 0.0, "num_tokens": 37351588.0, "reward": 0.92438805103302, "reward_std": 0.0, "rewards/reward_func/mean": 0.92438805103302, "rewards/reward_func/std": 0.0, "step": 1356, "step_time": 18.006115213036537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 138.1875, "completions/mean_terminated_length": 138.1875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.33333951979875565, "epoch": 0.06285317276516907, "frac_reward_zero_std": 1.0, "grad_norm": 0.004086427856236696, "kl": 0.001981131761567667, "learning_rate": 9.87438628994905e-07, "loss": 0.0001, "num_tokens": 37387639.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1357, "step_time": 18.09918538853526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 129.375, "completions/mean_terminated_length": 129.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.29088229686021805, "epoch": 0.06289949050486336, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008748992113396525, "kl": 0.0010925929382210597, "learning_rate": 9.874293654469661e-07, "loss": 0.0001, "num_tokens": 37413933.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1358, "step_time": 15.646902363747358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 198.1875, "completions/mean_terminated_length": 198.1875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.37544991075992584, "epoch": 0.06294580824455767, "frac_reward_zero_std": 0.0, "grad_norm": 0.0776798352599144, "kl": 0.0014099810214247555, "learning_rate": 9.874201018990273e-07, "loss": -0.0494, "num_tokens": 37435520.0, "reward": 0.11230379343032837, "reward_std": 0.30910545587539673, "rewards/reward_func/mean": 0.11230379343032837, "rewards/reward_func/std": 0.3091054856777191, "step": 1359, "step_time": 23.2933401837945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 164.125, "completions/mean_terminated_length": 164.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.22312795370817184, "epoch": 0.06299212598425197, "frac_reward_zero_std": 0.0, "grad_norm": 0.12842656672000885, "kl": 0.0025809964863583446, "learning_rate": 9.874108383510884e-07, "loss": -0.1071, "num_tokens": 37465122.0, "reward": 0.518261730670929, "reward_std": 0.41461795568466187, "rewards/reward_func/mean": 0.518261730670929, "rewards/reward_func/std": 0.41461798548698425, "step": 1360, "step_time": 20.39900228381157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 123.625, "completions/mean_terminated_length": 123.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.28706835955381393, "epoch": 0.06303844372394628, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009399523260071874, "kl": 0.0009960871102521196, "learning_rate": 9.874015748031495e-07, "loss": 0.0, "num_tokens": 37487004.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1361, "step_time": 14.537100818008184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 165.4375, "completions/mean_terminated_length": 165.4375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4404766261577606, "epoch": 0.06308476146364057, "frac_reward_zero_std": 1.0, "grad_norm": 0.004660594742745161, "kl": 0.002517607470508665, "learning_rate": 9.873923112552106e-07, "loss": 0.0001, "num_tokens": 37516483.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1362, "step_time": 20.90831706300378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 116.3125, "completions/mean_terminated_length": 116.3125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2569211646914482, "epoch": 0.06313107920333488, "frac_reward_zero_std": 1.0, "grad_norm": 0.003202579217031598, "kl": 0.0018702812667470425, "learning_rate": 9.87383047707272e-07, "loss": 0.0001, "num_tokens": 37536504.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1363, "step_time": 13.529532633721828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 213.5625, "completions/mean_terminated_length": 213.5625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.2348765693604946, "epoch": 0.06317739694302918, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009015243267640471, "kl": 0.0009753518679644912, "learning_rate": 9.87373784159333e-07, "loss": 0.0, "num_tokens": 37564641.0, "reward": 0.8510449528694153, "reward_std": 0.0, "rewards/reward_func/mean": 0.8510449528694153, "rewards/reward_func/std": 0.0, "step": 1364, "step_time": 23.56071775779128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 200.1875, "completions/mean_terminated_length": 200.1875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.4207221120595932, "epoch": 0.06322371468272349, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010644063586369157, "kl": 0.0013695932284463197, "learning_rate": 9.87364520611394e-07, "loss": 0.0001, "num_tokens": 37588404.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1365, "step_time": 22.19744211435318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 195.5, "completions/mean_terminated_length": 195.5, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.3812335655093193, "epoch": 0.06327003242241779, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017430639127269387, "kl": 0.0016534636088181287, "learning_rate": 9.873552570634553e-07, "loss": 0.0001, "num_tokens": 37619708.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1366, "step_time": 22.575360488146544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 201.1875, "completions/mean_terminated_length": 201.1875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.39354346692562103, "epoch": 0.0633163501621121, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020605064928531647, "kl": 0.0016023786447476596, "learning_rate": 9.873459935155165e-07, "loss": 0.0001, "num_tokens": 37653567.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1367, "step_time": 23.613397791981697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 145.4375, "completions/mean_terminated_length": 145.4375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.22626493498682976, "epoch": 0.06336266790180639, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026905538979917765, "kl": 0.0013218133826740086, "learning_rate": 9.873367299675776e-07, "loss": 0.0001, "num_tokens": 37673878.0, "reward": 0.8559471368789673, "reward_std": 0.0, "rewards/reward_func/mean": 0.8559471368789673, "rewards/reward_func/std": 0.0, "step": 1368, "step_time": 15.370013508945704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 156.9375, "completions/mean_terminated_length": 156.9375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3830214664340019, "epoch": 0.0634089856415007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009626580867916346, "kl": 0.0013536399346776307, "learning_rate": 9.873274664196387e-07, "loss": 0.0001, "num_tokens": 37716229.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1369, "step_time": 21.070292565971613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 260.1875, "completions/mean_terminated_length": 260.1875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.1670360341668129, "epoch": 0.063455303381195, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016005614306777716, "kl": 0.0009976603032555431, "learning_rate": 9.873182028716998e-07, "loss": 0.0, "num_tokens": 37755416.0, "reward": 0.9677302837371826, "reward_std": 0.0, "rewards/reward_func/mean": 0.9677302837371826, "rewards/reward_func/std": 0.0, "step": 1370, "step_time": 26.825051859021187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 182.625, "completions/mean_terminated_length": 182.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.17469500377774239, "epoch": 0.0635016211208893, "frac_reward_zero_std": 0.0, "grad_norm": 0.0870797410607338, "kl": 0.0008207607170334086, "learning_rate": 9.87308939323761e-07, "loss": 0.008, "num_tokens": 37778546.0, "reward": 0.33064115047454834, "reward_std": 0.1531805396080017, "rewards/reward_func/mean": 0.33064115047454834, "rewards/reward_func/std": 0.1531805396080017, "step": 1371, "step_time": 19.08588433265686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 129.0625, "completions/mean_terminated_length": 129.0625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3086008355021477, "epoch": 0.0635479388605836, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017839764477685094, "kl": 0.001566823193570599, "learning_rate": 9.87299675775822e-07, "loss": 0.0001, "num_tokens": 37799539.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1372, "step_time": 13.565544940531254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 181.8125, "completions/mean_terminated_length": 181.8125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.13856897875666618, "epoch": 0.06359425660027791, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004261926806066185, "kl": 0.0005177938946872018, "learning_rate": 9.872904122278832e-07, "loss": 0.0, "num_tokens": 37832592.0, "reward": 0.9167169332504272, "reward_std": 0.0, "rewards/reward_func/mean": 0.9167169332504272, "rewards/reward_func/std": 0.0, "step": 1373, "step_time": 19.823412846773863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 121.5, "completions/mean_terminated_length": 121.5, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.24950135871767998, "epoch": 0.06364057433997221, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011088309111073613, "kl": 0.001101298417779617, "learning_rate": 9.872811486799443e-07, "loss": 0.0001, "num_tokens": 37852216.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1374, "step_time": 12.886257383972406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 257.4375, "completions/mean_terminated_length": 257.4375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.35273078829050064, "epoch": 0.06368689207966652, "frac_reward_zero_std": 0.0, "grad_norm": 0.061511896550655365, "kl": 0.0013419757597148418, "learning_rate": 9.872718851320055e-07, "loss": -0.0959, "num_tokens": 37887695.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.44721361994743347, "step": 1375, "step_time": 29.076378416270018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 132.6875, "completions/mean_terminated_length": 132.6875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2635773569345474, "epoch": 0.06373320981936081, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016760352300480008, "kl": 0.001321380288572982, "learning_rate": 9.872626215840668e-07, "loss": 0.0001, "num_tokens": 37912490.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1376, "step_time": 15.871300362050533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 107.4375, "completions/mean_terminated_length": 107.4375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.28921304643154144, "epoch": 0.06377952755905512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010980983497574925, "kl": 0.0013212210615165532, "learning_rate": 9.872533580361277e-07, "loss": 0.0001, "num_tokens": 37935649.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1377, "step_time": 13.701858673244715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 184.0, "completions/mean_terminated_length": 184.0, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.3393339291214943, "epoch": 0.06382584529874942, "frac_reward_zero_std": 0.0, "grad_norm": 0.12185642123222351, "kl": 0.002667451975867152, "learning_rate": 9.872440944881888e-07, "loss": -0.0369, "num_tokens": 37956753.0, "reward": 0.1875, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.1875, "rewards/reward_func/std": 0.40311288833618164, "step": 1378, "step_time": 20.53817980736494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.22990813478827477, "epoch": 0.06387216303844373, "frac_reward_zero_std": 0.0, "grad_norm": 0.08274000138044357, "kl": 0.0011493629135657102, "learning_rate": 9.8723483094025e-07, "loss": -0.0076, "num_tokens": 37977931.0, "reward": 0.5090391635894775, "reward_std": 0.05525410920381546, "rewards/reward_func/mean": 0.5090391635894775, "rewards/reward_func/std": 0.05525410547852516, "step": 1379, "step_time": 17.247316155582666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 187.25, "completions/mean_terminated_length": 187.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.4396393895149231, "epoch": 0.06391848077813803, "frac_reward_zero_std": 0.0, "grad_norm": 0.10089975595474243, "kl": 0.0018845154263544828, "learning_rate": 9.872255673923113e-07, "loss": 0.0387, "num_tokens": 38000111.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 1380, "step_time": 21.832327533513308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 175.0625, "completions/mean_terminated_length": 175.0625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.3795551508665085, "epoch": 0.06396479851783234, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013261609710752964, "kl": 0.0017147823236882687, "learning_rate": 9.872163038443724e-07, "loss": 0.0001, "num_tokens": 38021600.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1381, "step_time": 18.155334655195475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 200.125, "completions/mean_terminated_length": 200.125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.4547497630119324, "epoch": 0.06401111625752663, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017529228935018182, "kl": 0.0019403231563046575, "learning_rate": 9.872070402964335e-07, "loss": 0.0001, "num_tokens": 38046546.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1382, "step_time": 24.914843030273914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 161.125, "completions/mean_terminated_length": 161.125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3788337707519531, "epoch": 0.06405743399722094, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012503358302637935, "kl": 0.0014750408881809562, "learning_rate": 9.871977767484947e-07, "loss": 0.0001, "num_tokens": 38067924.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1383, "step_time": 18.031631872057915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 155.8125, "completions/mean_terminated_length": 155.8125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.1991540491580963, "epoch": 0.06410375173691524, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019283011788502336, "kl": 0.001558351592393592, "learning_rate": 9.871885132005558e-07, "loss": 0.0001, "num_tokens": 38092593.0, "reward": 0.951229453086853, "reward_std": 0.0, "rewards/reward_func/mean": 0.951229453086853, "rewards/reward_func/std": 0.0, "step": 1384, "step_time": 18.598839037120342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 158.625, "completions/mean_terminated_length": 158.625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.3112274408340454, "epoch": 0.06415006947660955, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011574198724702, "kl": 0.0010043757501989603, "learning_rate": 9.87179249652617e-07, "loss": 0.0001, "num_tokens": 38115899.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1385, "step_time": 18.114937491714954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 199.25, "completions/mean_terminated_length": 199.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.44041285663843155, "epoch": 0.06419638721630384, "frac_reward_zero_std": 1.0, "grad_norm": 0.001231481204740703, "kl": 0.001540001918328926, "learning_rate": 9.87169986104678e-07, "loss": 0.0001, "num_tokens": 38138351.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1386, "step_time": 20.419554706662893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 173.5625, "completions/mean_terminated_length": 173.5625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.37382249534130096, "epoch": 0.06424270495599815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014059273526072502, "kl": 0.0013032660936005414, "learning_rate": 9.871607225567392e-07, "loss": 0.0001, "num_tokens": 38170504.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1387, "step_time": 20.110991090536118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 221.125, "completions/mean_terminated_length": 221.125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.47965505719184875, "epoch": 0.06428902269569245, "frac_reward_zero_std": 0.0, "grad_norm": 0.07660506665706635, "kl": 0.0021281543886289, "learning_rate": 9.871514590088003e-07, "loss": -0.0959, "num_tokens": 38201098.0, "reward": 0.0021462831646203995, "reward_std": 0.008585091680288315, "rewards/reward_func/mean": 0.0021462831646203995, "rewards/reward_func/std": 0.008585091680288315, "step": 1388, "step_time": 28.15257778763771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.34423044323921204, "epoch": 0.06433534043538676, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012541321339085698, "kl": 0.0014289073587860912, "learning_rate": 9.871421954608614e-07, "loss": 0.0001, "num_tokens": 38226082.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1389, "step_time": 16.990872882306576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 208.375, "completions/mean_terminated_length": 208.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.37246695905923843, "epoch": 0.06438165817508106, "frac_reward_zero_std": 0.0, "grad_norm": 0.09345704317092896, "kl": 0.0019007799564860761, "learning_rate": 9.871329319129225e-07, "loss": -0.0781, "num_tokens": 38250008.0, "reward": 0.28770166635513306, "reward_std": 0.2688891291618347, "rewards/reward_func/mean": 0.28770166635513306, "rewards/reward_func/std": 0.2688891291618347, "step": 1390, "step_time": 21.541621766984463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 135.875, "completions/mean_terminated_length": 135.875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.24001146480441093, "epoch": 0.06442797591477537, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012315355706959963, "kl": 0.0012011609505861998, "learning_rate": 9.871236683649837e-07, "loss": 0.0001, "num_tokens": 38269638.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1391, "step_time": 15.285881139338017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 156.375, "completions/mean_terminated_length": 156.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.44844141602516174, "epoch": 0.06447429365446966, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013259457191452384, "kl": 0.0015823344583623111, "learning_rate": 9.871144048170448e-07, "loss": 0.0001, "num_tokens": 38312812.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1392, "step_time": 21.187436882406473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 131.8125, "completions/mean_terminated_length": 131.8125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.31581492722034454, "epoch": 0.06452061139416397, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014123879373073578, "kl": 0.0013365675986278802, "learning_rate": 9.871051412691061e-07, "loss": 0.0001, "num_tokens": 38332729.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1393, "step_time": 14.641268495470285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 151.75, "completions/mean_terminated_length": 151.75, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.14506584033370018, "epoch": 0.06456692913385827, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016818299191072583, "kl": 0.0008369435818167403, "learning_rate": 9.870958777211673e-07, "loss": 0.0, "num_tokens": 38369861.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 1394, "step_time": 20.446988452225924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3028858155012131, "epoch": 0.06461324687355258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013620060635730624, "kl": 0.0012630632263608277, "learning_rate": 9.870866141732282e-07, "loss": 0.0001, "num_tokens": 38391037.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1395, "step_time": 15.609694961458445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 173.875, "completions/mean_terminated_length": 173.875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.36677612364292145, "epoch": 0.06465956461324687, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024252363946288824, "kl": 0.0016025640070438385, "learning_rate": 9.870773506252895e-07, "loss": 0.0001, "num_tokens": 38417067.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1396, "step_time": 20.601425986737013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 220.8125, "completions/mean_terminated_length": 220.8125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3483920767903328, "epoch": 0.06470588235294118, "frac_reward_zero_std": 0.0, "grad_norm": 0.08609218895435333, "kl": 0.0013980403309687972, "learning_rate": 9.870680870773506e-07, "loss": -0.0195, "num_tokens": 38441176.0, "reward": 0.6875, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.6875, "rewards/reward_func/std": 0.4787135720252991, "step": 1397, "step_time": 29.755803864449263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 189.6875, "completions/mean_terminated_length": 189.6875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.30657874792814255, "epoch": 0.06475220009263548, "frac_reward_zero_std": 0.0, "grad_norm": 0.07561224699020386, "kl": 0.0014870265440549701, "learning_rate": 9.870588235294118e-07, "loss": 0.0254, "num_tokens": 38467299.0, "reward": 0.41634833812713623, "reward_std": 0.33341115713119507, "rewards/reward_func/mean": 0.41634833812713623, "rewards/reward_func/std": 0.3334111273288727, "step": 1398, "step_time": 21.73190562427044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 167.25, "completions/mean_terminated_length": 167.25, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.21688029915094376, "epoch": 0.06479851783232979, "frac_reward_zero_std": 1.0, "grad_norm": 0.001277267001569271, "kl": 0.0009995947912102565, "learning_rate": 9.870495599814729e-07, "loss": 0.0, "num_tokens": 38490231.0, "reward": 0.9428731203079224, "reward_std": 0.0, "rewards/reward_func/mean": 0.9428731203079224, "rewards/reward_func/std": 0.0, "step": 1399, "step_time": 17.318347416818142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 146.5, "completions/mean_terminated_length": 146.5, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.2146575041115284, "epoch": 0.06484483557202408, "frac_reward_zero_std": 1.0, "grad_norm": 0.00092996348394081, "kl": 0.0008030234603211284, "learning_rate": 9.87040296433534e-07, "loss": 0.0, "num_tokens": 38513119.0, "reward": 0.8824968934059143, "reward_std": 0.0, "rewards/reward_func/mean": 0.8824968934059143, "rewards/reward_func/std": 0.0, "step": 1400, "step_time": 16.49129395186901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 190.6875, "completions/mean_terminated_length": 190.6875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.2414935976266861, "epoch": 0.0648911533117184, "frac_reward_zero_std": 0.0, "grad_norm": 0.10723493993282318, "kl": 0.0012726300628855824, "learning_rate": 9.870310328855951e-07, "loss": 0.0472, "num_tokens": 38537002.0, "reward": 0.5636385679244995, "reward_std": 0.15132883191108704, "rewards/reward_func/mean": 0.5636385679244995, "rewards/reward_func/std": 0.15132883191108704, "step": 1401, "step_time": 21.77669233083725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 125.375, "completions/mean_terminated_length": 125.375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2189713902771473, "epoch": 0.06493747105141269, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011065290309488773, "kl": 0.0009664138196967542, "learning_rate": 9.870217693376563e-07, "loss": 0.0, "num_tokens": 38556944.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1402, "step_time": 13.41659290716052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 186.1875, "completions/mean_terminated_length": 186.1875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.15766077861189842, "epoch": 0.064983788791107, "frac_reward_zero_std": 1.0, "grad_norm": 0.006077347788959742, "kl": 0.0014418928913073614, "learning_rate": 9.870125057897174e-07, "loss": 0.0001, "num_tokens": 38582371.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1403, "step_time": 19.582397300750017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 172.1875, "completions/mean_terminated_length": 172.1875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.2059054709970951, "epoch": 0.0650301065308013, "frac_reward_zero_std": 1.0, "grad_norm": 0.002235325751826167, "kl": 0.0010782177560031414, "learning_rate": 9.870032422417785e-07, "loss": 0.0001, "num_tokens": 38605942.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1404, "step_time": 18.263175208121538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 156.9375, "completions/mean_terminated_length": 156.9375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.1598551720380783, "epoch": 0.0650764242704956, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006986630032770336, "kl": 0.0007289715140359476, "learning_rate": 9.869939786938396e-07, "loss": 0.0, "num_tokens": 38636293.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 1405, "step_time": 18.041317779570818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 162.3125, "completions/mean_terminated_length": 162.3125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.2856503203511238, "epoch": 0.0651227420101899, "frac_reward_zero_std": 0.0, "grad_norm": 0.15712042152881622, "kl": 0.0012672470038523898, "learning_rate": 9.86984715145901e-07, "loss": 0.0381, "num_tokens": 38657098.0, "reward": 0.862541675567627, "reward_std": 0.23001109063625336, "rewards/reward_func/mean": 0.862541675567627, "rewards/reward_func/std": 0.23001112043857574, "step": 1406, "step_time": 18.65338582545519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 158.375, "completions/mean_terminated_length": 158.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.18908322602510452, "epoch": 0.06516905974988421, "frac_reward_zero_std": 0.0, "grad_norm": 0.09071298688650131, "kl": 0.0018847110186470672, "learning_rate": 9.86975451597962e-07, "loss": -0.0321, "num_tokens": 38693984.0, "reward": 0.9001584053039551, "reward_std": 0.05953400582075119, "rewards/reward_func/mean": 0.9001584053039551, "rewards/reward_func/std": 0.05953400954604149, "step": 1407, "step_time": 20.77293001487851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 122.75, "completions/mean_terminated_length": 122.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3009890466928482, "epoch": 0.06521537748957851, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008008633740246296, "kl": 0.0011896536743734032, "learning_rate": 9.86966188050023e-07, "loss": 0.0001, "num_tokens": 38714764.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1408, "step_time": 15.979082588106394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 157.0625, "completions/mean_terminated_length": 157.0625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.1599237322807312, "epoch": 0.06526169522927282, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008259209571406245, "kl": 0.0008421517413808033, "learning_rate": 9.869569245020841e-07, "loss": 0.0, "num_tokens": 38737021.0, "reward": 0.9394130706787109, "reward_std": 0.0, "rewards/reward_func/mean": 0.9394130706787109, "rewards/reward_func/std": 0.0, "step": 1409, "step_time": 17.57414334639907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 148.6875, "completions/mean_terminated_length": 148.6875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.32970255613327026, "epoch": 0.06530801296896711, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008611916564404964, "kl": 0.0013319231511559337, "learning_rate": 9.869476609541455e-07, "loss": 0.0001, "num_tokens": 38768536.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1410, "step_time": 18.89270857349038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 198.5, "completions/mean_terminated_length": 198.5, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.30774350464344025, "epoch": 0.06535433070866142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006944688502699137, "kl": 0.0010836839501280338, "learning_rate": 9.869383974062066e-07, "loss": 0.0001, "num_tokens": 38793552.0, "reward": 0.38889557123184204, "reward_std": 0.0, "rewards/reward_func/mean": 0.38889557123184204, "rewards/reward_func/std": 0.0, "step": 1411, "step_time": 21.76561936363578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 139.8125, "completions/mean_terminated_length": 139.8125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.28513088822364807, "epoch": 0.06540064844835572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019855641294270754, "kl": 0.0013956335897091776, "learning_rate": 9.869291338582677e-07, "loss": 0.0001, "num_tokens": 38813325.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1412, "step_time": 15.08096693456173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 127.6875, "completions/mean_terminated_length": 127.6875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.35748113691806793, "epoch": 0.06544696618805003, "frac_reward_zero_std": 1.0, "grad_norm": 0.001560273696668446, "kl": 0.001687679992755875, "learning_rate": 9.869198703103288e-07, "loss": 0.0001, "num_tokens": 38840872.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1413, "step_time": 16.32881325483322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 190.625, "completions/mean_terminated_length": 190.625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.12357214279472828, "epoch": 0.06549328392774433, "frac_reward_zero_std": 0.0, "grad_norm": 0.06670991331338882, "kl": 0.0006051260425010696, "learning_rate": 9.8691060676239e-07, "loss": -0.004, "num_tokens": 38873682.0, "reward": 0.9907451272010803, "reward_std": 0.02528911456465721, "rewards/reward_func/mean": 0.9907451272010803, "rewards/reward_func/std": 0.025289107114076614, "step": 1414, "step_time": 21.369696903973818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 151.3125, "completions/mean_terminated_length": 151.3125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.35167011618614197, "epoch": 0.06553960166743864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015633010771125555, "kl": 0.0016057019238360226, "learning_rate": 9.86901343214451e-07, "loss": 0.0001, "num_tokens": 38895799.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1415, "step_time": 15.864126328378916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 167.75, "completions/mean_terminated_length": 167.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.22239339724183083, "epoch": 0.06558591940713293, "frac_reward_zero_std": 0.0, "grad_norm": 0.09842777997255325, "kl": 0.0010389309318270534, "learning_rate": 9.868920796665122e-07, "loss": 0.0034, "num_tokens": 38916995.0, "reward": 0.39312544465065, "reward_std": 0.007969260215759277, "rewards/reward_func/mean": 0.39312544465065, "rewards/reward_func/std": 0.007969260215759277, "step": 1416, "step_time": 16.630940418690443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 159.25, "completions/mean_terminated_length": 159.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3057669699192047, "epoch": 0.06563223714682724, "frac_reward_zero_std": 1.0, "grad_norm": 0.002081182785332203, "kl": 0.001386734249535948, "learning_rate": 9.868828161185733e-07, "loss": 0.0001, "num_tokens": 38939975.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1417, "step_time": 16.80906194075942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 151.3125, "completions/mean_terminated_length": 151.3125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.33575063198804855, "epoch": 0.06567855488652154, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010036277817562222, "kl": 0.0013450011028908193, "learning_rate": 9.868735525706345e-07, "loss": 0.0001, "num_tokens": 38966508.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1418, "step_time": 16.723738331347704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 130.25, "completions/mean_terminated_length": 130.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.33988042175769806, "epoch": 0.06572487262621585, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010299875866621733, "kl": 0.0013958606286905706, "learning_rate": 9.868642890226956e-07, "loss": 0.0001, "num_tokens": 38999232.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1419, "step_time": 17.246197946369648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 141.4375, "completions/mean_terminated_length": 141.4375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3560640439391136, "epoch": 0.06577119036591014, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010152118047699332, "kl": 0.001361678441753611, "learning_rate": 9.868550254747567e-07, "loss": 0.0001, "num_tokens": 39021911.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1420, "step_time": 15.976904805749655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 171.75, "completions/mean_terminated_length": 171.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4093324691057205, "epoch": 0.06581750810560445, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015361782861873507, "kl": 0.0014861428062431514, "learning_rate": 9.868457619268178e-07, "loss": 0.0001, "num_tokens": 39049523.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1421, "step_time": 19.95411391928792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 196.8125, "completions/mean_terminated_length": 196.8125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.193643506616354, "epoch": 0.06586382584529875, "frac_reward_zero_std": 0.0, "grad_norm": 0.08457839488983154, "kl": 0.0010921327047981322, "learning_rate": 9.86836498378879e-07, "loss": -0.0378, "num_tokens": 39087440.0, "reward": 0.9686717987060547, "reward_std": 0.12531274557113647, "rewards/reward_func/mean": 0.9686717987060547, "rewards/reward_func/std": 0.12531273066997528, "step": 1422, "step_time": 22.384692903608084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 124.875, "completions/mean_terminated_length": 124.875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.25259891524910927, "epoch": 0.06591014358499306, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011868654983118176, "kl": 0.000967387793934904, "learning_rate": 9.868272348309403e-07, "loss": 0.0, "num_tokens": 39110910.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1423, "step_time": 14.067098706960678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 106.5, "completions/mean_terminated_length": 106.5, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3011389747262001, "epoch": 0.06595646132468735, "frac_reward_zero_std": 1.0, "grad_norm": 0.006358596961945295, "kl": 0.0024511981464456767, "learning_rate": 9.868179712830014e-07, "loss": 0.0001, "num_tokens": 39132022.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1424, "step_time": 12.666402902454138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 162.0, "completions/mean_terminated_length": 162.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3955685421824455, "epoch": 0.06600277906438166, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008592951926402748, "kl": 0.0011703874333761632, "learning_rate": 9.868087077350626e-07, "loss": 0.0001, "num_tokens": 39156838.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1425, "step_time": 18.08092623576522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 155.0625, "completions/mean_terminated_length": 155.0625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3590179830789566, "epoch": 0.06604909680407596, "frac_reward_zero_std": 1.0, "grad_norm": 0.002121493685990572, "kl": 0.0016639675304759294, "learning_rate": 9.867994441871237e-07, "loss": 0.0001, "num_tokens": 39193223.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1426, "step_time": 19.95289271697402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 197.9375, "completions/mean_terminated_length": 197.9375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.31179778277873993, "epoch": 0.06609541454377027, "frac_reward_zero_std": 0.0, "grad_norm": 0.1187836229801178, "kl": 0.002213226573076099, "learning_rate": 9.867901806391848e-07, "loss": 0.0492, "num_tokens": 39224806.0, "reward": 0.7406715154647827, "reward_std": 0.28912854194641113, "rewards/reward_func/mean": 0.7406715154647827, "rewards/reward_func/std": 0.2891285717487335, "step": 1427, "step_time": 25.411048222333193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 194.5625, "completions/mean_terminated_length": 194.5625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.2525024935603142, "epoch": 0.06614173228346457, "frac_reward_zero_std": 0.0, "grad_norm": 0.09274663776159286, "kl": 0.0013812848483212292, "learning_rate": 9.86780917091246e-07, "loss": -0.001, "num_tokens": 39249119.0, "reward": 0.6005731225013733, "reward_std": 0.16380806267261505, "rewards/reward_func/mean": 0.6005731225013733, "rewards/reward_func/std": 0.16380807757377625, "step": 1428, "step_time": 20.594968132674694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 189.3125, "completions/mean_terminated_length": 189.3125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.23059514537453651, "epoch": 0.06618805002315888, "frac_reward_zero_std": 0.0, "grad_norm": 0.062491029500961304, "kl": 0.0010999125079251826, "learning_rate": 9.86771653543307e-07, "loss": -0.0285, "num_tokens": 39272916.0, "reward": 0.9853121042251587, "reward_std": 0.04013495892286301, "rewards/reward_func/mean": 0.9853121042251587, "rewards/reward_func/std": 0.04013495147228241, "step": 1429, "step_time": 19.905599888414145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 184.5625, "completions/mean_terminated_length": 184.5625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.20655705034732819, "epoch": 0.06623436776285317, "frac_reward_zero_std": 0.0, "grad_norm": 0.14017370343208313, "kl": 0.0018786978616844863, "learning_rate": 9.867623899953682e-07, "loss": -0.0422, "num_tokens": 39297725.0, "reward": 0.6085429787635803, "reward_std": 0.35655471682548523, "rewards/reward_func/mean": 0.6085429787635803, "rewards/reward_func/std": 0.3565547466278076, "step": 1430, "step_time": 19.59533415362239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 112.375, "completions/mean_terminated_length": 112.375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.30003518611192703, "epoch": 0.06628068550254748, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011676916619762778, "kl": 0.0016723115113563836, "learning_rate": 9.867531264474293e-07, "loss": 0.0001, "num_tokens": 39318339.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1431, "step_time": 12.637915696948767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 183.1875, "completions/mean_terminated_length": 183.1875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.1929231435060501, "epoch": 0.06632700324224178, "frac_reward_zero_std": 0.0, "grad_norm": 0.07338245958089828, "kl": 0.0008385853288928047, "learning_rate": 9.867438628994904e-07, "loss": -0.012, "num_tokens": 39348022.0, "reward": 0.6517519950866699, "reward_std": 0.08575544506311417, "rewards/reward_func/mean": 0.6517519950866699, "rewards/reward_func/std": 0.08575543761253357, "step": 1432, "step_time": 19.820004228502512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 198.9375, "completions/mean_terminated_length": 198.9375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.35739368200302124, "epoch": 0.06637332098193609, "frac_reward_zero_std": 0.0, "grad_norm": 0.09327326714992523, "kl": 0.0018914308166131377, "learning_rate": 9.867345993515516e-07, "loss": 0.1309, "num_tokens": 39370981.0, "reward": 0.6875, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.6875, "rewards/reward_func/std": 0.4787135720252991, "step": 1433, "step_time": 22.179261937737465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 204.0, "completions/mean_terminated_length": 204.0, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.3161913976073265, "epoch": 0.06641963872163038, "frac_reward_zero_std": 0.0, "grad_norm": 0.08861406147480011, "kl": 0.0019559416105039418, "learning_rate": 9.867253358036127e-07, "loss": -0.0142, "num_tokens": 39408389.0, "reward": 0.9501120448112488, "reward_std": 0.014566393569111824, "rewards/reward_func/mean": 0.9501120448112488, "rewards/reward_func/std": 0.014566399157047272, "step": 1434, "step_time": 25.339416343718767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 202.9375, "completions/mean_terminated_length": 202.9375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.36778371781110764, "epoch": 0.0664659564613247, "frac_reward_zero_std": 0.0, "grad_norm": 0.0415370799601078, "kl": 0.0016239224059972912, "learning_rate": 9.867160722556738e-07, "loss": -0.0364, "num_tokens": 39431428.0, "reward": 5.785605026176199e-05, "reward_std": 0.0001243867736775428, "rewards/reward_func/mean": 5.785605026176199e-05, "rewards/reward_func/std": 0.0001243867736775428, "step": 1435, "step_time": 28.81412662193179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 176.6875, "completions/mean_terminated_length": 176.6875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.36970124393701553, "epoch": 0.06651227420101899, "frac_reward_zero_std": 1.0, "grad_norm": 0.00309493625536561, "kl": 0.00200174271594733, "learning_rate": 9.867068087077351e-07, "loss": 0.0001, "num_tokens": 39464127.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1436, "step_time": 20.601984571665525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 132.875, "completions/mean_terminated_length": 132.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3056706562638283, "epoch": 0.0665585919407133, "frac_reward_zero_std": 1.0, "grad_norm": 0.001805571955628693, "kl": 0.0016618611116427928, "learning_rate": 9.866975451597963e-07, "loss": 0.0001, "num_tokens": 39484637.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1437, "step_time": 14.69915597513318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 181.8125, "completions/mean_terminated_length": 181.8125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.32961641252040863, "epoch": 0.0666049096804076, "frac_reward_zero_std": 0.0, "grad_norm": 0.13558849692344666, "kl": 0.0017127757600974292, "learning_rate": 9.866882816118572e-07, "loss": 0.0372, "num_tokens": 39506714.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 1438, "step_time": 21.642262279987335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 172.625, "completions/mean_terminated_length": 172.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.4146069064736366, "epoch": 0.0666512274201019, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012880455469712615, "kl": 0.0014203835744410753, "learning_rate": 9.866790180639183e-07, "loss": 0.0001, "num_tokens": 39528004.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1439, "step_time": 19.393419571220875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 128.5625, "completions/mean_terminated_length": 128.5625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2672436833381653, "epoch": 0.0666975451597962, "frac_reward_zero_std": 1.0, "grad_norm": 0.001372100436128676, "kl": 0.0011037438380299136, "learning_rate": 9.866697545159796e-07, "loss": 0.0001, "num_tokens": 39551101.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1440, "step_time": 14.55172961205244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 183.6875, "completions/mean_terminated_length": 183.6875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.2265946827828884, "epoch": 0.06674386289949051, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018938696011900902, "kl": 0.0014089698670431972, "learning_rate": 9.866604909680408e-07, "loss": 0.0001, "num_tokens": 39573240.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1441, "step_time": 19.77555612847209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 262.8125, "completions/mean_terminated_length": 262.8125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.42162948846817017, "epoch": 0.0667901806391848, "frac_reward_zero_std": 0.0, "grad_norm": 0.07937724143266678, "kl": 0.0015220747154671699, "learning_rate": 9.866512274201019e-07, "loss": -0.2059, "num_tokens": 39602725.0, "reward": 0.17238262295722961, "reward_std": 0.36804524064064026, "rewards/reward_func/mean": 0.17238262295722961, "rewards/reward_func/std": 0.36804524064064026, "step": 1442, "step_time": 33.67408147081733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 130.625, "completions/mean_terminated_length": 130.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3209586590528488, "epoch": 0.06683649837887912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019692759960889816, "kl": 0.0015766523429192603, "learning_rate": 9.86641963872163e-07, "loss": 0.0001, "num_tokens": 39638527.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1443, "step_time": 17.754509408026934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 118.375, "completions/mean_terminated_length": 118.375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.288492776453495, "epoch": 0.06688281611857341, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015787968877702951, "kl": 0.001139527521445416, "learning_rate": 9.866327003242241e-07, "loss": 0.0001, "num_tokens": 39659685.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1444, "step_time": 14.985423538833857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 180.6875, "completions/mean_terminated_length": 180.6875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.38609933108091354, "epoch": 0.06692913385826772, "frac_reward_zero_std": 1.0, "grad_norm": 0.004827553406357765, "kl": 0.0024070857325568795, "learning_rate": 9.866234367762853e-07, "loss": 0.0001, "num_tokens": 39694064.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1445, "step_time": 21.36505849659443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 153.5, "completions/mean_terminated_length": 153.5, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4466777443885803, "epoch": 0.06697545159796202, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007941923686303198, "kl": 0.0013845481153111905, "learning_rate": 9.866141732283464e-07, "loss": 0.0001, "num_tokens": 39732296.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1446, "step_time": 20.569972027093172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 160.8125, "completions/mean_terminated_length": 160.8125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.4187733381986618, "epoch": 0.06702176933765633, "frac_reward_zero_std": 1.0, "grad_norm": 0.004035596270114183, "kl": 0.0014605073956772685, "learning_rate": 9.866049096804075e-07, "loss": 0.0001, "num_tokens": 39760149.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1447, "step_time": 18.4068068228662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 155.75, "completions/mean_terminated_length": 155.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.21700581908226013, "epoch": 0.06706808707735062, "frac_reward_zero_std": 0.0, "grad_norm": 0.1402587741613388, "kl": 0.0012336500367382541, "learning_rate": 9.865956461324686e-07, "loss": 0.1356, "num_tokens": 39786785.0, "reward": 0.42196887731552124, "reward_std": 0.11329527944326401, "rewards/reward_func/mean": 0.42196887731552124, "rewards/reward_func/std": 0.1132952868938446, "step": 1448, "step_time": 21.55864104256034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 169.5625, "completions/mean_terminated_length": 169.5625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.3580230697989464, "epoch": 0.06711440481704493, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010121813975274563, "kl": 0.001544493658002466, "learning_rate": 9.865863825845298e-07, "loss": 0.0001, "num_tokens": 39839178.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1449, "step_time": 24.623450193554163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 162.875, "completions/mean_terminated_length": 162.875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.1634041853249073, "epoch": 0.06716072255673923, "frac_reward_zero_std": 1.0, "grad_norm": 0.000649867404717952, "kl": 0.0007056358299450949, "learning_rate": 9.86577119036591e-07, "loss": 0.0, "num_tokens": 39875624.0, "reward": 0.8668779134750366, "reward_std": 0.0, "rewards/reward_func/mean": 0.8668779134750366, "rewards/reward_func/std": 0.0, "step": 1450, "step_time": 20.678642854094505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 163.8125, "completions/mean_terminated_length": 163.8125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.15259314700961113, "epoch": 0.06720704029643354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006768021266907454, "kl": 0.0007882158679421991, "learning_rate": 9.86567855488652e-07, "loss": 0.0, "num_tokens": 39902101.0, "reward": 0.9428731203079224, "reward_std": 0.0, "rewards/reward_func/mean": 0.9428731203079224, "rewards/reward_func/std": 0.0, "step": 1451, "step_time": 17.91073151677847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 133.6875, "completions/mean_terminated_length": 133.6875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2859661877155304, "epoch": 0.06725335803612784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023830877617001534, "kl": 0.0018887875776272267, "learning_rate": 9.865585919407131e-07, "loss": 0.0001, "num_tokens": 39922688.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1452, "step_time": 15.119699243456125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 221.75, "completions/mean_terminated_length": 221.75, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.29905904084444046, "epoch": 0.06729967577582215, "frac_reward_zero_std": 0.0, "grad_norm": 0.06141403689980507, "kl": 0.0010330545774195343, "learning_rate": 9.865493283927745e-07, "loss": 0.0127, "num_tokens": 39961964.0, "reward": 0.8056573867797852, "reward_std": 0.10677226632833481, "rewards/reward_func/mean": 0.8056573867797852, "rewards/reward_func/std": 0.10677226632833481, "step": 1453, "step_time": 27.148686934262514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 231.9375, "completions/mean_terminated_length": 231.9375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.2375781461596489, "epoch": 0.06734599351551644, "frac_reward_zero_std": 0.0, "grad_norm": 0.09551922231912613, "kl": 0.001087108044885099, "learning_rate": 9.865400648448356e-07, "loss": 0.011, "num_tokens": 39992699.0, "reward": 0.9854298830032349, "reward_std": 0.019426822662353516, "rewards/reward_func/mean": 0.9854298830032349, "rewards/reward_func/std": 0.019426824524998665, "step": 1454, "step_time": 23.67501976713538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 146.4375, "completions/mean_terminated_length": 146.4375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3509632274508476, "epoch": 0.06739231125521075, "frac_reward_zero_std": 1.0, "grad_norm": 0.005953608546406031, "kl": 0.0023052508768159896, "learning_rate": 9.865308012968967e-07, "loss": 0.0001, "num_tokens": 40013906.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1455, "step_time": 16.13047195971012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 165.1875, "completions/mean_terminated_length": 165.1875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.2485116682946682, "epoch": 0.06743862899490505, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007991045131348073, "kl": 0.0010293241939507425, "learning_rate": 9.865215377489578e-07, "loss": 0.0001, "num_tokens": 40034709.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 1456, "step_time": 17.48965096846223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 180.0, "completions/mean_terminated_length": 180.0, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.4434218630194664, "epoch": 0.06748494673459936, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033606907818466425, "kl": 0.0022161082015372813, "learning_rate": 9.86512274201019e-07, "loss": 0.0001, "num_tokens": 40058789.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1457, "step_time": 18.66258154064417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.42968491464853287, "epoch": 0.06753126447429365, "frac_reward_zero_std": 1.0, "grad_norm": 0.005685332231223583, "kl": 0.004056319885421544, "learning_rate": 9.8650301065308e-07, "loss": 0.0002, "num_tokens": 40092819.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1458, "step_time": 19.91689220443368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 160.5, "completions/mean_terminated_length": 160.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.15658705309033394, "epoch": 0.06757758221398796, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006694571929983795, "kl": 0.000618590900558047, "learning_rate": 9.864937471051412e-07, "loss": 0.0, "num_tokens": 40116347.0, "reward": 0.9131007194519043, "reward_std": 0.0, "rewards/reward_func/mean": 0.9131007194519043, "rewards/reward_func/std": 0.0, "step": 1459, "step_time": 17.509197983890772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.0, "completions/mean_terminated_length": 121.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2313786968588829, "epoch": 0.06762389995368226, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009354195208288729, "kl": 0.000870649964781478, "learning_rate": 9.864844835572023e-07, "loss": 0.0, "num_tokens": 40136315.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1460, "step_time": 12.83052709326148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 161.4375, "completions/mean_terminated_length": 161.4375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.24410752579569817, "epoch": 0.06767021769337657, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011613599490374327, "kl": 0.0012364076392259449, "learning_rate": 9.864752200092635e-07, "loss": 0.0001, "num_tokens": 40160050.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1461, "step_time": 17.837164908647537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 112.25, "completions/mean_terminated_length": 112.25, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2768171429634094, "epoch": 0.06771653543307087, "frac_reward_zero_std": 1.0, "grad_norm": 0.001716109924018383, "kl": 0.0013961562945041806, "learning_rate": 9.864659564613246e-07, "loss": 0.0001, "num_tokens": 40180454.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1462, "step_time": 13.131119310855865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 251.1875, "completions/mean_terminated_length": 251.1875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.2666243202984333, "epoch": 0.06776285317276518, "frac_reward_zero_std": 0.0, "grad_norm": 0.08627095818519592, "kl": 0.0017540419066790491, "learning_rate": 9.864566929133857e-07, "loss": -0.092, "num_tokens": 40207017.0, "reward": 0.8939310312271118, "reward_std": 0.2388184517621994, "rewards/reward_func/mean": 0.8939310312271118, "rewards/reward_func/std": 0.2388184517621994, "step": 1463, "step_time": 27.345908522605896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 162.0, "completions/mean_terminated_length": 162.0, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3583075553178787, "epoch": 0.06780917091245947, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011339073535054922, "kl": 0.0013876005250494927, "learning_rate": 9.864474293654468e-07, "loss": 0.0001, "num_tokens": 40240185.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1464, "step_time": 21.53124388307333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 219.3125, "completions/mean_terminated_length": 219.3125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.37661032378673553, "epoch": 0.06785548865215378, "frac_reward_zero_std": 0.0, "grad_norm": 0.09709648042917252, "kl": 0.001658972178120166, "learning_rate": 9.86438165817508e-07, "loss": -0.1201, "num_tokens": 40278446.0, "reward": 0.1875, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.1875, "rewards/reward_func/std": 0.40311288833618164, "step": 1465, "step_time": 29.761932767927647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.17612246423959732, "epoch": 0.06790180639184808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017276330618187785, "kl": 0.0008722720958758146, "learning_rate": 9.864289022695693e-07, "loss": 0.0, "num_tokens": 40299484.0, "reward": 0.6563555598258972, "reward_std": 0.0, "rewards/reward_func/mean": 0.6563555598258972, "rewards/reward_func/std": 0.0, "step": 1466, "step_time": 17.53114926069975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 191.25, "completions/mean_terminated_length": 191.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.4142700731754303, "epoch": 0.06794812413154239, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010640016989782453, "kl": 0.0016559420037083328, "learning_rate": 9.864196387216304e-07, "loss": 0.0001, "num_tokens": 40329392.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1467, "step_time": 21.341175697743893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 156.0, "completions/mean_terminated_length": 156.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.33521513640880585, "epoch": 0.06799444187123668, "frac_reward_zero_std": 1.0, "grad_norm": 0.001112421858124435, "kl": 0.0010444036161061376, "learning_rate": 9.864103751736916e-07, "loss": 0.0001, "num_tokens": 40362160.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1468, "step_time": 19.364713236689568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 134.8125, "completions/mean_terminated_length": 134.8125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3196274861693382, "epoch": 0.06804075961093099, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016130013391375542, "kl": 0.001321357995038852, "learning_rate": 9.864011116257525e-07, "loss": 0.0001, "num_tokens": 40382189.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1469, "step_time": 16.84289249405265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 199.125, "completions/mean_terminated_length": 199.125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.1728663481771946, "epoch": 0.06808707735062529, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009273255127482116, "kl": 0.0007691927021369338, "learning_rate": 9.863918480778138e-07, "loss": 0.0, "num_tokens": 40407407.0, "reward": 0.8464817404747009, "reward_std": 0.0, "rewards/reward_func/mean": 0.8464817404747009, "rewards/reward_func/std": 0.0, "step": 1470, "step_time": 21.170282408595085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 109.5, "completions/mean_terminated_length": 109.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2908935993909836, "epoch": 0.0681333950903196, "frac_reward_zero_std": 1.0, "grad_norm": 0.001340805203653872, "kl": 0.0012932246027048677, "learning_rate": 9.86382584529875e-07, "loss": 0.0001, "num_tokens": 40426871.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1471, "step_time": 12.124918069690466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 168.75, "completions/mean_terminated_length": 168.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.34815602749586105, "epoch": 0.0681797128300139, "frac_reward_zero_std": 1.0, "grad_norm": 0.004635367076843977, "kl": 0.0017149716150015593, "learning_rate": 9.86373320981936e-07, "loss": 0.0001, "num_tokens": 40461171.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1472, "step_time": 20.40032485499978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 195.1875, "completions/mean_terminated_length": 195.1875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.38137224316596985, "epoch": 0.0682260305697082, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007770671509206295, "kl": 0.001301302167121321, "learning_rate": 9.863640574339972e-07, "loss": 0.0001, "num_tokens": 40492854.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1473, "step_time": 23.277416292577982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 181.9375, "completions/mean_terminated_length": 181.9375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3474675193428993, "epoch": 0.0682723483094025, "frac_reward_zero_std": 0.0, "grad_norm": 0.0800715908408165, "kl": 0.0016894775035325438, "learning_rate": 9.863547938860583e-07, "loss": -0.0142, "num_tokens": 40520517.0, "reward": 0.9474196434020996, "reward_std": 0.05430473014712334, "rewards/reward_func/mean": 0.9474196434020996, "rewards/reward_func/std": 0.05430473014712334, "step": 1474, "step_time": 20.104807291179895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 189.4375, "completions/mean_terminated_length": 189.4375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.20633366331458092, "epoch": 0.06831866604909681, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006397018441930413, "kl": 0.0008177153358701617, "learning_rate": 9.863455303381194e-07, "loss": 0.0, "num_tokens": 40556188.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1475, "step_time": 22.91086822375655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 212.8125, "completions/mean_terminated_length": 212.8125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.40713974833488464, "epoch": 0.0683649837887911, "frac_reward_zero_std": 0.0, "grad_norm": 0.08309012651443481, "kl": 0.001738260907586664, "learning_rate": 9.863362667901806e-07, "loss": -0.0043, "num_tokens": 40583769.0, "reward": 0.5439493060112, "reward_std": 0.43928495049476624, "rewards/reward_func/mean": 0.5439493060112, "rewards/reward_func/std": 0.43928495049476624, "step": 1476, "step_time": 24.10078265890479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 170.5625, "completions/mean_terminated_length": 170.5625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.23392557352781296, "epoch": 0.06841130152848542, "frac_reward_zero_std": 0.0, "grad_norm": 0.179819718003273, "kl": 0.0026494267222005874, "learning_rate": 9.863270032422417e-07, "loss": 0.0009, "num_tokens": 40604786.0, "reward": 0.958366334438324, "reward_std": 0.04875630885362625, "rewards/reward_func/mean": 0.958366334438324, "rewards/reward_func/std": 0.04875630885362625, "step": 1477, "step_time": 18.110772479325533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 118.875, "completions/mean_terminated_length": 118.875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.23127169162034988, "epoch": 0.06845761926817971, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010967552661895752, "kl": 0.0012604213261511177, "learning_rate": 9.863177396943028e-07, "loss": 0.0001, "num_tokens": 40624272.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1478, "step_time": 12.11015397682786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 288.0625, "completions/mean_terminated_length": 288.0625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.34514622390270233, "epoch": 0.06850393700787402, "frac_reward_zero_std": 0.0, "grad_norm": 0.06308825314044952, "kl": 0.0014682095497846603, "learning_rate": 9.86308476146364e-07, "loss": 0.0653, "num_tokens": 40651281.0, "reward": 0.1875, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.1875, "rewards/reward_func/std": 0.40311288833618164, "step": 1479, "step_time": 40.92439138144255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 138.5625, "completions/mean_terminated_length": 138.5625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.28622929006814957, "epoch": 0.06855025474756832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013986499980092049, "kl": 0.0013489331759046763, "learning_rate": 9.862992125984253e-07, "loss": 0.0001, "num_tokens": 40677050.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1480, "step_time": 15.85581860691309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 183.3125, "completions/mean_terminated_length": 183.3125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.2172854319214821, "epoch": 0.06859657248726263, "frac_reward_zero_std": 0.0, "grad_norm": 0.0795762836933136, "kl": 0.0007507414120482281, "learning_rate": 9.862899490504862e-07, "loss": -0.0639, "num_tokens": 40701039.0, "reward": 0.9469864368438721, "reward_std": 0.020694375038146973, "rewards/reward_func/mean": 0.9469864368438721, "rewards/reward_func/std": 0.020694376900792122, "step": 1481, "step_time": 19.871653094887733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 193.0625, "completions/mean_terminated_length": 193.0625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.20894937589764595, "epoch": 0.06864289022695692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010044968221336603, "kl": 0.000901092033018358, "learning_rate": 9.862806855025473e-07, "loss": 0.0, "num_tokens": 40729696.0, "reward": 0.8824968934059143, "reward_std": 0.0, "rewards/reward_func/mean": 0.8824968934059143, "rewards/reward_func/std": 0.0, "step": 1482, "step_time": 21.80692085623741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 182.4375, "completions/mean_terminated_length": 182.4375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3963339775800705, "epoch": 0.06868920796665123, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014606897020712495, "kl": 0.0014750435075256974, "learning_rate": 9.862714219546086e-07, "loss": 0.0001, "num_tokens": 40753495.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1483, "step_time": 20.837008390575647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 111.8125, "completions/mean_terminated_length": 111.8125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2510620839893818, "epoch": 0.06873552570634553, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027503983583301306, "kl": 0.0013421807816484943, "learning_rate": 9.862621584066698e-07, "loss": 0.0001, "num_tokens": 40773092.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1484, "step_time": 13.886992286890745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 172.1875, "completions/mean_terminated_length": 172.1875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3748626112937927, "epoch": 0.06878184344603984, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009795506484806538, "kl": 0.0013817641884088516, "learning_rate": 9.862528948587309e-07, "loss": 0.0001, "num_tokens": 40808183.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1485, "step_time": 21.10917278006673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 196.75, "completions/mean_terminated_length": 196.75, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.4569511339068413, "epoch": 0.06882816118573414, "frac_reward_zero_std": 1.0, "grad_norm": 0.003057107562199235, "kl": 0.0021614345896523446, "learning_rate": 9.86243631310792e-07, "loss": 0.0001, "num_tokens": 40836915.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1486, "step_time": 22.84413205832243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 137.0625, "completions/mean_terminated_length": 137.0625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.33205533027648926, "epoch": 0.06887447892542845, "frac_reward_zero_std": 1.0, "grad_norm": 0.001092462451197207, "kl": 0.0013362477766349912, "learning_rate": 9.862343677628531e-07, "loss": 0.0001, "num_tokens": 40872756.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1487, "step_time": 19.739625692367554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 122.375, "completions/mean_terminated_length": 122.375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.30517494678497314, "epoch": 0.06892079666512274, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013137499336153269, "kl": 0.0014369545096997172, "learning_rate": 9.862251042149143e-07, "loss": 0.0001, "num_tokens": 40892970.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1488, "step_time": 13.604293052107096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 145.0, "completions/mean_terminated_length": 145.0, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3870902359485626, "epoch": 0.06896711440481705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012158052995800972, "kl": 0.0013868208043277264, "learning_rate": 9.862158406669754e-07, "loss": 0.0001, "num_tokens": 40920314.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1489, "step_time": 18.90192151069641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 192.875, "completions/mean_terminated_length": 192.875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3637043759226799, "epoch": 0.06901343214451135, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009444152819924057, "kl": 0.0014103133289609104, "learning_rate": 9.862065771190365e-07, "loss": 0.0001, "num_tokens": 40954872.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1490, "step_time": 25.023595243692398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 159.4375, "completions/mean_terminated_length": 159.4375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.40260692685842514, "epoch": 0.06905974988420566, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007991600432433188, "kl": 0.0014214868424460292, "learning_rate": 9.861973135710976e-07, "loss": 0.0001, "num_tokens": 40989215.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1491, "step_time": 21.194200597703457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 120.75, "completions/mean_terminated_length": 120.75, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.30024923384189606, "epoch": 0.06910606762389995, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011232905089855194, "kl": 0.0013113229360897094, "learning_rate": 9.861880500231588e-07, "loss": 0.0001, "num_tokens": 41010619.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1492, "step_time": 15.170989360660315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 212.75, "completions/mean_terminated_length": 212.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.35055892914533615, "epoch": 0.06915238536359426, "frac_reward_zero_std": 0.0, "grad_norm": 0.07122951745986938, "kl": 0.0015935766277834773, "learning_rate": 9.8617878647522e-07, "loss": -0.0742, "num_tokens": 41035111.0, "reward": 0.17740625143051147, "reward_std": 0.12721239030361176, "rewards/reward_func/mean": 0.17740625143051147, "rewards/reward_func/std": 0.12721239030361176, "step": 1493, "step_time": 25.952363431453705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 144.625, "completions/mean_terminated_length": 144.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3645920604467392, "epoch": 0.06919870310328856, "frac_reward_zero_std": 1.0, "grad_norm": 0.001596534508280456, "kl": 0.0016429739189334214, "learning_rate": 9.86169522927281e-07, "loss": 0.0001, "num_tokens": 41088657.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1494, "step_time": 23.631266605108976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 189.375, "completions/mean_terminated_length": 189.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.30949675291776657, "epoch": 0.06924502084298287, "frac_reward_zero_std": 0.0, "grad_norm": 0.08504604548215866, "kl": 0.002443965640850365, "learning_rate": 9.861602593793421e-07, "loss": 0.0474, "num_tokens": 41114423.0, "reward": 0.611486554145813, "reward_std": 0.36575570702552795, "rewards/reward_func/mean": 0.611486554145813, "rewards/reward_func/std": 0.36575576663017273, "step": 1495, "step_time": 21.925276305526495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 189.0, "completions/mean_terminated_length": 189.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.16924650967121124, "epoch": 0.06929133858267716, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006864202441647649, "kl": 0.0007491152500733733, "learning_rate": 9.861509958314035e-07, "loss": 0.0, "num_tokens": 41168279.0, "reward": 0.5623413324356079, "reward_std": 0.0, "rewards/reward_func/mean": 0.5623413324356079, "rewards/reward_func/std": 0.0, "step": 1496, "step_time": 26.513740804046392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 171.8125, "completions/mean_terminated_length": 171.8125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.3513307720422745, "epoch": 0.06933765632237147, "frac_reward_zero_std": 1.0, "grad_norm": 0.002650262787938118, "kl": 0.0017704792262520641, "learning_rate": 9.861417322834646e-07, "loss": 0.0001, "num_tokens": 41200180.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1497, "step_time": 19.84450488165021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 125.5625, "completions/mean_terminated_length": 125.5625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3050597086548805, "epoch": 0.06938397406206577, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011662907199934125, "kl": 0.0015261732041835785, "learning_rate": 9.861324687355257e-07, "loss": 0.0001, "num_tokens": 41236061.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1498, "step_time": 17.509271383285522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 134.8125, "completions/mean_terminated_length": 134.8125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2897963374853134, "epoch": 0.06943029180176008, "frac_reward_zero_std": 1.0, "grad_norm": 0.001060917042195797, "kl": 0.0011977645626757294, "learning_rate": 9.861232051875869e-07, "loss": 0.0001, "num_tokens": 41257994.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1499, "step_time": 14.279197268188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 185.75, "completions/mean_terminated_length": 185.75, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.31334809958934784, "epoch": 0.06947660954145438, "frac_reward_zero_std": 0.0, "grad_norm": 0.09188850969076157, "kl": 0.0020196071709506214, "learning_rate": 9.86113941639648e-07, "loss": -0.0171, "num_tokens": 41278694.0, "reward": 0.8301382660865784, "reward_std": 0.3240528106689453, "rewards/reward_func/mean": 0.8301382660865784, "rewards/reward_func/std": 0.3240528106689453, "step": 1500, "step_time": 20.810988426208496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 169.6875, "completions/mean_terminated_length": 169.6875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.20143620669841766, "epoch": 0.06952292728114869, "frac_reward_zero_std": 1.0, "grad_norm": 0.00185035087633878, "kl": 0.0014070851902943105, "learning_rate": 9.86104678091709e-07, "loss": 0.0001, "num_tokens": 41311921.0, "reward": 0.9131007194519043, "reward_std": 0.0, "rewards/reward_func/mean": 0.9131007194519043, "rewards/reward_func/std": 0.0, "step": 1501, "step_time": 21.402443937957287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 150.1875, "completions/mean_terminated_length": 150.1875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.34827468544244766, "epoch": 0.06956924502084298, "frac_reward_zero_std": 1.0, "grad_norm": 0.002683652099221945, "kl": 0.0019498355686664581, "learning_rate": 9.860954145437702e-07, "loss": 0.0001, "num_tokens": 41334388.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1502, "step_time": 16.499029833823442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.18773585185408592, "epoch": 0.06961556276053729, "frac_reward_zero_std": 1.0, "grad_norm": 0.003130219643935561, "kl": 0.001884453697130084, "learning_rate": 9.860861509958314e-07, "loss": 0.0001, "num_tokens": 41387554.0, "reward": 0.8890097737312317, "reward_std": 0.0, "rewards/reward_func/mean": 0.8890097737312317, "rewards/reward_func/std": 0.0, "step": 1503, "step_time": 26.752739500254393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 141.25, "completions/mean_terminated_length": 141.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.25347447022795677, "epoch": 0.06966188050023159, "frac_reward_zero_std": 1.0, "grad_norm": 0.002005944726988673, "kl": 0.0013062612706562504, "learning_rate": 9.860768874478925e-07, "loss": 0.0001, "num_tokens": 41408214.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1504, "step_time": 14.816963702440262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 126.1875, "completions/mean_terminated_length": 126.1875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2511373423039913, "epoch": 0.0697081982399259, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014657479478046298, "kl": 0.0011138530389871448, "learning_rate": 9.860676238999536e-07, "loss": 0.0001, "num_tokens": 41428889.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1505, "step_time": 13.845789287239313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 222.25, "completions/mean_terminated_length": 222.25, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.14946654438972473, "epoch": 0.0697545159796202, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005345660028979182, "kl": 0.0006829862395534292, "learning_rate": 9.860583603520147e-07, "loss": 0.0, "num_tokens": 41456813.0, "reward": 0.9636404514312744, "reward_std": 0.0, "rewards/reward_func/mean": 0.9636404514312744, "rewards/reward_func/std": 0.0, "step": 1506, "step_time": 23.404409043490887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 169.1875, "completions/mean_terminated_length": 169.1875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3296942487359047, "epoch": 0.0698008337193145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017873396864160895, "kl": 0.0013602831750176847, "learning_rate": 9.860490968040759e-07, "loss": 0.0001, "num_tokens": 41481136.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1507, "step_time": 18.71742406859994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 234.6875, "completions/mean_terminated_length": 234.6875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.244209386408329, "epoch": 0.0698471514590088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007023353246040642, "kl": 0.0011829751019831747, "learning_rate": 9.86039833256137e-07, "loss": 0.0001, "num_tokens": 41503851.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1508, "step_time": 21.888627737760544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 142.8125, "completions/mean_terminated_length": 142.8125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.33800843358039856, "epoch": 0.06989346919870311, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007219344261102378, "kl": 0.001116905506933108, "learning_rate": 9.86030569708198e-07, "loss": 0.0001, "num_tokens": 41529960.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1509, "step_time": 16.180447284132242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 128.3125, "completions/mean_terminated_length": 128.3125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2081788331270218, "epoch": 0.0699397869383974, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009349219035357237, "kl": 0.0007721676956862211, "learning_rate": 9.860213061602594e-07, "loss": 0.0, "num_tokens": 41549389.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1510, "step_time": 13.564359836280346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 211.3125, "completions/mean_terminated_length": 211.3125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.3014207184314728, "epoch": 0.06998610467809172, "frac_reward_zero_std": 0.0, "grad_norm": 0.0804465040564537, "kl": 0.0013809468073304743, "learning_rate": 9.860120426123206e-07, "loss": -0.0494, "num_tokens": 41587394.0, "reward": 0.3422159254550934, "reward_std": 0.4596257507801056, "rewards/reward_func/mean": 0.3422159254550934, "rewards/reward_func/std": 0.459625780582428, "step": 1511, "step_time": 28.52531709894538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 129.125, "completions/mean_terminated_length": 129.125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.20699350163340569, "epoch": 0.07003242241778601, "frac_reward_zero_std": 0.0, "grad_norm": 0.12705107033252716, "kl": 0.0009369904728373513, "learning_rate": 9.860027790643815e-07, "loss": 0.0046, "num_tokens": 41609716.0, "reward": 0.24965901672840118, "reward_std": 0.008113396354019642, "rewards/reward_func/mean": 0.24965901672840118, "rewards/reward_func/std": 0.008113403804600239, "step": 1512, "step_time": 17.932420033961535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 129.5, "completions/mean_terminated_length": 129.5, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.22529035061597824, "epoch": 0.07007874015748032, "frac_reward_zero_std": 1.0, "grad_norm": 0.00819733738899231, "kl": 0.0019448845705483109, "learning_rate": 9.859935155164428e-07, "loss": 0.0001, "num_tokens": 41629596.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1513, "step_time": 15.249301470816135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 130.75, "completions/mean_terminated_length": 130.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.27332042157649994, "epoch": 0.07012505789717462, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030940512660890818, "kl": 0.0012383492721710354, "learning_rate": 9.85984251968504e-07, "loss": 0.0001, "num_tokens": 41660104.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1514, "step_time": 17.43046096712351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 150.3125, "completions/mean_terminated_length": 150.3125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.4052044004201889, "epoch": 0.07017137563686893, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012219988275319338, "kl": 0.0013732522202190012, "learning_rate": 9.85974988420565e-07, "loss": 0.0001, "num_tokens": 41697901.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1515, "step_time": 21.802428640425205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 174.3125, "completions/mean_terminated_length": 174.3125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.4087982252240181, "epoch": 0.07021769337656322, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033047806937247515, "kl": 0.0025360305444337428, "learning_rate": 9.859657248726262e-07, "loss": 0.0001, "num_tokens": 41723666.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1516, "step_time": 21.03728961199522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 187.8125, "completions/mean_terminated_length": 187.8125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.3777107447385788, "epoch": 0.07026401111625753, "frac_reward_zero_std": 1.0, "grad_norm": 0.004337935708463192, "kl": 0.0023344085493590683, "learning_rate": 9.859564613246873e-07, "loss": 0.0001, "num_tokens": 41748447.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1517, "step_time": 19.76603312790394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 175.0625, "completions/mean_terminated_length": 175.0625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.19641326740384102, "epoch": 0.07031032885595183, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007934992900118232, "kl": 0.00101024258765392, "learning_rate": 9.859471977767484e-07, "loss": 0.0001, "num_tokens": 41775056.0, "reward": 0.19390326738357544, "reward_std": 0.0, "rewards/reward_func/mean": 0.19390326738357544, "rewards/reward_func/std": 0.0, "step": 1518, "step_time": 18.733803275972605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 161.8125, "completions/mean_terminated_length": 161.8125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.43163342773914337, "epoch": 0.07035664659564614, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010543644893914461, "kl": 0.0015909954381641, "learning_rate": 9.859379342288096e-07, "loss": 0.0001, "num_tokens": 41819117.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1519, "step_time": 23.68850500881672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 230.5, "completions/mean_terminated_length": 230.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.27073976024985313, "epoch": 0.07040296433534043, "frac_reward_zero_std": 0.0, "grad_norm": 0.0937320739030838, "kl": 0.001118386906455271, "learning_rate": 9.859286706808707e-07, "loss": -0.0047, "num_tokens": 41845861.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 1520, "step_time": 23.97225707396865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 147.6875, "completions/mean_terminated_length": 147.6875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.351545050740242, "epoch": 0.07044928207503474, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018799840472638607, "kl": 0.0013992716849315912, "learning_rate": 9.859194071329318e-07, "loss": 0.0001, "num_tokens": 41868080.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1521, "step_time": 17.113589253276587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 143.375, "completions/mean_terminated_length": 143.375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2863616496324539, "epoch": 0.07049559981472904, "frac_reward_zero_std": 1.0, "grad_norm": 0.005100011359900236, "kl": 0.0014330961857922375, "learning_rate": 9.85910143584993e-07, "loss": 0.0001, "num_tokens": 41888838.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1522, "step_time": 15.039297252893448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 119.6875, "completions/mean_terminated_length": 119.6875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.287519246339798, "epoch": 0.07054191755442335, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013172749895602465, "kl": 0.001129176904214546, "learning_rate": 9.859008800370543e-07, "loss": 0.0001, "num_tokens": 41911457.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1523, "step_time": 13.754213828593493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 230.25, "completions/mean_terminated_length": 230.25, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.22019926458597183, "epoch": 0.07058823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006993401912041008, "kl": 0.0008566299948142841, "learning_rate": 9.858916164891154e-07, "loss": 0.0, "num_tokens": 41939349.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1524, "step_time": 23.889339812099934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 205.4375, "completions/mean_terminated_length": 205.4375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.3681313022971153, "epoch": 0.07063455303381196, "frac_reward_zero_std": 0.0, "grad_norm": 0.1134500578045845, "kl": 0.0015738861984573305, "learning_rate": 9.858823529411763e-07, "loss": -0.0365, "num_tokens": 41977212.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.44721361994743347, "step": 1525, "step_time": 24.386278919875622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.26344146579504013, "epoch": 0.07068087077350625, "frac_reward_zero_std": 0.0, "grad_norm": 0.06881923973560333, "kl": 0.001262011195649393, "learning_rate": 9.858730893932376e-07, "loss": -0.0533, "num_tokens": 42017464.0, "reward": 0.6692327260971069, "reward_std": 0.3416149616241455, "rewards/reward_func/mean": 0.6692327260971069, "rewards/reward_func/std": 0.3416149914264679, "step": 1526, "step_time": 29.74383533746004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 154.125, "completions/mean_terminated_length": 154.125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.18717294186353683, "epoch": 0.07072718851320056, "frac_reward_zero_std": 0.0, "grad_norm": 0.10462996363639832, "kl": 0.001257291398360394, "learning_rate": 9.858638258452988e-07, "loss": -0.0005, "num_tokens": 42039098.0, "reward": 0.9080443978309631, "reward_std": 0.03526148200035095, "rewards/reward_func/mean": 0.9080443978309631, "rewards/reward_func/std": 0.035261478275060654, "step": 1527, "step_time": 15.544252336025238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 219.6875, "completions/mean_terminated_length": 219.6875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.2726113051176071, "epoch": 0.07077350625289486, "frac_reward_zero_std": 0.0, "grad_norm": 0.08319716900587082, "kl": 0.001257637224625796, "learning_rate": 9.8585456229736e-07, "loss": -0.0156, "num_tokens": 42076981.0, "reward": 0.6875, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.6875, "rewards/reward_func/std": 0.4787135720252991, "step": 1528, "step_time": 24.489597510546446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 154.5, "completions/mean_terminated_length": 154.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.19127242267131805, "epoch": 0.07081982399258917, "frac_reward_zero_std": 0.0, "grad_norm": 0.09630709886550903, "kl": 0.0011548186303116381, "learning_rate": 9.85845298749421e-07, "loss": -0.051, "num_tokens": 42097373.0, "reward": 0.9364910125732422, "reward_std": 0.18305334448814392, "rewards/reward_func/mean": 0.9364910125732422, "rewards/reward_func/std": 0.18305335938930511, "step": 1529, "step_time": 17.424638710916042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 156.9375, "completions/mean_terminated_length": 156.9375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.17711615934967995, "epoch": 0.07086614173228346, "frac_reward_zero_std": 0.0, "grad_norm": 0.1408398449420929, "kl": 0.00129914321587421, "learning_rate": 9.858360352014821e-07, "loss": 0.0449, "num_tokens": 42128620.0, "reward": 0.9340888261795044, "reward_std": 0.08788163214921951, "rewards/reward_func/mean": 0.9340888261795044, "rewards/reward_func/std": 0.08788162469863892, "step": 1530, "step_time": 20.511590659618378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 189.75, "completions/mean_terminated_length": 189.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.17410175502300262, "epoch": 0.07091245947197777, "frac_reward_zero_std": 1.0, "grad_norm": 0.0045576211996376514, "kl": 0.0012940284796059132, "learning_rate": 9.858267716535433e-07, "loss": 0.0001, "num_tokens": 42151368.0, "reward": 0.8553453087806702, "reward_std": 0.0, "rewards/reward_func/mean": 0.8553453087806702, "rewards/reward_func/std": 0.0, "step": 1531, "step_time": 18.73728959262371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 130.0625, "completions/mean_terminated_length": 130.0625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2945939749479294, "epoch": 0.07095877721167207, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010982095263898373, "kl": 0.0012439826678019017, "learning_rate": 9.858175081056044e-07, "loss": 0.0001, "num_tokens": 42172393.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1532, "step_time": 14.610783133655787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 141.5625, "completions/mean_terminated_length": 141.5625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2522287368774414, "epoch": 0.07100509495136638, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022103125229477882, "kl": 0.0011451515456428751, "learning_rate": 9.858082445576655e-07, "loss": 0.0001, "num_tokens": 42202178.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1533, "step_time": 17.727550856769085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 140.9375, "completions/mean_terminated_length": 140.9375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.32373616844415665, "epoch": 0.07105141269106068, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015049474313855171, "kl": 0.0013415070425253361, "learning_rate": 9.857989810097266e-07, "loss": 0.0001, "num_tokens": 42230017.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1534, "step_time": 16.30304079130292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 133.375, "completions/mean_terminated_length": 133.375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3020930588245392, "epoch": 0.07109773043075499, "frac_reward_zero_std": 0.0, "grad_norm": 0.03848927095532417, "kl": 0.001073820938472636, "learning_rate": 9.857897174617878e-07, "loss": 0.0075, "num_tokens": 42251159.0, "reward": 0.0004132419126108289, "reward_std": 0.00011019785597454756, "rewards/reward_func/mean": 0.0004132419126108289, "rewards/reward_func/std": 0.00011019784869858995, "step": 1535, "step_time": 16.230277463793755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 181.9375, "completions/mean_terminated_length": 181.9375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.37558628618717194, "epoch": 0.07114404817044928, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028143664821982384, "kl": 0.0021767235593870282, "learning_rate": 9.857804539138491e-07, "loss": 0.0001, "num_tokens": 42273734.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1536, "step_time": 18.9822254255414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3253486528992653, "epoch": 0.07119036591014359, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019250869518145919, "kl": 0.0018611763371154666, "learning_rate": 9.8577119036591e-07, "loss": 0.0001, "num_tokens": 42295814.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1537, "step_time": 14.144557140767574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 234.875, "completions/mean_terminated_length": 234.875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.28953490406274796, "epoch": 0.07123668364983789, "frac_reward_zero_std": 0.0, "grad_norm": 0.05726927891373634, "kl": 0.0012834136723540723, "learning_rate": 9.857619268179711e-07, "loss": 0.019, "num_tokens": 42320676.0, "reward": 0.005292844492942095, "reward_std": 0.0027755454648286104, "rewards/reward_func/mean": 0.005292844492942095, "rewards/reward_func/std": 0.0027755454648286104, "step": 1538, "step_time": 22.797854535281658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 142.9375, "completions/mean_terminated_length": 142.9375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3037726953625679, "epoch": 0.0712830013895322, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012993714772164822, "kl": 0.0015164695214480162, "learning_rate": 9.857526632700323e-07, "loss": 0.0001, "num_tokens": 42348339.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1539, "step_time": 16.37822227180004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 308.25, "completions/mean_terminated_length": 308.25, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.257300678640604, "epoch": 0.07132931912922649, "frac_reward_zero_std": 0.0, "grad_norm": 0.0826321393251419, "kl": 0.0015540139575023204, "learning_rate": 9.857433997220936e-07, "loss": 0.0212, "num_tokens": 42378407.0, "reward": 0.9962121248245239, "reward_std": 0.005802525207400322, "rewards/reward_func/mean": 0.9962121248245239, "rewards/reward_func/std": 0.005802526138722897, "step": 1540, "step_time": 28.582728251814842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3340243548154831, "epoch": 0.0713756368689208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017559159314259887, "kl": 0.001430229633115232, "learning_rate": 9.857341361741547e-07, "loss": 0.0001, "num_tokens": 42407289.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1541, "step_time": 21.22570151463151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 154.875, "completions/mean_terminated_length": 154.875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4333033561706543, "epoch": 0.0714219546086151, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011042456608265638, "kl": 0.0015356171934399754, "learning_rate": 9.857248726262159e-07, "loss": 0.0001, "num_tokens": 42457591.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1542, "step_time": 24.36034982651472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 118.625, "completions/mean_terminated_length": 118.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.24477159976959229, "epoch": 0.07146827234830941, "frac_reward_zero_std": 1.0, "grad_norm": 0.0065881493501365185, "kl": 0.0025198623770847917, "learning_rate": 9.85715609078277e-07, "loss": 0.0001, "num_tokens": 42478753.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1543, "step_time": 13.17404094710946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 175.1875, "completions/mean_terminated_length": 175.1875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.21906909719109535, "epoch": 0.0715145900880037, "frac_reward_zero_std": 0.0, "grad_norm": 0.0927964523434639, "kl": 0.0017490903264842927, "learning_rate": 9.857063455303381e-07, "loss": -0.0622, "num_tokens": 42500164.0, "reward": 0.3209686875343323, "reward_std": 0.18616418540477753, "rewards/reward_func/mean": 0.3209686875343323, "rewards/reward_func/std": 0.18616418540477753, "step": 1544, "step_time": 20.35236304998398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.33880386501550674, "epoch": 0.07156090782769801, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019858982414007187, "kl": 0.0014867189747747034, "learning_rate": 9.856970819823992e-07, "loss": 0.0001, "num_tokens": 42522932.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1545, "step_time": 17.753128845244646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 210.4375, "completions/mean_terminated_length": 210.4375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.35662582516670227, "epoch": 0.07160722556739231, "frac_reward_zero_std": 0.0, "grad_norm": 0.09567058086395264, "kl": 0.0016411022515967488, "learning_rate": 9.856878184344604e-07, "loss": -0.0326, "num_tokens": 42557371.0, "reward": 0.7110782861709595, "reward_std": 0.3527936637401581, "rewards/reward_func/mean": 0.7110782861709595, "rewards/reward_func/std": 0.3527936339378357, "step": 1546, "step_time": 24.16016223654151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.3172084540128708, "epoch": 0.07165354330708662, "frac_reward_zero_std": 0.0, "grad_norm": 0.07621504366397858, "kl": 0.0023754789726808667, "learning_rate": 9.856785548865215e-07, "loss": -0.0338, "num_tokens": 42579945.0, "reward": 0.5271327495574951, "reward_std": 0.31990692019462585, "rewards/reward_func/mean": 0.5271327495574951, "rewards/reward_func/std": 0.31990692019462585, "step": 1547, "step_time": 21.686159301549196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 129.4375, "completions/mean_terminated_length": 129.4375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2654934898018837, "epoch": 0.07169986104678092, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008809834835119545, "kl": 0.0012236626353114843, "learning_rate": 9.856692913385826e-07, "loss": 0.0001, "num_tokens": 42602880.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1548, "step_time": 15.267893463373184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 276.9375, "completions/mean_terminated_length": 276.9375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.48166823387145996, "epoch": 0.07174617878647523, "frac_reward_zero_std": 0.0, "grad_norm": 0.07281829416751862, "kl": 0.0012351717159617692, "learning_rate": 9.856600277906437e-07, "loss": 0.0372, "num_tokens": 42635519.0, "reward": 0.625, "reward_std": 0.5, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5, "step": 1549, "step_time": 36.32584190368652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 142.6875, "completions/mean_terminated_length": 142.6875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2776329219341278, "epoch": 0.07179249652616952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011818609200417995, "kl": 0.0012460009602364153, "learning_rate": 9.856507642427049e-07, "loss": 0.0001, "num_tokens": 42660474.0, "reward": 5.965462696622126e-05, "reward_std": 0.0, "rewards/reward_func/mean": 5.965462696622126e-05, "rewards/reward_func/std": 0.0, "step": 1550, "step_time": 17.04089403897524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 119.5625, "completions/mean_terminated_length": 119.5625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.23602842167019844, "epoch": 0.07183881426586383, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013200322864577174, "kl": 0.0011846898560179397, "learning_rate": 9.85641500694766e-07, "loss": 0.0001, "num_tokens": 42680131.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1551, "step_time": 13.902039337903261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 103.6875, "completions/mean_terminated_length": 103.6875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.27024680376052856, "epoch": 0.07188513200555813, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014684591442346573, "kl": 0.001529490080429241, "learning_rate": 9.856322371468271e-07, "loss": 0.0001, "num_tokens": 42699710.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1552, "step_time": 12.134970366954803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 174.4375, "completions/mean_terminated_length": 174.4375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.2280416190624237, "epoch": 0.07193144974525244, "frac_reward_zero_std": 0.0, "grad_norm": 0.0778835192322731, "kl": 0.001227862056111917, "learning_rate": 9.856229735988884e-07, "loss": -0.0424, "num_tokens": 42723541.0, "reward": 0.9569141864776611, "reward_std": 0.034468624740839005, "rewards/reward_func/mean": 0.9569141864776611, "rewards/reward_func/std": 0.034468621015548706, "step": 1553, "step_time": 19.63005105406046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 123.6875, "completions/mean_terminated_length": 123.6875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.27046234905719757, "epoch": 0.07197776748494673, "frac_reward_zero_std": 1.0, "grad_norm": 0.001302762539125979, "kl": 0.0013518096529878676, "learning_rate": 9.856137100509496e-07, "loss": 0.0001, "num_tokens": 42743248.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1554, "step_time": 14.400965578854084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 135.875, "completions/mean_terminated_length": 135.875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.15213468298316002, "epoch": 0.07202408522464104, "frac_reward_zero_std": 1.0, "grad_norm": 0.001393636455759406, "kl": 0.0008806453843135387, "learning_rate": 9.856044465030105e-07, "loss": 0.0, "num_tokens": 42763646.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1555, "step_time": 15.216861758381128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 163.0, "completions/mean_terminated_length": 163.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.22033360600471497, "epoch": 0.07207040296433534, "frac_reward_zero_std": 0.0, "grad_norm": 0.1002715602517128, "kl": 0.0021226152894087136, "learning_rate": 9.855951829550718e-07, "loss": -0.0179, "num_tokens": 42791758.0, "reward": 0.5921446084976196, "reward_std": 0.2171451300382614, "rewards/reward_func/mean": 0.5921446084976196, "rewards/reward_func/std": 0.2171451449394226, "step": 1556, "step_time": 18.4462536200881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 187.75, "completions/mean_terminated_length": 187.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.2702718712389469, "epoch": 0.07211672070402965, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011533231008797884, "kl": 0.001254095055628568, "learning_rate": 9.85585919407133e-07, "loss": 0.0001, "num_tokens": 42818602.0, "reward": 0.694277822971344, "reward_std": 0.0, "rewards/reward_func/mean": 0.694277822971344, "rewards/reward_func/std": 0.0, "step": 1557, "step_time": 22.43714876100421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 131.9375, "completions/mean_terminated_length": 131.9375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3226363807916641, "epoch": 0.07216303844372395, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008729996625334024, "kl": 0.0011182344460394233, "learning_rate": 9.85576655859194e-07, "loss": 0.0001, "num_tokens": 42841369.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1558, "step_time": 14.938069373369217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 249.5625, "completions/mean_terminated_length": 249.5625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.3340029790997505, "epoch": 0.07220935618341826, "frac_reward_zero_std": 0.0, "grad_norm": 0.09114965796470642, "kl": 0.00174188960227184, "learning_rate": 9.855673923112552e-07, "loss": -0.1493, "num_tokens": 42880370.0, "reward": 0.27733278274536133, "reward_std": 0.39612674713134766, "rewards/reward_func/mean": 0.27733278274536133, "rewards/reward_func/std": 0.39612674713134766, "step": 1559, "step_time": 33.74260265380144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 198.3125, "completions/mean_terminated_length": 198.3125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.24380797892808914, "epoch": 0.07225567392311255, "frac_reward_zero_std": 0.0, "grad_norm": 0.06857086718082428, "kl": 0.0008929253672249615, "learning_rate": 9.855581287633163e-07, "loss": -0.0055, "num_tokens": 42916183.0, "reward": 0.874759316444397, "reward_std": 0.10368803143501282, "rewards/reward_func/mean": 0.874759316444397, "rewards/reward_func/std": 0.10368802398443222, "step": 1560, "step_time": 22.03244859352708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 226.125, "completions/mean_terminated_length": 226.125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.21797719597816467, "epoch": 0.07230199166280686, "frac_reward_zero_std": 0.0, "grad_norm": 0.08479955047369003, "kl": 0.0018371949554421008, "learning_rate": 9.855488652153774e-07, "loss": -0.0467, "num_tokens": 42939257.0, "reward": 0.9113311767578125, "reward_std": 0.24497197568416595, "rewards/reward_func/mean": 0.9113311767578125, "rewards/reward_func/std": 0.24497199058532715, "step": 1561, "step_time": 25.829491283744574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 163.1875, "completions/mean_terminated_length": 163.1875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.26966360956430435, "epoch": 0.07234830940250116, "frac_reward_zero_std": 0.0, "grad_norm": 0.2612365484237671, "kl": 0.0017238709842786193, "learning_rate": 9.855396016674386e-07, "loss": -0.169, "num_tokens": 42963420.0, "reward": 0.19720938801765442, "reward_std": 0.26262229681015015, "rewards/reward_func/mean": 0.19720938801765442, "rewards/reward_func/std": 0.26262232661247253, "step": 1562, "step_time": 20.299479123204947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 222.125, "completions/mean_terminated_length": 222.125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.281155988574028, "epoch": 0.07239462714219547, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015874463133513927, "kl": 0.0014850206207484007, "learning_rate": 9.855303381194997e-07, "loss": 0.0001, "num_tokens": 43000862.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1563, "step_time": 24.06028039380908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 206.1875, "completions/mean_terminated_length": 206.1875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.33932363986968994, "epoch": 0.07244094488188976, "frac_reward_zero_std": 0.0, "grad_norm": 0.06308679282665253, "kl": 0.0017190151556860656, "learning_rate": 9.855210745715608e-07, "loss": 0.0288, "num_tokens": 43025905.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.3415650427341461, "step": 1564, "step_time": 21.441253323107958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 170.0, "completions/mean_terminated_length": 170.0, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.34898870438337326, "epoch": 0.07248726262158407, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019805605988949537, "kl": 0.0015508705982938409, "learning_rate": 9.85511811023622e-07, "loss": 0.0001, "num_tokens": 43051713.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1565, "step_time": 18.230930637568235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 264.375, "completions/mean_terminated_length": 264.375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.33082958310842514, "epoch": 0.07253358036127837, "frac_reward_zero_std": 0.0, "grad_norm": 0.062147557735443115, "kl": 0.001354100095340982, "learning_rate": 9.855025474756833e-07, "loss": 0.0063, "num_tokens": 43084087.0, "reward": 0.8738458156585693, "reward_std": 0.23302555084228516, "rewards/reward_func/mean": 0.8738458156585693, "rewards/reward_func/std": 0.23302556574344635, "step": 1566, "step_time": 28.961606048047543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 195.4375, "completions/mean_terminated_length": 195.4375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.27759377658367157, "epoch": 0.07257989810097268, "frac_reward_zero_std": 1.0, "grad_norm": 0.00172783515881747, "kl": 0.0012512272805906832, "learning_rate": 9.854932839277444e-07, "loss": 0.0001, "num_tokens": 43106286.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 1567, "step_time": 20.849223010241985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 126.625, "completions/mean_terminated_length": 126.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2594129666686058, "epoch": 0.07262621584066697, "frac_reward_zero_std": 1.0, "grad_norm": 0.001080922782421112, "kl": 0.0012128448870498687, "learning_rate": 9.854840203798053e-07, "loss": 0.0001, "num_tokens": 43125944.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1568, "step_time": 13.997385706752539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 179.9375, "completions/mean_terminated_length": 179.9375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.284223347902298, "epoch": 0.07267253358036128, "frac_reward_zero_std": 1.0, "grad_norm": 0.001301810727454722, "kl": 0.001221503916895017, "learning_rate": 9.854747568318664e-07, "loss": 0.0001, "num_tokens": 43149511.0, "reward": 0.9394130706787109, "reward_std": 0.0, "rewards/reward_func/mean": 0.9394130706787109, "rewards/reward_func/std": 0.0, "step": 1569, "step_time": 18.766128912568092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 209.8125, "completions/mean_terminated_length": 209.8125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.27987564355134964, "epoch": 0.07271885132005558, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011029279557988048, "kl": 0.001090632751584053, "learning_rate": 9.854654932839278e-07, "loss": 0.0001, "num_tokens": 43176004.0, "reward": 0.4111122786998749, "reward_std": 0.0, "rewards/reward_func/mean": 0.4111122786998749, "rewards/reward_func/std": 0.0, "step": 1570, "step_time": 21.385261174291372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 199.25, "completions/mean_terminated_length": 199.25, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.276499480009079, "epoch": 0.07276516905974989, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014302186900749803, "kl": 0.0012794569629477337, "learning_rate": 9.85456229735989e-07, "loss": 0.0001, "num_tokens": 43199784.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1571, "step_time": 20.255388107150793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 228.6875, "completions/mean_terminated_length": 228.6875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.223116934299469, "epoch": 0.07281148679944419, "frac_reward_zero_std": 1.0, "grad_norm": 0.000991158070974052, "kl": 0.0010674456279957667, "learning_rate": 9.8544696618805e-07, "loss": 0.0001, "num_tokens": 43224595.0, "reward": 0.8668779134750366, "reward_std": 0.0, "rewards/reward_func/mean": 0.8668779134750366, "rewards/reward_func/std": 0.0, "step": 1572, "step_time": 22.18226484954357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 122.0625, "completions/mean_terminated_length": 122.0625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2856691852211952, "epoch": 0.0728578045391385, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024775387719273567, "kl": 0.0017244815244339406, "learning_rate": 9.854377026401112e-07, "loss": 0.0001, "num_tokens": 43245108.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1573, "step_time": 13.81055148690939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 137.125, "completions/mean_terminated_length": 137.125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.22614125907421112, "epoch": 0.07290412227883279, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031758092809468508, "kl": 0.0012325557472649962, "learning_rate": 9.854284390921723e-07, "loss": 0.0001, "num_tokens": 43266486.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1574, "step_time": 14.72935138642788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 153.0, "completions/mean_terminated_length": 153.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.18126782774925232, "epoch": 0.0729504400185271, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023152644280344248, "kl": 0.0010906383686233312, "learning_rate": 9.854191755442334e-07, "loss": 0.0001, "num_tokens": 43294134.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1575, "step_time": 17.822266314178705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 133.9375, "completions/mean_terminated_length": 133.9375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.31912506371736526, "epoch": 0.0729967577582214, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012292397441342473, "kl": 0.001754308439558372, "learning_rate": 9.854099119962945e-07, "loss": 0.0001, "num_tokens": 43321141.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1576, "step_time": 16.450386211276054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 129.8125, "completions/mean_terminated_length": 129.8125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2551668845117092, "epoch": 0.07304307549791571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017605704488232732, "kl": 0.0014316203887574375, "learning_rate": 9.854006484483557e-07, "loss": 0.0001, "num_tokens": 43340706.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1577, "step_time": 13.634104192256927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 174.0, "completions/mean_terminated_length": 174.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.37115003913640976, "epoch": 0.07308939323761, "frac_reward_zero_std": 1.0, "grad_norm": 0.004548309370875359, "kl": 0.001881248492281884, "learning_rate": 9.853913849004168e-07, "loss": 0.0001, "num_tokens": 43364722.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1578, "step_time": 18.42423866316676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 138.6875, "completions/mean_terminated_length": 138.6875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.35234903544187546, "epoch": 0.07313571097730431, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015647612744942307, "kl": 0.0015877830155659467, "learning_rate": 9.85382121352478e-07, "loss": 0.0001, "num_tokens": 43400653.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1579, "step_time": 18.515632305294275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 158.8125, "completions/mean_terminated_length": 158.8125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3327949196100235, "epoch": 0.07318202871699861, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010463129729032516, "kl": 0.0013174892228562385, "learning_rate": 9.85372857804539e-07, "loss": 0.0001, "num_tokens": 43434346.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1580, "step_time": 22.90632711723447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 424.5625, "completions/mean_terminated_length": 424.5625, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "entropy": 0.16697721183300018, "epoch": 0.07322834645669292, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005113129736855626, "kl": 0.0006655202596448362, "learning_rate": 9.853635942566002e-07, "loss": 0.0, "num_tokens": 43472355.0, "reward": 0.7753521203994751, "reward_std": 0.0, "rewards/reward_func/mean": 0.7753521203994751, "rewards/reward_func/std": 0.0, "step": 1581, "step_time": 38.204998683184385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 127.0, "completions/mean_terminated_length": 127.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.34289418160915375, "epoch": 0.07327466419638722, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014903299743309617, "kl": 0.001732117059873417, "learning_rate": 9.853543307086613e-07, "loss": 0.0001, "num_tokens": 43495331.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1582, "step_time": 14.250513847917318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 179.0, "completions/mean_terminated_length": 179.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.336424358189106, "epoch": 0.07332098193608153, "frac_reward_zero_std": 1.0, "grad_norm": 0.002108678687363863, "kl": 0.0015903627790976316, "learning_rate": 9.853450671607226e-07, "loss": 0.0001, "num_tokens": 43524531.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1583, "step_time": 19.401917461305857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 192.875, "completions/mean_terminated_length": 192.875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.36616717278957367, "epoch": 0.07336729967577582, "frac_reward_zero_std": 0.0, "grad_norm": 0.08924608677625656, "kl": 0.0015478063724003732, "learning_rate": 9.853358036127837e-07, "loss": -0.0963, "num_tokens": 43546737.0, "reward": 0.375, "reward_std": 0.5, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5, "step": 1584, "step_time": 22.100545328110456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 164.3125, "completions/mean_terminated_length": 164.3125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.34544944763183594, "epoch": 0.07341361741547013, "frac_reward_zero_std": 1.0, "grad_norm": 0.003837409196421504, "kl": 0.0018765542190521955, "learning_rate": 9.853265400648449e-07, "loss": 0.0001, "num_tokens": 43575014.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1585, "step_time": 18.930589731782675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 131.375, "completions/mean_terminated_length": 131.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2863118126988411, "epoch": 0.07345993515516443, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011196363484486938, "kl": 0.0011951862688874826, "learning_rate": 9.853172765169058e-07, "loss": 0.0001, "num_tokens": 43602924.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1586, "step_time": 16.04774810373783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 205.25, "completions/mean_terminated_length": 205.25, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.2627539038658142, "epoch": 0.07350625289485874, "frac_reward_zero_std": 1.0, "grad_norm": 0.001070684753358364, "kl": 0.0009951856918632984, "learning_rate": 9.853080129689671e-07, "loss": 0.0, "num_tokens": 43634160.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1587, "step_time": 21.734125968068838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 180.9375, "completions/mean_terminated_length": 180.9375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.39849239587783813, "epoch": 0.07355257063455303, "frac_reward_zero_std": 1.0, "grad_norm": 0.00107945513445884, "kl": 0.0014529027102980763, "learning_rate": 9.852987494210282e-07, "loss": 0.0001, "num_tokens": 43664735.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1588, "step_time": 21.05367875471711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 222.375, "completions/mean_terminated_length": 222.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.3285161554813385, "epoch": 0.07359888837424734, "frac_reward_zero_std": 0.0, "grad_norm": 0.10005734860897064, "kl": 0.0024754175101406872, "learning_rate": 9.852894858730894e-07, "loss": 0.0074, "num_tokens": 43689813.0, "reward": 0.7110038995742798, "reward_std": 0.4143187403678894, "rewards/reward_func/mean": 0.7110038995742798, "rewards/reward_func/std": 0.4143187403678894, "step": 1589, "step_time": 24.597700908780098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 144.9375, "completions/mean_terminated_length": 144.9375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3973710164427757, "epoch": 0.07364520611394164, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014514324720948935, "kl": 0.001288296072743833, "learning_rate": 9.852802223251505e-07, "loss": 0.0001, "num_tokens": 43713028.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1590, "step_time": 15.701747439801693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 152.4375, "completions/mean_terminated_length": 152.4375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.16994201391935349, "epoch": 0.07369152385363595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014112734934315085, "kl": 0.0008843439863994718, "learning_rate": 9.852709587772116e-07, "loss": 0.0, "num_tokens": 43737995.0, "reward": 0.5736212730407715, "reward_std": 0.0, "rewards/reward_func/mean": 0.5736212730407715, "rewards/reward_func/std": 0.0, "step": 1591, "step_time": 16.465637356042862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 189.25, "completions/mean_terminated_length": 189.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.4402531236410141, "epoch": 0.07373784159333024, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018565183272585273, "kl": 0.0017671606037765741, "learning_rate": 9.852616952292727e-07, "loss": 0.0001, "num_tokens": 43759647.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1592, "step_time": 18.82415586337447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 130.75, "completions/mean_terminated_length": 130.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.16491558775305748, "epoch": 0.07378415933302455, "frac_reward_zero_std": 1.0, "grad_norm": 0.004695961717516184, "kl": 0.0017564288573339581, "learning_rate": 9.852524316813339e-07, "loss": 0.0001, "num_tokens": 43782939.0, "reward": 0.765928328037262, "reward_std": 0.0, "rewards/reward_func/mean": 0.765928328037262, "rewards/reward_func/std": 0.0, "step": 1593, "step_time": 15.761767968535423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 263.125, "completions/mean_terminated_length": 263.125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.22686590254306793, "epoch": 0.07383047707271885, "frac_reward_zero_std": 0.0, "grad_norm": 0.08620725572109222, "kl": 0.0012763368431478739, "learning_rate": 9.85243168133395e-07, "loss": 0.0026, "num_tokens": 43822381.0, "reward": 0.7883949279785156, "reward_std": 0.0050514861941337585, "rewards/reward_func/mean": 0.7883949279785156, "rewards/reward_func/std": 0.005051496438682079, "step": 1594, "step_time": 28.364187084138393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 145.5, "completions/mean_terminated_length": 145.5, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.147944588214159, "epoch": 0.07387679481241316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010562414536252618, "kl": 0.0009138431632891297, "learning_rate": 9.852339045854561e-07, "loss": 0.0, "num_tokens": 43843749.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1595, "step_time": 15.635121572762728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 147.0625, "completions/mean_terminated_length": 147.0625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.1813713274896145, "epoch": 0.07392311255210746, "frac_reward_zero_std": 1.0, "grad_norm": 0.004045617301017046, "kl": 0.0015739940863568336, "learning_rate": 9.852246410375174e-07, "loss": 0.0001, "num_tokens": 43864390.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1596, "step_time": 15.721635963767767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 345.125, "completions/mean_terminated_length": 345.125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.24516578018665314, "epoch": 0.07396943029180177, "frac_reward_zero_std": 0.0, "grad_norm": 0.07490377128124237, "kl": 0.0011413791944505647, "learning_rate": 9.852153774895786e-07, "loss": -0.0058, "num_tokens": 43901544.0, "reward": 0.9853454232215881, "reward_std": 0.008085109293460846, "rewards/reward_func/mean": 0.9853454232215881, "rewards/reward_func/std": 0.008085114881396294, "step": 1597, "step_time": 36.766721062362194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 135.9375, "completions/mean_terminated_length": 135.9375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.24951966106891632, "epoch": 0.07401574803149606, "frac_reward_zero_std": 1.0, "grad_norm": 0.004406257998198271, "kl": 0.001871462882263586, "learning_rate": 9.852061139416395e-07, "loss": 0.0001, "num_tokens": 43921367.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1598, "step_time": 15.098415672779083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 182.375, "completions/mean_terminated_length": 182.375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3961958587169647, "epoch": 0.07406206577119037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018680575303733349, "kl": 0.0016966285184025764, "learning_rate": 9.851968503937006e-07, "loss": 0.0001, "num_tokens": 43943709.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1599, "step_time": 20.12541764602065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 172.5, "completions/mean_terminated_length": 172.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.26840293034911156, "epoch": 0.07410838351088467, "frac_reward_zero_std": 0.0, "grad_norm": 0.14280129969120026, "kl": 0.002122916339430958, "learning_rate": 9.85187586845762e-07, "loss": -0.0056, "num_tokens": 43980901.0, "reward": 0.5336841344833374, "reward_std": 0.451254665851593, "rewards/reward_func/mean": 0.5336841344833374, "rewards/reward_func/std": 0.4512546956539154, "step": 1600, "step_time": 22.47697388380766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 200.0, "completions/mean_terminated_length": 200.0, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.2988668344914913, "epoch": 0.07415470125057898, "frac_reward_zero_std": 0.0, "grad_norm": 0.10289546847343445, "kl": 0.0025755142560228705, "learning_rate": 9.85178323297823e-07, "loss": 0.0135, "num_tokens": 44007541.0, "reward": 0.1463262289762497, "reward_std": 0.17135973274707794, "rewards/reward_func/mean": 0.1463262289762497, "rewards/reward_func/std": 0.17135973274707794, "step": 1601, "step_time": 22.601786609739065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 171.1875, "completions/mean_terminated_length": 171.1875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.25361642614006996, "epoch": 0.07420101899027327, "frac_reward_zero_std": 0.0, "grad_norm": 0.08131257444620132, "kl": 0.0017854655161499977, "learning_rate": 9.851690597498842e-07, "loss": 0.0436, "num_tokens": 44029720.0, "reward": 0.8720642328262329, "reward_std": 0.2398042231798172, "rewards/reward_func/mean": 0.8720642328262329, "rewards/reward_func/std": 0.2398042231798172, "step": 1602, "step_time": 19.34695564210415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 150.3125, "completions/mean_terminated_length": 150.3125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3029944971203804, "epoch": 0.07424733672996758, "frac_reward_zero_std": 1.0, "grad_norm": 0.002730795880779624, "kl": 0.001728356524836272, "learning_rate": 9.851597962019453e-07, "loss": 0.0001, "num_tokens": 44051229.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1603, "step_time": 16.14696368575096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 173.5625, "completions/mean_terminated_length": 173.5625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.2576616480946541, "epoch": 0.07429365446966188, "frac_reward_zero_std": 0.0, "grad_norm": 0.08358705043792725, "kl": 0.0013877996534574777, "learning_rate": 9.851505326540064e-07, "loss": -0.0062, "num_tokens": 44079830.0, "reward": 0.9216919541358948, "reward_std": 0.037023257464170456, "rewards/reward_func/mean": 0.9216919541358948, "rewards/reward_func/std": 0.03702325373888016, "step": 1604, "step_time": 20.236038610339165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 164.375, "completions/mean_terminated_length": 164.375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.15927428007125854, "epoch": 0.07433997220935619, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006458433927036822, "kl": 0.0007009407709119841, "learning_rate": 9.851412691060676e-07, "loss": 0.0, "num_tokens": 44116348.0, "reward": 0.8751733303070068, "reward_std": 0.0, "rewards/reward_func/mean": 0.8751733303070068, "rewards/reward_func/std": 0.0, "step": 1605, "step_time": 19.7960554510355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 124.375, "completions/mean_terminated_length": 124.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2811974287033081, "epoch": 0.07438628994905049, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008349127019755542, "kl": 0.0010238258837489411, "learning_rate": 9.851320055581287e-07, "loss": 0.0001, "num_tokens": 44140162.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1606, "step_time": 14.299282133579254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 180.75, "completions/mean_terminated_length": 180.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.34402915835380554, "epoch": 0.0744326076887448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014503782149404287, "kl": 0.0014425396511796862, "learning_rate": 9.851227420101898e-07, "loss": 0.0001, "num_tokens": 44160542.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1607, "step_time": 18.33135963231325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.20412952452898026, "epoch": 0.07447892542843909, "frac_reward_zero_std": 1.0, "grad_norm": 0.001471884548664093, "kl": 0.0010712187213357538, "learning_rate": 9.85113478462251e-07, "loss": 0.0001, "num_tokens": 44195850.0, "reward": 0.4345982074737549, "reward_std": 0.0, "rewards/reward_func/mean": 0.4345982074737549, "rewards/reward_func/std": 0.0, "step": 1608, "step_time": 26.500352159142494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 167.8125, "completions/mean_terminated_length": 167.8125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3420803025364876, "epoch": 0.0745252431681334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014416154008358717, "kl": 0.0013742977171204984, "learning_rate": 9.85104214914312e-07, "loss": 0.0001, "num_tokens": 44216631.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1609, "step_time": 18.223748851567507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 173.4375, "completions/mean_terminated_length": 173.4375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.20950258150696754, "epoch": 0.0745715609078277, "frac_reward_zero_std": 0.0, "grad_norm": 0.08828388899564743, "kl": 0.0010750300280051306, "learning_rate": 9.850949513663734e-07, "loss": -0.0267, "num_tokens": 44245278.0, "reward": 0.3630557060241699, "reward_std": 0.15361300110816956, "rewards/reward_func/mean": 0.3630557060241699, "rewards/reward_func/std": 0.15361300110816956, "step": 1610, "step_time": 19.29606306180358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 184.5, "completions/mean_terminated_length": 184.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4001855403184891, "epoch": 0.074617878647522, "frac_reward_zero_std": 0.0, "grad_norm": 0.1239161491394043, "kl": 0.0018652789876796305, "learning_rate": 9.850856878184343e-07, "loss": -0.0575, "num_tokens": 44266630.0, "reward": 0.23001110553741455, "reward_std": 0.4114563763141632, "rewards/reward_func/mean": 0.23001110553741455, "rewards/reward_func/std": 0.4114563763141632, "step": 1611, "step_time": 20.396337650716305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 185.25, "completions/mean_terminated_length": 185.25, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.25959864258766174, "epoch": 0.0746641963872163, "frac_reward_zero_std": 0.0, "grad_norm": 0.08245521783828735, "kl": 0.0014883537369314581, "learning_rate": 9.850764242704954e-07, "loss": 0.0429, "num_tokens": 44303002.0, "reward": 0.9382164478302002, "reward_std": 0.24713435769081116, "rewards/reward_func/mean": 0.9382164478302002, "rewards/reward_func/std": 0.24713437259197235, "step": 1612, "step_time": 23.64640225470066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 299.75, "completions/mean_terminated_length": 299.75, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 0.1900012157857418, "epoch": 0.07471051412691061, "frac_reward_zero_std": 0.0, "grad_norm": 0.06945158541202545, "kl": 0.001918523252243176, "learning_rate": 9.850671607225568e-07, "loss": -0.0014, "num_tokens": 44338950.0, "reward": 0.9522884488105774, "reward_std": 0.1303727775812149, "rewards/reward_func/mean": 0.9522884488105774, "rewards/reward_func/std": 0.1303727775812149, "step": 1613, "step_time": 30.280744925141335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 122.3125, "completions/mean_terminated_length": 122.3125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3087422326207161, "epoch": 0.07475683186660491, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015073319664224982, "kl": 0.0014248942316044122, "learning_rate": 9.85057897174618e-07, "loss": 0.0001, "num_tokens": 44369915.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1614, "step_time": 16.27818715199828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 211.0625, "completions/mean_terminated_length": 211.0625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.2519189082086086, "epoch": 0.07480314960629922, "frac_reward_zero_std": 0.0, "grad_norm": 0.08111374825239182, "kl": 0.0017523058922961354, "learning_rate": 9.85048633626679e-07, "loss": -0.0278, "num_tokens": 44392028.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 1615, "step_time": 20.951961275190115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 157.8125, "completions/mean_terminated_length": 157.8125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3702063113451004, "epoch": 0.07484946734599351, "frac_reward_zero_std": 1.0, "grad_norm": 0.001280409749597311, "kl": 0.0014684416237287223, "learning_rate": 9.850393700787402e-07, "loss": 0.0001, "num_tokens": 44414841.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1616, "step_time": 16.872747104614973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 119.6875, "completions/mean_terminated_length": 119.6875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3385022059082985, "epoch": 0.07489578508568782, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014231239911168814, "kl": 0.0013049719855189323, "learning_rate": 9.850301065308013e-07, "loss": 0.0001, "num_tokens": 44451044.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1617, "step_time": 16.985461961477995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 135.5625, "completions/mean_terminated_length": 135.5625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.26226063817739487, "epoch": 0.07494210282538212, "frac_reward_zero_std": 1.0, "grad_norm": 0.001386523130349815, "kl": 0.0012260834337212145, "learning_rate": 9.850208429828624e-07, "loss": 0.0001, "num_tokens": 44479293.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1618, "step_time": 15.900365706533194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 167.1875, "completions/mean_terminated_length": 167.1875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.30687885731458664, "epoch": 0.07498842056507643, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018912419909611344, "kl": 0.0014387187839020044, "learning_rate": 9.850115794349235e-07, "loss": 0.0001, "num_tokens": 44500784.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1619, "step_time": 18.744129680097103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 305.375, "completions/mean_terminated_length": 305.375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.15258891135454178, "epoch": 0.07503473830477073, "frac_reward_zero_std": 0.0, "grad_norm": 0.04254131019115448, "kl": 0.0006999806792009622, "learning_rate": 9.850023158869847e-07, "loss": -0.1472, "num_tokens": 44529510.0, "reward": 0.6036472320556641, "reward_std": 0.225793719291687, "rewards/reward_func/mean": 0.6036472320556641, "rewards/reward_func/std": 0.225793719291687, "step": 1620, "step_time": 32.11163375899196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 179.125, "completions/mean_terminated_length": 179.125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.1882282979786396, "epoch": 0.07508105604446504, "frac_reward_zero_std": 0.0, "grad_norm": 0.12540240585803986, "kl": 0.0011414197651902214, "learning_rate": 9.849930523390458e-07, "loss": -0.0573, "num_tokens": 44556328.0, "reward": 0.6468878984451294, "reward_std": 0.3517628312110901, "rewards/reward_func/mean": 0.6468878984451294, "rewards/reward_func/std": 0.3517628610134125, "step": 1621, "step_time": 20.640837877988815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 185.125, "completions/mean_terminated_length": 185.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.39952103793621063, "epoch": 0.07512737378415933, "frac_reward_zero_std": 1.0, "grad_norm": 0.001758892904035747, "kl": 0.0016571315354667604, "learning_rate": 9.84983788791107e-07, "loss": 0.0001, "num_tokens": 44584762.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1622, "step_time": 19.941069405525923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 135.0, "completions/mean_terminated_length": 135.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.29836906492710114, "epoch": 0.07517369152385364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037823189049959183, "kl": 0.0022289794869720936, "learning_rate": 9.84974525243168e-07, "loss": 0.0001, "num_tokens": 44606314.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1623, "step_time": 14.352275010198355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 142.6875, "completions/mean_terminated_length": 142.6875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3897210881114006, "epoch": 0.07522000926354794, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012487727217376232, "kl": 0.0014665639901068062, "learning_rate": 9.849652616952292e-07, "loss": 0.0001, "num_tokens": 44630213.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1624, "step_time": 16.667425740510225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 129.8125, "completions/mean_terminated_length": 129.8125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.27875569462776184, "epoch": 0.07526632700324225, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027796023059636354, "kl": 0.001816785748815164, "learning_rate": 9.849559981472903e-07, "loss": 0.0001, "num_tokens": 44652434.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1625, "step_time": 15.079639580100775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 173.6875, "completions/mean_terminated_length": 173.6875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.36077074706554413, "epoch": 0.07531264474293654, "frac_reward_zero_std": 1.0, "grad_norm": 0.002555923303589225, "kl": 0.00194956932682544, "learning_rate": 9.849467345993516e-07, "loss": 0.0001, "num_tokens": 44698413.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1626, "step_time": 24.086150523275137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 184.875, "completions/mean_terminated_length": 184.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.3735076040029526, "epoch": 0.07535896248263085, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015340592944994569, "kl": 0.0015938914439175278, "learning_rate": 9.849374710514127e-07, "loss": 0.0001, "num_tokens": 44724267.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1627, "step_time": 20.119130540639162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 135.625, "completions/mean_terminated_length": 135.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3603271543979645, "epoch": 0.07540528022232515, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019060707418248057, "kl": 0.0018551453249529004, "learning_rate": 9.849282075034739e-07, "loss": 0.0001, "num_tokens": 44746485.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1628, "step_time": 15.419531498104334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 203.3125, "completions/mean_terminated_length": 203.3125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.3615184426307678, "epoch": 0.07545159796201946, "frac_reward_zero_std": 0.0, "grad_norm": 0.10142161697149277, "kl": 0.002262524183606729, "learning_rate": 9.849189439555348e-07, "loss": -0.0191, "num_tokens": 44774042.0, "reward": 0.5301079750061035, "reward_std": 0.4834319055080414, "rewards/reward_func/mean": 0.5301079750061035, "rewards/reward_func/std": 0.48343193531036377, "step": 1629, "step_time": 21.876350447535515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 176.125, "completions/mean_terminated_length": 176.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3909710720181465, "epoch": 0.07549791570171376, "frac_reward_zero_std": 0.0, "grad_norm": 0.09627868980169296, "kl": 0.006698896875604987, "learning_rate": 9.849096804075961e-07, "loss": -0.0761, "num_tokens": 44797948.0, "reward": 0.05655233934521675, "reward_std": 0.2262093424797058, "rewards/reward_func/mean": 0.05655233934521675, "rewards/reward_func/std": 0.226209357380867, "step": 1630, "step_time": 21.573106106370687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 130.9375, "completions/mean_terminated_length": 130.9375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2726520821452141, "epoch": 0.07554423344140807, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011840288061648607, "kl": 0.0011821826919913292, "learning_rate": 9.849004168596572e-07, "loss": 0.0001, "num_tokens": 44821163.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1631, "step_time": 16.576533257961273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 188.25, "completions/mean_terminated_length": 188.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.40685446560382843, "epoch": 0.07559055118110236, "frac_reward_zero_std": 1.0, "grad_norm": 0.001472765114158392, "kl": 0.0014241637545637786, "learning_rate": 9.848911533117184e-07, "loss": 0.0001, "num_tokens": 44851631.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1632, "step_time": 20.087729662656784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 243.375, "completions/mean_terminated_length": 243.375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.4612579941749573, "epoch": 0.07563686892079667, "frac_reward_zero_std": 0.0, "grad_norm": 0.09092710167169571, "kl": 0.0021810660255141556, "learning_rate": 9.848818897637795e-07, "loss": -0.133, "num_tokens": 44887845.0, "reward": 0.059552330523729324, "reward_std": 0.2382093220949173, "rewards/reward_func/mean": 0.059552330523729324, "rewards/reward_func/std": 0.2382093369960785, "step": 1633, "step_time": 33.17604085803032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 205.0, "completions/mean_terminated_length": 205.0, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.304825134575367, "epoch": 0.07568318666049097, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015276845078915358, "kl": 0.0012091417738702148, "learning_rate": 9.848726262158406e-07, "loss": 0.0001, "num_tokens": 44912869.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1634, "step_time": 21.763545881956816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 161.4375, "completions/mean_terminated_length": 161.4375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.194634061306715, "epoch": 0.07572950440018528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013067872496321797, "kl": 0.0009507545328233391, "learning_rate": 9.848633626679017e-07, "loss": 0.0, "num_tokens": 44962972.0, "reward": 0.8890097737312317, "reward_std": 0.0, "rewards/reward_func/mean": 0.8890097737312317, "rewards/reward_func/std": 0.0, "step": 1635, "step_time": 23.300194274634123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 183.125, "completions/mean_terminated_length": 183.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.2663589380681515, "epoch": 0.07577582213987957, "frac_reward_zero_std": 0.0, "grad_norm": 0.08443693071603775, "kl": 0.0018062372982967645, "learning_rate": 9.848540991199629e-07, "loss": -0.0166, "num_tokens": 44988190.0, "reward": 0.07100986689329147, "reward_std": 0.0033722228836268187, "rewards/reward_func/mean": 0.07100986689329147, "rewards/reward_func/std": 0.003372224047780037, "step": 1636, "step_time": 19.75159054249525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 163.3125, "completions/mean_terminated_length": 163.3125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.38820867985486984, "epoch": 0.07582213987957388, "frac_reward_zero_std": 1.0, "grad_norm": 0.00270209857262671, "kl": 0.0018760527891572565, "learning_rate": 9.84844835572024e-07, "loss": 0.0001, "num_tokens": 45012515.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1637, "step_time": 17.12164083123207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 173.4375, "completions/mean_terminated_length": 173.4375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.4071737304329872, "epoch": 0.07586845761926818, "frac_reward_zero_std": 1.0, "grad_norm": 0.004578940104693174, "kl": 0.002200766874011606, "learning_rate": 9.848355720240851e-07, "loss": 0.0001, "num_tokens": 45034874.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1638, "step_time": 17.77417979389429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 171.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.383428193628788, "epoch": 0.07591477535896249, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013156926725059748, "kl": 0.001637956127524376, "learning_rate": 9.848263084761462e-07, "loss": 0.0001, "num_tokens": 45060996.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1639, "step_time": 19.570538219064474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 171.8125, "completions/mean_terminated_length": 171.8125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.38874175399541855, "epoch": 0.07596109309865678, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013463308569043875, "kl": 0.0016464151558466256, "learning_rate": 9.848170449282076e-07, "loss": 0.0001, "num_tokens": 45095553.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1640, "step_time": 21.0771097317338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 183.375, "completions/mean_terminated_length": 183.375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.25370409339666367, "epoch": 0.0760074108383511, "frac_reward_zero_std": 0.0, "grad_norm": 0.103360615670681, "kl": 0.0014762329519726336, "learning_rate": 9.848077813802685e-07, "loss": -0.0331, "num_tokens": 45121559.0, "reward": 0.8410100936889648, "reward_std": 0.2242693305015564, "rewards/reward_func/mean": 0.8410100936889648, "rewards/reward_func/std": 0.2242693454027176, "step": 1641, "step_time": 20.875794924795628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 118.3125, "completions/mean_terminated_length": 118.3125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3047468140721321, "epoch": 0.07605372857804539, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015164186479523778, "kl": 0.001252879883395508, "learning_rate": 9.847985178323296e-07, "loss": 0.0001, "num_tokens": 45144668.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1642, "step_time": 13.751232896000147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 117.0, "completions/mean_terminated_length": 117.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.24536456167697906, "epoch": 0.0761000463177397, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016447966918349266, "kl": 0.0011007676948793232, "learning_rate": 9.84789254284391e-07, "loss": 0.0001, "num_tokens": 45164012.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1643, "step_time": 12.875072479248047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 120.0, "completions/mean_terminated_length": 120.0, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.26822152733802795, "epoch": 0.076146364057434, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014926041476428509, "kl": 0.00132352503715083, "learning_rate": 9.84779990736452e-07, "loss": 0.0001, "num_tokens": 45183548.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1644, "step_time": 13.561577666550875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 116.4375, "completions/mean_terminated_length": 116.4375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.34725936502218246, "epoch": 0.0761926817971283, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010282954899594188, "kl": 0.0014080363616812974, "learning_rate": 9.847707271885132e-07, "loss": 0.0001, "num_tokens": 45206627.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1645, "step_time": 14.227275379002094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 202.625, "completions/mean_terminated_length": 202.625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.37325354665517807, "epoch": 0.0762389995368226, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016776693519204855, "kl": 0.0019049591792281717, "learning_rate": 9.847614636405743e-07, "loss": 0.0001, "num_tokens": 45231533.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1646, "step_time": 20.6542307138443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4133910909295082, "epoch": 0.07628531727651691, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022800853475928307, "kl": 0.0018994792480953038, "learning_rate": 9.847522000926355e-07, "loss": 0.0001, "num_tokens": 45252281.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1647, "step_time": 18.645898014307022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 129.8125, "completions/mean_terminated_length": 129.8125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.25476617366075516, "epoch": 0.07633163501621121, "frac_reward_zero_std": 1.0, "grad_norm": 0.002013625344261527, "kl": 0.0015150157851167023, "learning_rate": 9.847429365446966e-07, "loss": 0.0001, "num_tokens": 45272870.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1648, "step_time": 14.163351621478796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 167.375, "completions/mean_terminated_length": 167.375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.15188464522361755, "epoch": 0.07637795275590552, "frac_reward_zero_std": 0.0, "grad_norm": 0.06743628531694412, "kl": 0.0006581193738384172, "learning_rate": 9.847336729967577e-07, "loss": -0.0468, "num_tokens": 45300924.0, "reward": 0.856032133102417, "reward_std": 0.22827444970607758, "rewards/reward_func/mean": 0.856032133102417, "rewards/reward_func/std": 0.22827443480491638, "step": 1649, "step_time": 18.76086140051484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 197.0625, "completions/mean_terminated_length": 197.0625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.21695607155561447, "epoch": 0.07642427049559981, "frac_reward_zero_std": 0.0, "grad_norm": 0.0834154337644577, "kl": 0.001183260334073566, "learning_rate": 9.847244094488188e-07, "loss": -0.0288, "num_tokens": 45326669.0, "reward": 0.4902341961860657, "reward_std": 0.06334785372018814, "rewards/reward_func/mean": 0.4902341961860657, "rewards/reward_func/std": 0.06334785372018814, "step": 1650, "step_time": 20.830961029976606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 191.6875, "completions/mean_terminated_length": 191.6875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.4435461536049843, "epoch": 0.07647058823529412, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008816600893624127, "kl": 0.0012813075154554099, "learning_rate": 9.8471514590088e-07, "loss": 0.0001, "num_tokens": 45355416.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1651, "step_time": 20.361383739858866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 310.8125, "completions/mean_terminated_length": 310.8125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.2690752036869526, "epoch": 0.07651690597498842, "frac_reward_zero_std": 0.0, "grad_norm": 0.057550378143787384, "kl": 0.0015795712533872575, "learning_rate": 9.84705882352941e-07, "loss": -0.169, "num_tokens": 45382005.0, "reward": 0.8120216131210327, "reward_std": 0.3954293727874756, "rewards/reward_func/mean": 0.8120216131210327, "rewards/reward_func/std": 0.3954293727874756, "step": 1652, "step_time": 32.91077008843422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 176.5, "completions/mean_terminated_length": 176.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.21322758495807648, "epoch": 0.07656322371468273, "frac_reward_zero_std": 0.0, "grad_norm": 0.076081283390522, "kl": 0.0016789287910796702, "learning_rate": 9.846966188050024e-07, "loss": -0.04, "num_tokens": 45404445.0, "reward": 0.6782035231590271, "reward_std": 0.08704688400030136, "rewards/reward_func/mean": 0.6782035231590271, "rewards/reward_func/std": 0.08704687654972076, "step": 1653, "step_time": 18.4552771858871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 134.8125, "completions/mean_terminated_length": 134.8125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.20299510657787323, "epoch": 0.07660954145437703, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019456911832094193, "kl": 0.0011640992743195966, "learning_rate": 9.846873552570633e-07, "loss": 0.0001, "num_tokens": 45424106.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1654, "step_time": 15.224860787391663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.15970367565751076, "epoch": 0.07665585919407134, "frac_reward_zero_std": 0.0, "grad_norm": 0.28535717725753784, "kl": 0.002173017419409007, "learning_rate": 9.846780917091245e-07, "loss": -0.033, "num_tokens": 45461304.0, "reward": 0.9068578481674194, "reward_std": 0.20066522061824799, "rewards/reward_func/mean": 0.9068578481674194, "rewards/reward_func/std": 0.20066523551940918, "step": 1655, "step_time": 19.46072293817997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 178.4375, "completions/mean_terminated_length": 178.4375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.40137291699647903, "epoch": 0.07670217693376563, "frac_reward_zero_std": 1.0, "grad_norm": 0.002324209315702319, "kl": 0.0016238936514128, "learning_rate": 9.846688281611856e-07, "loss": 0.0001, "num_tokens": 45485135.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1656, "step_time": 18.17797204479575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 270.1875, "completions/mean_terminated_length": 270.1875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.4296865537762642, "epoch": 0.07674849467345994, "frac_reward_zero_std": 0.0, "grad_norm": 0.07285812497138977, "kl": 0.0013943906524218619, "learning_rate": 9.84659564613247e-07, "loss": -0.1114, "num_tokens": 45515570.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.3415650427341461, "step": 1657, "step_time": 30.241368554532528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 168.4375, "completions/mean_terminated_length": 168.4375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.38904421031475067, "epoch": 0.07679481241315424, "frac_reward_zero_std": 1.0, "grad_norm": 0.00254133902490139, "kl": 0.0017448347643949091, "learning_rate": 9.84650301065308e-07, "loss": 0.0001, "num_tokens": 45550745.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1658, "step_time": 20.820831935852766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 138.625, "completions/mean_terminated_length": 138.625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.29940513521432877, "epoch": 0.07684113015284855, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006967560038901865, "kl": 0.0010478577751200646, "learning_rate": 9.846410375173692e-07, "loss": 0.0001, "num_tokens": 45578579.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1659, "step_time": 16.639745496213436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 150.625, "completions/mean_terminated_length": 150.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3030737563967705, "epoch": 0.07688744789254284, "frac_reward_zero_std": 1.0, "grad_norm": 0.003103468334302306, "kl": 0.001979161344934255, "learning_rate": 9.846317739694303e-07, "loss": 0.0001, "num_tokens": 45601181.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1660, "step_time": 15.522049743682146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 210.125, "completions/mean_terminated_length": 210.125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3735486790537834, "epoch": 0.07693376563223715, "frac_reward_zero_std": 0.0, "grad_norm": 0.08614269644021988, "kl": 0.0014472455659415573, "learning_rate": 9.846225104214914e-07, "loss": -0.0463, "num_tokens": 45623711.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 1661, "step_time": 28.701392497867346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 112.8125, "completions/mean_terminated_length": 112.8125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2954092025756836, "epoch": 0.07698008337193145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011567731853574514, "kl": 0.0012997614976484329, "learning_rate": 9.846132468735525e-07, "loss": 0.0001, "num_tokens": 45643772.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1662, "step_time": 13.335916545242071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 118.3125, "completions/mean_terminated_length": 118.3125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.26379984617233276, "epoch": 0.07702640111162576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017720244359225035, "kl": 0.0015850586059968919, "learning_rate": 9.846039833256137e-07, "loss": 0.0001, "num_tokens": 45665745.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1663, "step_time": 14.801113799214363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 115.5625, "completions/mean_terminated_length": 115.5625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.28533728420734406, "epoch": 0.07707271885132005, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018279367359355092, "kl": 0.0016257822280749679, "learning_rate": 9.845947197776748e-07, "loss": 0.0001, "num_tokens": 45687946.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1664, "step_time": 12.986000373959541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 150.9375, "completions/mean_terminated_length": 150.9375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3908330276608467, "epoch": 0.07711903659101436, "frac_reward_zero_std": 1.0, "grad_norm": 0.00139794556889683, "kl": 0.001606762147275731, "learning_rate": 9.84585456229736e-07, "loss": 0.0001, "num_tokens": 45739417.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1665, "step_time": 23.29428631067276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 123.875, "completions/mean_terminated_length": 123.875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.29769379273056984, "epoch": 0.07716535433070866, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015150794060900807, "kl": 0.0017644141626078635, "learning_rate": 9.84576192681797e-07, "loss": 0.0001, "num_tokens": 45763639.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1666, "step_time": 14.191865853965282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 137.3125, "completions/mean_terminated_length": 137.3125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.19898555427789688, "epoch": 0.07721167207040297, "frac_reward_zero_std": 1.0, "grad_norm": 0.0067122201435267925, "kl": 0.0034142808872275054, "learning_rate": 9.845669291338582e-07, "loss": 0.0002, "num_tokens": 45794748.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1667, "step_time": 16.306269992142916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 132.6875, "completions/mean_terminated_length": 132.6875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3299274146556854, "epoch": 0.07725798981009727, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016279020346701145, "kl": 0.0014249913510866463, "learning_rate": 9.845576655859193e-07, "loss": 0.0001, "num_tokens": 45818695.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1668, "step_time": 15.443460620939732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 135.9375, "completions/mean_terminated_length": 135.9375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.245439812541008, "epoch": 0.07730430754979158, "frac_reward_zero_std": 1.0, "grad_norm": 0.001229665125720203, "kl": 0.0012690881558228284, "learning_rate": 9.845484020379804e-07, "loss": 0.0001, "num_tokens": 45838486.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1669, "step_time": 14.933586858212948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 110.5, "completions/mean_terminated_length": 110.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2375088967382908, "epoch": 0.07735062528948587, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012708210851997137, "kl": 0.0009623114892747253, "learning_rate": 9.845391384900417e-07, "loss": 0.0, "num_tokens": 45860350.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1670, "step_time": 12.734601717442274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 184.25, "completions/mean_terminated_length": 184.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3350832015275955, "epoch": 0.07739694302918018, "frac_reward_zero_std": 1.0, "grad_norm": 0.002224243711680174, "kl": 0.0026303930208086967, "learning_rate": 9.845298749421029e-07, "loss": 0.0001, "num_tokens": 45902258.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1671, "step_time": 24.80718930065632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 208.0625, "completions/mean_terminated_length": 208.0625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.2840676084160805, "epoch": 0.07744326076887448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008224576595239341, "kl": 0.0009848253830568865, "learning_rate": 9.845206113941638e-07, "loss": 0.0, "num_tokens": 45930563.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1672, "step_time": 23.42478010803461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 146.75, "completions/mean_terminated_length": 146.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.17919356748461723, "epoch": 0.07748957850856879, "frac_reward_zero_std": 1.0, "grad_norm": 0.005466792266815901, "kl": 0.00216941034886986, "learning_rate": 9.845113478462251e-07, "loss": 0.0001, "num_tokens": 45957279.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 1673, "step_time": 18.26247502863407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 193.6875, "completions/mean_terminated_length": 193.6875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.34672409296035767, "epoch": 0.07753589624826308, "frac_reward_zero_std": 0.0, "grad_norm": 0.10089165717363358, "kl": 0.0020296200236771256, "learning_rate": 9.845020842982862e-07, "loss": 0.0926, "num_tokens": 45988762.0, "reward": 0.05708860233426094, "reward_std": 0.08745308220386505, "rewards/reward_func/mean": 0.05708860233426094, "rewards/reward_func/std": 0.08745308220386505, "step": 1674, "step_time": 23.20253485813737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 250.0625, "completions/mean_terminated_length": 250.0625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.20240335538983345, "epoch": 0.0775822139879574, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012101448373869061, "kl": 0.0011594314710237086, "learning_rate": 9.844928207503474e-07, "loss": 0.0001, "num_tokens": 46024843.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1675, "step_time": 27.527564823627472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 207.125, "completions/mean_terminated_length": 207.125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.2570352144539356, "epoch": 0.07762853172765169, "frac_reward_zero_std": 0.0, "grad_norm": 0.06474600732326508, "kl": 0.0013402775075519457, "learning_rate": 9.844835572024085e-07, "loss": -0.0174, "num_tokens": 46049229.0, "reward": 0.9291586875915527, "reward_std": 0.2500021159648895, "rewards/reward_func/mean": 0.9291586875915527, "rewards/reward_func/std": 0.2500021159648895, "step": 1676, "step_time": 22.279742319136858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 162.4375, "completions/mean_terminated_length": 162.4375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.3593992218375206, "epoch": 0.077674849467346, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014101880369707942, "kl": 0.0013620714307762682, "learning_rate": 9.844742936544696e-07, "loss": 0.0001, "num_tokens": 46074676.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1677, "step_time": 17.406505409628153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 131.9375, "completions/mean_terminated_length": 131.9375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.27665281295776367, "epoch": 0.0777211672070403, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016846642829477787, "kl": 0.0013541773951146752, "learning_rate": 9.844650301065307e-07, "loss": 0.0001, "num_tokens": 46105635.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1678, "step_time": 16.720541026443243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 121.4375, "completions/mean_terminated_length": 121.4375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.37001293897628784, "epoch": 0.0777674849467346, "frac_reward_zero_std": 1.0, "grad_norm": 0.001234093215316534, "kl": 0.0012913451646454632, "learning_rate": 9.844557665585919e-07, "loss": 0.0001, "num_tokens": 46138362.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1679, "step_time": 15.906921125948429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 144.25, "completions/mean_terminated_length": 144.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.36959002166986465, "epoch": 0.0778138026864289, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019340359140187502, "kl": 0.0018135556892957538, "learning_rate": 9.84446503010653e-07, "loss": 0.0001, "num_tokens": 46164350.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1680, "step_time": 17.01923667639494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 194.875, "completions/mean_terminated_length": 194.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.34608144313097, "epoch": 0.07786012042612321, "frac_reward_zero_std": 0.0, "grad_norm": 0.10357584804296494, "kl": 0.0018900996656157076, "learning_rate": 9.844372394627141e-07, "loss": -0.0511, "num_tokens": 46198684.0, "reward": 0.4375, "reward_std": 0.5123475193977356, "rewards/reward_func/mean": 0.4375, "rewards/reward_func/std": 0.5123475790023804, "step": 1681, "step_time": 22.219603832811117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3109801709651947, "epoch": 0.07790643816581751, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035751787945628166, "kl": 0.0013909574190620333, "learning_rate": 9.844279759147752e-07, "loss": 0.0001, "num_tokens": 46229856.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1682, "step_time": 17.34016814827919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 155.3125, "completions/mean_terminated_length": 155.3125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.42632104456424713, "epoch": 0.07795275590551182, "frac_reward_zero_std": 1.0, "grad_norm": 0.001590759726241231, "kl": 0.0018257541814818978, "learning_rate": 9.844187123668366e-07, "loss": 0.0001, "num_tokens": 46271845.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1683, "step_time": 21.376418214291334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 168.3125, "completions/mean_terminated_length": 168.3125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4322524890303612, "epoch": 0.07799907364520611, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030773428734391928, "kl": 0.00173371157143265, "learning_rate": 9.844094488188977e-07, "loss": 0.0001, "num_tokens": 46305066.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1684, "step_time": 20.002454344183207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 127.4375, "completions/mean_terminated_length": 127.4375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2681782655417919, "epoch": 0.07804539138490042, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013541457010433078, "kl": 0.0009537218720652163, "learning_rate": 9.844001852709586e-07, "loss": 0.0, "num_tokens": 46335537.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1685, "step_time": 16.65060442686081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 147.25, "completions/mean_terminated_length": 147.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2002566084265709, "epoch": 0.07809170912459472, "frac_reward_zero_std": 0.0, "grad_norm": 0.11434628814458847, "kl": 0.0013527112314477563, "learning_rate": 9.843909217230197e-07, "loss": -0.0046, "num_tokens": 46356949.0, "reward": 0.366998553276062, "reward_std": 0.18222030997276306, "rewards/reward_func/mean": 0.366998553276062, "rewards/reward_func/std": 0.18222030997276306, "step": 1686, "step_time": 16.181255109608173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 134.1875, "completions/mean_terminated_length": 134.1875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.25187139958143234, "epoch": 0.07813802686428903, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027301819063723087, "kl": 0.0014639183063991368, "learning_rate": 9.84381658175081e-07, "loss": 0.0001, "num_tokens": 46376760.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1687, "step_time": 14.960896357893944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 128.125, "completions/mean_terminated_length": 128.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.33778195083141327, "epoch": 0.07818434460398332, "frac_reward_zero_std": 1.0, "grad_norm": 0.001593102584592998, "kl": 0.0016300349379889667, "learning_rate": 9.843723946271422e-07, "loss": 0.0001, "num_tokens": 46412586.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1688, "step_time": 17.47895924001932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 138.6875, "completions/mean_terminated_length": 138.6875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.43217572569847107, "epoch": 0.07823066234367763, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013693617656826973, "kl": 0.0015838368562981486, "learning_rate": 9.843631310792033e-07, "loss": 0.0001, "num_tokens": 46436965.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1689, "step_time": 14.800542384386063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 140.25, "completions/mean_terminated_length": 140.25, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.38462790846824646, "epoch": 0.07827698008337193, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022047506645321846, "kl": 0.0020656742271967232, "learning_rate": 9.843538675312645e-07, "loss": 0.0001, "num_tokens": 46470089.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1690, "step_time": 18.441659960895777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 343.1875, "completions/mean_terminated_length": 343.1875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.32002317160367966, "epoch": 0.07832329782306624, "frac_reward_zero_std": 0.0, "grad_norm": 0.05422141030430794, "kl": 0.001787881599739194, "learning_rate": 9.843446039833256e-07, "loss": -0.1295, "num_tokens": 46499356.0, "reward": 0.6308025121688843, "reward_std": 0.39480242133140564, "rewards/reward_func/mean": 0.6308025121688843, "rewards/reward_func/std": 0.39480242133140564, "step": 1691, "step_time": 36.27641510590911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 188.25, "completions/mean_terminated_length": 188.25, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.3893887996673584, "epoch": 0.07836961556276054, "frac_reward_zero_std": 1.0, "grad_norm": 0.002944176783785224, "kl": 0.0020869087893515825, "learning_rate": 9.843353404353867e-07, "loss": 0.0001, "num_tokens": 46530816.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1692, "step_time": 21.236405465751886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 164.875, "completions/mean_terminated_length": 164.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.29504361748695374, "epoch": 0.07841593330245485, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019446397200226784, "kl": 0.001616210414795205, "learning_rate": 9.843260768874478e-07, "loss": 0.0001, "num_tokens": 46560142.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1693, "step_time": 18.738793417811394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 126.8125, "completions/mean_terminated_length": 126.8125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3024235740303993, "epoch": 0.07846225104214914, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009588732500560582, "kl": 0.001039199487422593, "learning_rate": 9.84316813339509e-07, "loss": 0.0001, "num_tokens": 46580699.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1694, "step_time": 14.138744950294495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.4446135610342026, "epoch": 0.07850856878184345, "frac_reward_zero_std": 1.0, "grad_norm": 0.0046493783593177795, "kl": 0.0029429663554765284, "learning_rate": 9.8430754979157e-07, "loss": 0.0001, "num_tokens": 46625347.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1695, "step_time": 23.11709763109684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 152.3125, "completions/mean_terminated_length": 152.3125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.36051344126462936, "epoch": 0.07855488652153775, "frac_reward_zero_std": 1.0, "grad_norm": 0.001356164226308465, "kl": 0.0017850837029982358, "learning_rate": 9.842982862436314e-07, "loss": 0.0001, "num_tokens": 46647416.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1696, "step_time": 16.97948555275798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 151.6875, "completions/mean_terminated_length": 151.6875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.14736872911453247, "epoch": 0.07860120426123206, "frac_reward_zero_std": 0.0, "grad_norm": 0.07276488840579987, "kl": 0.003640395647380501, "learning_rate": 9.842890226956923e-07, "loss": -0.0075, "num_tokens": 46669267.0, "reward": 0.9060009717941284, "reward_std": 0.18091386556625366, "rewards/reward_func/mean": 0.9060009717941284, "rewards/reward_func/std": 0.18091386556625366, "step": 1697, "step_time": 15.534895148128271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 133.625, "completions/mean_terminated_length": 133.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3113630935549736, "epoch": 0.07864752200092635, "frac_reward_zero_std": 1.0, "grad_norm": 0.003892549080774188, "kl": 0.0018110026721842587, "learning_rate": 9.842797591477535e-07, "loss": 0.0001, "num_tokens": 46698845.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1698, "step_time": 15.903801303356886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 167.125, "completions/mean_terminated_length": 167.125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.2357831597328186, "epoch": 0.07869383974062066, "frac_reward_zero_std": 0.0, "grad_norm": 0.11127041280269623, "kl": 0.0028912353445775807, "learning_rate": 9.842704955998146e-07, "loss": 0.0851, "num_tokens": 46720431.0, "reward": 0.1674453616142273, "reward_std": 0.06536397337913513, "rewards/reward_func/mean": 0.1674453616142273, "rewards/reward_func/std": 0.06536397337913513, "step": 1699, "step_time": 21.52933993190527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 164.9375, "completions/mean_terminated_length": 164.9375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.14909964427351952, "epoch": 0.07874015748031496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006626087124459445, "kl": 0.0005348598788259551, "learning_rate": 9.84261232051876e-07, "loss": 0.0, "num_tokens": 46749582.0, "reward": 0.9394130706787109, "reward_std": 0.0, "rewards/reward_func/mean": 0.9394130706787109, "rewards/reward_func/std": 0.0, "step": 1700, "step_time": 17.675500009208918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 127.0625, "completions/mean_terminated_length": 127.0625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3040238469839096, "epoch": 0.07878647522000927, "frac_reward_zero_std": 1.0, "grad_norm": 0.001297865412198007, "kl": 0.0013960163632873446, "learning_rate": 9.84251968503937e-07, "loss": 0.0001, "num_tokens": 46778047.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1701, "step_time": 15.285749927163124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 218.9375, "completions/mean_terminated_length": 218.9375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.18902039900422096, "epoch": 0.07883279295970357, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023403866216540337, "kl": 0.0017285541980527341, "learning_rate": 9.842427049559982e-07, "loss": 0.0001, "num_tokens": 46800206.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1702, "step_time": 21.16633603721857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 233.8125, "completions/mean_terminated_length": 233.8125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.3055119514465332, "epoch": 0.07887911069939788, "frac_reward_zero_std": 0.0, "grad_norm": 0.08222231268882751, "kl": 0.0018914344254881144, "learning_rate": 9.842334414080593e-07, "loss": -0.0129, "num_tokens": 46836651.0, "reward": 0.16383880376815796, "reward_std": 0.35467827320098877, "rewards/reward_func/mean": 0.16383880376815796, "rewards/reward_func/std": 0.35467830300331116, "step": 1703, "step_time": 27.084114365279675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 146.25, "completions/mean_terminated_length": 146.25, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3746067136526108, "epoch": 0.07892542843909217, "frac_reward_zero_std": 1.0, "grad_norm": 0.001366176176816225, "kl": 0.0018012887449003756, "learning_rate": 9.842241778601204e-07, "loss": 0.0001, "num_tokens": 46872207.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1704, "step_time": 19.433010537177324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 185.5625, "completions/mean_terminated_length": 185.5625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.41927726566791534, "epoch": 0.07897174617878648, "frac_reward_zero_std": 1.0, "grad_norm": 0.00419262982904911, "kl": 0.0023562940768897533, "learning_rate": 9.842149143121815e-07, "loss": 0.0001, "num_tokens": 46898568.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1705, "step_time": 18.908721026033163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 127.875, "completions/mean_terminated_length": 127.875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3284745365381241, "epoch": 0.07901806391848078, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015150196850299835, "kl": 0.0015281571249943227, "learning_rate": 9.842056507642427e-07, "loss": 0.0001, "num_tokens": 46922838.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1706, "step_time": 14.719692457467318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 167.5625, "completions/mean_terminated_length": 167.5625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.35932251065969467, "epoch": 0.07906438165817509, "frac_reward_zero_std": 1.0, "grad_norm": 0.002645768690854311, "kl": 0.002208069316111505, "learning_rate": 9.841963872163038e-07, "loss": 0.0001, "num_tokens": 46945167.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1707, "step_time": 17.174181506037712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 142.125, "completions/mean_terminated_length": 142.125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.21813251078128815, "epoch": 0.07911069939786938, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010103777749463916, "kl": 0.0010198226082138717, "learning_rate": 9.84187123668365e-07, "loss": 0.0001, "num_tokens": 46964801.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1708, "step_time": 15.079619221389294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 279.6875, "completions/mean_terminated_length": 279.6875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "entropy": 0.5472236573696136, "epoch": 0.07915701713756369, "frac_reward_zero_std": 0.0, "grad_norm": 0.0745278149843216, "kl": 0.0019589700386859477, "learning_rate": 9.84177860120426e-07, "loss": 0.1951, "num_tokens": 46991484.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 1709, "step_time": 38.763111498206854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 162.375, "completions/mean_terminated_length": 162.375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.17142558470368385, "epoch": 0.07920333487725799, "frac_reward_zero_std": 1.0, "grad_norm": 0.008708245120942593, "kl": 0.004739268129924312, "learning_rate": 9.841685965724872e-07, "loss": 0.0002, "num_tokens": 47038802.0, "reward": 0.8337529301643372, "reward_std": 0.0, "rewards/reward_func/mean": 0.8337529301643372, "rewards/reward_func/std": 0.0, "step": 1710, "step_time": 23.529833510518074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 190.125, "completions/mean_terminated_length": 190.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.2060224413871765, "epoch": 0.0792496526169523, "frac_reward_zero_std": 0.0, "grad_norm": 0.10727056860923767, "kl": 0.0009348559251520783, "learning_rate": 9.841593330245483e-07, "loss": -0.0368, "num_tokens": 47064644.0, "reward": 0.9850083589553833, "reward_std": 0.03223112225532532, "rewards/reward_func/mean": 0.9850083589553833, "rewards/reward_func/std": 0.03223112225532532, "step": 1711, "step_time": 21.429081067442894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 104.375, "completions/mean_terminated_length": 104.375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.23032863065600395, "epoch": 0.0792959703566466, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013954452006146312, "kl": 0.0011873962357640266, "learning_rate": 9.841500694766094e-07, "loss": 0.0001, "num_tokens": 47083994.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1712, "step_time": 11.449270065873861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 282.375, "completions/mean_terminated_length": 282.375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.24487724155187607, "epoch": 0.0793422880963409, "frac_reward_zero_std": 0.0, "grad_norm": 0.06413564085960388, "kl": 0.0019074836163781583, "learning_rate": 9.841408059286708e-07, "loss": -0.0305, "num_tokens": 47124448.0, "reward": 0.5864511728286743, "reward_std": 0.11027967929840088, "rewards/reward_func/mean": 0.5864511728286743, "rewards/reward_func/std": 0.11027967929840088, "step": 1713, "step_time": 33.45283553749323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 149.25, "completions/mean_terminated_length": 149.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3016955330967903, "epoch": 0.0793886058360352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009356994414702058, "kl": 0.0010292467050021514, "learning_rate": 9.841315423807319e-07, "loss": 0.0001, "num_tokens": 47159668.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1714, "step_time": 19.849836815148592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 155.875, "completions/mean_terminated_length": 155.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3925165981054306, "epoch": 0.07943492357572951, "frac_reward_zero_std": 1.0, "grad_norm": 0.002282223431393504, "kl": 0.0024772547476459295, "learning_rate": 9.841222788327928e-07, "loss": 0.0001, "num_tokens": 47211378.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1715, "step_time": 23.54809008166194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 130.25, "completions/mean_terminated_length": 130.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.23693326488137245, "epoch": 0.0794812413154238, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012495475821197033, "kl": 0.001155626232502982, "learning_rate": 9.84113015284854e-07, "loss": 0.0001, "num_tokens": 47231062.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1716, "step_time": 14.24133886769414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 142.0, "completions/mean_terminated_length": 142.0, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3047955483198166, "epoch": 0.07952755905511812, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009511947864666581, "kl": 0.0011468870943645015, "learning_rate": 9.841037517369153e-07, "loss": 0.0001, "num_tokens": 47252758.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1717, "step_time": 16.200883217155933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 226.4375, "completions/mean_terminated_length": 226.4375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.2305188961327076, "epoch": 0.07957387679481241, "frac_reward_zero_std": 1.0, "grad_norm": 0.00238050171174109, "kl": 0.001595924055436626, "learning_rate": 9.840944881889764e-07, "loss": 0.0001, "num_tokens": 47283325.0, "reward": 0.6466840505599976, "reward_std": 0.0, "rewards/reward_func/mean": 0.6466840505599976, "rewards/reward_func/std": 0.0, "step": 1718, "step_time": 22.60474267974496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 133.875, "completions/mean_terminated_length": 133.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.30449212342500687, "epoch": 0.07962019453450672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012453654780983925, "kl": 0.0012665226822718978, "learning_rate": 9.840852246410375e-07, "loss": 0.0001, "num_tokens": 47305531.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1719, "step_time": 14.574725233018398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 168.875, "completions/mean_terminated_length": 168.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.25209128856658936, "epoch": 0.07966651227420102, "frac_reward_zero_std": 0.0, "grad_norm": 0.1827867031097412, "kl": 0.004445167491212487, "learning_rate": 9.840759610930986e-07, "loss": -0.0504, "num_tokens": 47334121.0, "reward": 0.8146799206733704, "reward_std": 0.31801846623420715, "rewards/reward_func/mean": 0.8146799206733704, "rewards/reward_func/std": 0.31801849603652954, "step": 1720, "step_time": 20.539701025933027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 162.25, "completions/mean_terminated_length": 162.25, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.3810490518808365, "epoch": 0.07971283001389533, "frac_reward_zero_std": 1.0, "grad_norm": 0.005509552545845509, "kl": 0.0023259613662958145, "learning_rate": 9.840666975451598e-07, "loss": 0.0001, "num_tokens": 47365661.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1721, "step_time": 18.761831019073725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 227.3125, "completions/mean_terminated_length": 227.3125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.423902727663517, "epoch": 0.07975914775358962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020497653167694807, "kl": 0.001853982568718493, "learning_rate": 9.840574339972209e-07, "loss": 0.0001, "num_tokens": 47398066.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1722, "step_time": 28.940093513578176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 171.1875, "completions/mean_terminated_length": 171.1875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.38299621641635895, "epoch": 0.07980546549328393, "frac_reward_zero_std": 0.0, "grad_norm": 0.09138719737529755, "kl": 0.0015985459904186428, "learning_rate": 9.84048170449282e-07, "loss": 0.0412, "num_tokens": 47422917.0, "reward": 0.033777061849832535, "reward_std": 0.13510826230049133, "rewards/reward_func/mean": 0.033777061849832535, "rewards/reward_func/std": 0.13510826230049133, "step": 1723, "step_time": 20.05716634169221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 152.875, "completions/mean_terminated_length": 152.875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.446855790913105, "epoch": 0.07985178323297823, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012937851715832949, "kl": 0.001694143924396485, "learning_rate": 9.840389069013431e-07, "loss": 0.0001, "num_tokens": 47465747.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1724, "step_time": 20.9438929669559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 146.0, "completions/mean_terminated_length": 146.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.33595968037843704, "epoch": 0.07989810097267254, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026925739366561174, "kl": 0.0018194759904872626, "learning_rate": 9.840296433534043e-07, "loss": 0.0001, "num_tokens": 47487043.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1725, "step_time": 17.122764468193054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 201.5, "completions/mean_terminated_length": 201.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.18933162838220596, "epoch": 0.07994441871236684, "frac_reward_zero_std": 0.0, "grad_norm": 0.08614408224821091, "kl": 0.0019587448332458735, "learning_rate": 9.840203798054654e-07, "loss": 0.0678, "num_tokens": 47508763.0, "reward": 0.6217309236526489, "reward_std": 0.004086330533027649, "rewards/reward_func/mean": 0.6217309236526489, "rewards/reward_func/std": 0.004086339380592108, "step": 1726, "step_time": 22.153468146920204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 120.0625, "completions/mean_terminated_length": 120.0625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.23470370844006538, "epoch": 0.07999073645206115, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011941755656152964, "kl": 0.001213880255818367, "learning_rate": 9.840111162575267e-07, "loss": 0.0001, "num_tokens": 47528012.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1727, "step_time": 13.319803450256586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 190.375, "completions/mean_terminated_length": 190.375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.36268921941518784, "epoch": 0.08003705419175544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014123907312750816, "kl": 0.0016592381580267102, "learning_rate": 9.840018527095876e-07, "loss": 0.0001, "num_tokens": 47560370.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1728, "step_time": 23.27802597731352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 129.5, "completions/mean_terminated_length": 129.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.25399216264486313, "epoch": 0.08008337193144975, "frac_reward_zero_std": 1.0, "grad_norm": 0.004883928690105677, "kl": 0.0017067000735551119, "learning_rate": 9.839925891616488e-07, "loss": 0.0001, "num_tokens": 47582538.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1729, "step_time": 14.92527623474598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 137.6875, "completions/mean_terminated_length": 137.6875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3258182033896446, "epoch": 0.08012968967114405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011030449531972408, "kl": 0.0015004372398834676, "learning_rate": 9.8398332561371e-07, "loss": 0.0001, "num_tokens": 47608437.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1730, "step_time": 16.201912455260754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 175.4375, "completions/mean_terminated_length": 175.4375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.2518988251686096, "epoch": 0.08017600741083836, "frac_reward_zero_std": 0.0, "grad_norm": 0.09052787721157074, "kl": 0.0013592242321465164, "learning_rate": 9.839740620657712e-07, "loss": -0.0693, "num_tokens": 47629500.0, "reward": 0.809805154800415, "reward_std": 0.29135626554489136, "rewards/reward_func/mean": 0.809805154800415, "rewards/reward_func/std": 0.29135629534721375, "step": 1731, "step_time": 18.97813592478633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 150.375, "completions/mean_terminated_length": 150.375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.40458470582962036, "epoch": 0.08022232515053265, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010849455138668418, "kl": 0.001426139788236469, "learning_rate": 9.839647985178323e-07, "loss": 0.0001, "num_tokens": 47652658.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1732, "step_time": 18.86000870913267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 169.3125, "completions/mean_terminated_length": 169.3125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.2905171662569046, "epoch": 0.08026864289022696, "frac_reward_zero_std": 0.0, "grad_norm": 0.11406318098306656, "kl": 0.0018576665024738759, "learning_rate": 9.839555349698935e-07, "loss": -0.0569, "num_tokens": 47673703.0, "reward": 0.7525621652603149, "reward_std": 0.3050681948661804, "rewards/reward_func/mean": 0.7525621652603149, "rewards/reward_func/std": 0.3050681948661804, "step": 1733, "step_time": 16.87060249224305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 129.4375, "completions/mean_terminated_length": 129.4375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2552470825612545, "epoch": 0.08031496062992126, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012431704672053456, "kl": 0.0010684909793781117, "learning_rate": 9.839462714219546e-07, "loss": 0.0001, "num_tokens": 47695742.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1734, "step_time": 13.730560723692179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 252.0, "completions/mean_terminated_length": 252.0, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.24373339861631393, "epoch": 0.08036127836961557, "frac_reward_zero_std": 0.0, "grad_norm": 0.10387435555458069, "kl": 0.0017696096329018474, "learning_rate": 9.839370078740157e-07, "loss": -0.041, "num_tokens": 47731102.0, "reward": 0.6591430902481079, "reward_std": 0.09329462051391602, "rewards/reward_func/mean": 0.6591430902481079, "rewards/reward_func/std": 0.09329462051391602, "step": 1735, "step_time": 27.556267201900482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 253.5625, "completions/mean_terminated_length": 253.5625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.43792251497507095, "epoch": 0.08040759610930986, "frac_reward_zero_std": 0.0, "grad_norm": 0.08416919410228729, "kl": 0.001997353887418285, "learning_rate": 9.839277443260768e-07, "loss": -0.2162, "num_tokens": 47771975.0, "reward": 0.006604422815144062, "reward_std": 0.012659032829105854, "rewards/reward_func/mean": 0.006604422815144062, "rewards/reward_func/std": 0.012659032829105854, "step": 1736, "step_time": 38.68926587700844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 136.375, "completions/mean_terminated_length": 136.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2899010255932808, "epoch": 0.08045391384900417, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010311250807717443, "kl": 0.0012446382024791092, "learning_rate": 9.83918480778138e-07, "loss": 0.0001, "num_tokens": 47805373.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1737, "step_time": 17.425171364098787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 172.75, "completions/mean_terminated_length": 172.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.3713117316365242, "epoch": 0.08050023158869847, "frac_reward_zero_std": 1.0, "grad_norm": 0.00268686399795115, "kl": 0.0017970122571568936, "learning_rate": 9.83909217230199e-07, "loss": 0.0001, "num_tokens": 47826905.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1738, "step_time": 18.37579194828868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 139.5, "completions/mean_terminated_length": 139.5, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.2326408438384533, "epoch": 0.08054654932839278, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019349164795130491, "kl": 0.0018987695802934468, "learning_rate": 9.838999536822602e-07, "loss": 0.0001, "num_tokens": 47847873.0, "reward": 0.0018635177984833717, "reward_std": 0.0, "rewards/reward_func/mean": 0.0018635177984833717, "rewards/reward_func/std": 0.0, "step": 1739, "step_time": 16.576042093336582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 186.5625, "completions/mean_terminated_length": 186.5625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.32996729016304016, "epoch": 0.08059286706808708, "frac_reward_zero_std": 0.0, "grad_norm": 0.08055779337882996, "kl": 0.0017422466480638832, "learning_rate": 9.838906901343213e-07, "loss": -0.001, "num_tokens": 47869818.0, "reward": 0.023791268467903137, "reward_std": 0.0015891696093603969, "rewards/reward_func/mean": 0.023791268467903137, "rewards/reward_func/std": 0.0015891696093603969, "step": 1740, "step_time": 20.29440288618207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 120.4375, "completions/mean_terminated_length": 120.4375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.29793283343315125, "epoch": 0.08063918480778139, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008918229723349214, "kl": 0.001052564402925782, "learning_rate": 9.838814265863825e-07, "loss": 0.0001, "num_tokens": 47890961.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1741, "step_time": 13.468116946518421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 138.375, "completions/mean_terminated_length": 138.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.17352503538131714, "epoch": 0.08068550254747568, "frac_reward_zero_std": 1.0, "grad_norm": 0.001970309065654874, "kl": 0.0015197321772575378, "learning_rate": 9.838721630384436e-07, "loss": 0.0001, "num_tokens": 47916775.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1742, "step_time": 15.390880346298218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 156.3125, "completions/mean_terminated_length": 156.3125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3092532157897949, "epoch": 0.08073182028716999, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022548986598849297, "kl": 0.0017885872512124479, "learning_rate": 9.83862899490505e-07, "loss": 0.0001, "num_tokens": 47939660.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1743, "step_time": 17.08356538042426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 161.875, "completions/mean_terminated_length": 161.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.2483629770576954, "epoch": 0.08077813802686429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010942198568955064, "kl": 0.0010861923801712692, "learning_rate": 9.83853635942566e-07, "loss": 0.0001, "num_tokens": 47963962.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 1744, "step_time": 18.27033767849207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2996247857809067, "epoch": 0.0808244557665586, "frac_reward_zero_std": 1.0, "grad_norm": 0.002985493279993534, "kl": 0.0020667205681093037, "learning_rate": 9.838443723946272e-07, "loss": 0.0001, "num_tokens": 47984328.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1745, "step_time": 16.176025312393904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 133.75, "completions/mean_terminated_length": 133.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.370588019490242, "epoch": 0.0808707735062529, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020940375979989767, "kl": 0.0019824077317025512, "learning_rate": 9.83835108846688e-07, "loss": 0.0001, "num_tokens": 48008948.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1746, "step_time": 16.244132790714502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 125.375, "completions/mean_terminated_length": 125.375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2751948833465576, "epoch": 0.0809170912459472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013756590196862817, "kl": 0.0014317425666376948, "learning_rate": 9.838258452987494e-07, "loss": 0.0001, "num_tokens": 48029546.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1747, "step_time": 14.355286739766598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 221.5, "completions/mean_terminated_length": 221.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.2890290319919586, "epoch": 0.0809634089856415, "frac_reward_zero_std": 0.0, "grad_norm": 0.09583621472120285, "kl": 0.002892935706768185, "learning_rate": 9.838165817508106e-07, "loss": -0.013, "num_tokens": 48051314.0, "reward": 0.8611046075820923, "reward_std": 0.33824291825294495, "rewards/reward_func/mean": 0.8611046075820923, "rewards/reward_func/std": 0.33824291825294495, "step": 1748, "step_time": 27.2410273514688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 174.25, "completions/mean_terminated_length": 174.25, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.3782690018415451, "epoch": 0.08100972672533581, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019277514657005668, "kl": 0.0020270912791602314, "learning_rate": 9.838073182028717e-07, "loss": 0.0001, "num_tokens": 48072854.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1749, "step_time": 19.19731855392456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 174.5625, "completions/mean_terminated_length": 174.5625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.34312743693590164, "epoch": 0.0810560444650301, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017144788289442658, "kl": 0.001481565908761695, "learning_rate": 9.837980546549328e-07, "loss": 0.0001, "num_tokens": 48104719.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1750, "step_time": 22.33763938769698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 178.375, "completions/mean_terminated_length": 178.375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.1942046657204628, "epoch": 0.08110236220472442, "frac_reward_zero_std": 0.0, "grad_norm": 0.08009286224842072, "kl": 0.0013910191337345168, "learning_rate": 9.83788791106994e-07, "loss": -0.0371, "num_tokens": 48133925.0, "reward": 0.5303791761398315, "reward_std": 0.20773735642433167, "rewards/reward_func/mean": 0.5303791761398315, "rewards/reward_func/std": 0.20773734152317047, "step": 1751, "step_time": 19.963861864060163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 112.0625, "completions/mean_terminated_length": 112.0625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2743670344352722, "epoch": 0.08114867994441871, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011545876041054726, "kl": 0.0012253207387402654, "learning_rate": 9.83779527559055e-07, "loss": 0.0001, "num_tokens": 48154982.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1752, "step_time": 12.302139397710562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 142.75, "completions/mean_terminated_length": 142.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.30697520077228546, "epoch": 0.08119499768411302, "frac_reward_zero_std": 1.0, "grad_norm": 0.002024251502007246, "kl": 0.0017657809075899422, "learning_rate": 9.837702640111162e-07, "loss": 0.0001, "num_tokens": 48189938.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1753, "step_time": 18.834266159683466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 157.5625, "completions/mean_terminated_length": 157.5625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.21398412063717842, "epoch": 0.08124131542380732, "frac_reward_zero_std": 0.0, "grad_norm": 0.08869022876024246, "kl": 0.0008632429380668327, "learning_rate": 9.837610004631773e-07, "loss": -0.0103, "num_tokens": 48214203.0, "reward": 0.874308168888092, "reward_std": 0.03275489807128906, "rewards/reward_func/mean": 0.874308168888092, "rewards/reward_func/std": 0.03275489807128906, "step": 1754, "step_time": 17.75719940289855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 124.875, "completions/mean_terminated_length": 124.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3036256954073906, "epoch": 0.08128763316350163, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011545493034645915, "kl": 0.0013409795064944774, "learning_rate": 9.837517369152384e-07, "loss": 0.0001, "num_tokens": 48240601.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1755, "step_time": 15.568399280309677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 187.125, "completions/mean_terminated_length": 187.125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3096730038523674, "epoch": 0.08133395090319592, "frac_reward_zero_std": 0.0, "grad_norm": 0.10500717908143997, "kl": 0.001821783254854381, "learning_rate": 9.837424733672995e-07, "loss": 0.0119, "num_tokens": 48263531.0, "reward": 0.8728713393211365, "reward_std": 0.23276567459106445, "rewards/reward_func/mean": 0.8728713393211365, "rewards/reward_func/std": 0.23276568949222565, "step": 1756, "step_time": 20.194113560020924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 195.9375, "completions/mean_terminated_length": 195.9375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.1552904136478901, "epoch": 0.08138026864289023, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013945831451565027, "kl": 0.0010427927481941879, "learning_rate": 9.837332098193609e-07, "loss": 0.0001, "num_tokens": 48288202.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1757, "step_time": 20.05296416208148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 124.75, "completions/mean_terminated_length": 124.75, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.32303228229284286, "epoch": 0.08142658638258453, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024470367934554815, "kl": 0.0021272314770612866, "learning_rate": 9.837239462714218e-07, "loss": 0.0001, "num_tokens": 48309494.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1758, "step_time": 13.834779296070337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 147.6875, "completions/mean_terminated_length": 147.6875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3536003828048706, "epoch": 0.08147290412227884, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012544355122372508, "kl": 0.0014445357373915613, "learning_rate": 9.83714682723483e-07, "loss": 0.0001, "num_tokens": 48333233.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1759, "step_time": 16.417766705155373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.33668191730976105, "epoch": 0.08151922186197313, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013698124093934894, "kl": 0.0015107369981706142, "learning_rate": 9.837054191755443e-07, "loss": 0.0001, "num_tokens": 48358667.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1760, "step_time": 17.65128844976425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 131.5625, "completions/mean_terminated_length": 131.5625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3014110028743744, "epoch": 0.08156553960166744, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027149177622050047, "kl": 0.0019454198190942407, "learning_rate": 9.836961556276054e-07, "loss": 0.0001, "num_tokens": 48380948.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1761, "step_time": 14.282371658831835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 209.5625, "completions/mean_terminated_length": 209.5625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.4765824005007744, "epoch": 0.08161185734136174, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034426741767674685, "kl": 0.003096617292612791, "learning_rate": 9.836868920796665e-07, "loss": 0.0002, "num_tokens": 48407821.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1762, "step_time": 23.894170958548784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 130.9375, "completions/mean_terminated_length": 130.9375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.29027804732322693, "epoch": 0.08165817508105605, "frac_reward_zero_std": 1.0, "grad_norm": 0.001258092699572444, "kl": 0.0011905626743100584, "learning_rate": 9.836776285317276e-07, "loss": 0.0001, "num_tokens": 48432204.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1763, "step_time": 15.331847336143255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 159.875, "completions/mean_terminated_length": 159.875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.31587763130664825, "epoch": 0.08170449282075035, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023555525112897158, "kl": 0.001577354152686894, "learning_rate": 9.836683649837888e-07, "loss": 0.0001, "num_tokens": 48454378.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1764, "step_time": 16.64985018968582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 156.5625, "completions/mean_terminated_length": 156.5625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4026193544268608, "epoch": 0.08175081056044466, "frac_reward_zero_std": 1.0, "grad_norm": 0.000793006329331547, "kl": 0.001331146020675078, "learning_rate": 9.836591014358499e-07, "loss": 0.0001, "num_tokens": 48484691.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1765, "step_time": 18.201582103967667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 178.8125, "completions/mean_terminated_length": 178.8125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.3697373867034912, "epoch": 0.08179712830013895, "frac_reward_zero_std": 1.0, "grad_norm": 0.003388105658814311, "kl": 0.0022740360582247376, "learning_rate": 9.83649837887911e-07, "loss": 0.0001, "num_tokens": 48518400.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1766, "step_time": 20.224620919674635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.19097772240638733, "epoch": 0.08184344603983326, "frac_reward_zero_std": 0.0, "grad_norm": 0.0848284438252449, "kl": 0.0007946285040816292, "learning_rate": 9.836405743399721e-07, "loss": -0.0531, "num_tokens": 48553698.0, "reward": 0.9223366379737854, "reward_std": 0.11897119879722595, "rewards/reward_func/mean": 0.9223366379737854, "rewards/reward_func/std": 0.11897119134664536, "step": 1767, "step_time": 18.881062146276236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 142.1875, "completions/mean_terminated_length": 142.1875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.305821530520916, "epoch": 0.08188976377952756, "frac_reward_zero_std": 1.0, "grad_norm": 0.003025626763701439, "kl": 0.003023940953426063, "learning_rate": 9.836313107920333e-07, "loss": 0.0002, "num_tokens": 48592005.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1768, "step_time": 19.679544236510992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 221.4375, "completions/mean_terminated_length": 221.4375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.24040383473038673, "epoch": 0.08193608151922187, "frac_reward_zero_std": 0.0, "grad_norm": 0.1213071346282959, "kl": 0.0017236094281543046, "learning_rate": 9.836220472440944e-07, "loss": 0.0285, "num_tokens": 48616652.0, "reward": 0.9858227372169495, "reward_std": 0.0304801557213068, "rewards/reward_func/mean": 0.9858227372169495, "rewards/reward_func/std": 0.030480151996016502, "step": 1769, "step_time": 21.561090268194675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 134.0, "completions/mean_terminated_length": 134.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.27792296558618546, "epoch": 0.08198239925891616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009946874342858791, "kl": 0.00113045732723549, "learning_rate": 9.836127836961557e-07, "loss": 0.0001, "num_tokens": 48638908.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1770, "step_time": 16.195545982569456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 203.25, "completions/mean_terminated_length": 203.25, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.39546895772218704, "epoch": 0.08202871699861047, "frac_reward_zero_std": 0.0, "grad_norm": 0.09161561727523804, "kl": 0.0020151612407062203, "learning_rate": 9.836035201482166e-07, "loss": -0.1318, "num_tokens": 48661648.0, "reward": 0.12096919119358063, "reward_std": 0.3307603895664215, "rewards/reward_func/mean": 0.12096919119358063, "rewards/reward_func/std": 0.3307604193687439, "step": 1771, "step_time": 23.496229242533445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 154.6875, "completions/mean_terminated_length": 154.6875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4003916531801224, "epoch": 0.08207503473830477, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014100077096372843, "kl": 0.0020715085265692323, "learning_rate": 9.835942566002778e-07, "loss": 0.0001, "num_tokens": 48693675.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1772, "step_time": 21.215330470353365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 150.75, "completions/mean_terminated_length": 150.75, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.29009925574064255, "epoch": 0.08212135247799908, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013742835726588964, "kl": 0.0011427075078245252, "learning_rate": 9.83584993052339e-07, "loss": 0.0001, "num_tokens": 48715879.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1773, "step_time": 15.747868739068508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 197.375, "completions/mean_terminated_length": 197.375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.4076923429965973, "epoch": 0.08216767021769338, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033543696627020836, "kl": 0.002331511495867744, "learning_rate": 9.835757295044002e-07, "loss": 0.0001, "num_tokens": 48739789.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1774, "step_time": 22.986718233674765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 199.125, "completions/mean_terminated_length": 199.125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.3451576381921768, "epoch": 0.08221398795738769, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031812279485166073, "kl": 0.002530711644794792, "learning_rate": 9.835664659564613e-07, "loss": 0.0001, "num_tokens": 48767231.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1775, "step_time": 26.703542787581682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 162.75, "completions/mean_terminated_length": 162.75, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.22918789088726044, "epoch": 0.08226030569708198, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011042740661650896, "kl": 0.0012245680554769933, "learning_rate": 9.835572024085225e-07, "loss": 0.0001, "num_tokens": 48792283.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 1776, "step_time": 17.573363177478313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 246.9375, "completions/mean_terminated_length": 246.9375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.2423914223909378, "epoch": 0.08230662343677629, "frac_reward_zero_std": 0.0, "grad_norm": 0.24949026107788086, "kl": 0.006801836425438523, "learning_rate": 9.835479388605836e-07, "loss": 0.0428, "num_tokens": 48827066.0, "reward": 0.8991250991821289, "reward_std": 0.23976711928844452, "rewards/reward_func/mean": 0.8991250991821289, "rewards/reward_func/std": 0.2397671341896057, "step": 1777, "step_time": 27.01763205602765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 239.75, "completions/mean_terminated_length": 239.75, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.2516292668879032, "epoch": 0.08235294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.002423672005534172, "kl": 0.0017647942004259676, "learning_rate": 9.835386753126447e-07, "loss": 0.0001, "num_tokens": 48865782.0, "reward": 0.7441932559013367, "reward_std": 0.0, "rewards/reward_func/mean": 0.7441932559013367, "rewards/reward_func/std": 0.0, "step": 1778, "step_time": 27.450808752328157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 218.375, "completions/mean_terminated_length": 218.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.36280398070812225, "epoch": 0.0823992589161649, "frac_reward_zero_std": 0.0, "grad_norm": 0.07675819844007492, "kl": 0.002921669220086187, "learning_rate": 9.835294117647058e-07, "loss": -0.0686, "num_tokens": 48888396.0, "reward": 0.446357399225235, "reward_std": 0.2671467959880829, "rewards/reward_func/mean": 0.446357399225235, "rewards/reward_func/std": 0.2671468257904053, "step": 1779, "step_time": 24.887188009917736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 197.5, "completions/mean_terminated_length": 197.5, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.38635627925395966, "epoch": 0.08244557665585919, "frac_reward_zero_std": 0.0, "grad_norm": 0.10332873463630676, "kl": 0.0033139188308268785, "learning_rate": 9.83520148216767e-07, "loss": -0.0341, "num_tokens": 48937924.0, "reward": 0.05635083466768265, "reward_std": 0.21836689114570618, "rewards/reward_func/mean": 0.05635083466768265, "rewards/reward_func/std": 0.21836690604686737, "step": 1780, "step_time": 30.988724403083324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 275.0, "completions/mean_terminated_length": 275.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.27067185193300247, "epoch": 0.0824918943955535, "frac_reward_zero_std": 0.0, "grad_norm": 0.07795015722513199, "kl": 0.0017696596623864025, "learning_rate": 9.83510884668828e-07, "loss": -0.0698, "num_tokens": 48967924.0, "reward": 0.1412212997674942, "reward_std": 0.13408060371875763, "rewards/reward_func/mean": 0.1412212997674942, "rewards/reward_func/std": 0.13408060371875763, "step": 1781, "step_time": 31.076837234199047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 190.0625, "completions/mean_terminated_length": 190.0625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.42776962369680405, "epoch": 0.0825382121352478, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020765860099345446, "kl": 0.001906985737150535, "learning_rate": 9.835016211208892e-07, "loss": 0.0001, "num_tokens": 49008053.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1782, "step_time": 22.713887855410576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.2731229066848755, "epoch": 0.08258452987494211, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010325806215405464, "kl": 0.0012288026919122785, "learning_rate": 9.834923575729503e-07, "loss": 0.0001, "num_tokens": 49041429.0, "reward": 0.8824968934059143, "reward_std": 0.0, "rewards/reward_func/mean": 0.8824968934059143, "rewards/reward_func/std": 0.0, "step": 1783, "step_time": 20.36423819512129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 192.9375, "completions/mean_terminated_length": 192.9375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.30894937366247177, "epoch": 0.0826308476146364, "frac_reward_zero_std": 0.0, "grad_norm": 0.11499016731977463, "kl": 0.0018300899828318506, "learning_rate": 9.834830940250115e-07, "loss": 0.132, "num_tokens": 49063396.0, "reward": 0.14704498648643494, "reward_std": 0.08768068253993988, "rewards/reward_func/mean": 0.14704498648643494, "rewards/reward_func/std": 0.08768068999052048, "step": 1784, "step_time": 20.808938808739185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 187.125, "completions/mean_terminated_length": 187.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.18482393398880959, "epoch": 0.08267716535433071, "frac_reward_zero_std": 0.0, "grad_norm": 0.17283470928668976, "kl": 0.0014046141877770424, "learning_rate": 9.834738304770726e-07, "loss": 0.0076, "num_tokens": 49085766.0, "reward": 0.9855483174324036, "reward_std": 0.0394895039498806, "rewards/reward_func/mean": 0.9855483174324036, "rewards/reward_func/std": 0.039489492774009705, "step": 1785, "step_time": 19.037325251847506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 120.125, "completions/mean_terminated_length": 120.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.23211238905787468, "epoch": 0.08272348309402501, "frac_reward_zero_std": 1.0, "grad_norm": 0.001915707252919674, "kl": 0.0012915268598590046, "learning_rate": 9.834645669291337e-07, "loss": 0.0001, "num_tokens": 49105096.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1786, "step_time": 13.047077864408493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 188.1875, "completions/mean_terminated_length": 188.1875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.17376741394400597, "epoch": 0.08276980083371932, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019552954472601414, "kl": 0.001579032134031877, "learning_rate": 9.83455303381195e-07, "loss": 0.0001, "num_tokens": 49129147.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 1787, "step_time": 19.296804752200842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 156.625, "completions/mean_terminated_length": 156.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.2800978422164917, "epoch": 0.08281611857341362, "frac_reward_zero_std": 1.0, "grad_norm": 0.005788644775748253, "kl": 0.0014217516873031855, "learning_rate": 9.834460398332562e-07, "loss": 0.0001, "num_tokens": 49163909.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1788, "step_time": 19.545057754963636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 121.3125, "completions/mean_terminated_length": 121.3125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.28535931557416916, "epoch": 0.08286243631310793, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034982829820364714, "kl": 0.002478871843777597, "learning_rate": 9.83436776285317e-07, "loss": 0.0001, "num_tokens": 49186074.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1789, "step_time": 14.213303998112679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 133.75, "completions/mean_terminated_length": 133.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3252570927143097, "epoch": 0.08290875405280222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023588044568896294, "kl": 0.0019274250080343336, "learning_rate": 9.834275127373784e-07, "loss": 0.0001, "num_tokens": 49210326.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1790, "step_time": 15.675796948373318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 119.25, "completions/mean_terminated_length": 119.25, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2171715386211872, "epoch": 0.08295507179249653, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025463069323450327, "kl": 0.0015180335321929306, "learning_rate": 9.834182491894396e-07, "loss": 0.0001, "num_tokens": 49229770.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1791, "step_time": 14.180842258036137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 184.0, "completions/mean_terminated_length": 184.0, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.398625910282135, "epoch": 0.08300138953219083, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008965880842879415, "kl": 0.0012904828181490302, "learning_rate": 9.834089856415007e-07, "loss": 0.0001, "num_tokens": 49257914.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1792, "step_time": 19.320599518716335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 139.5625, "completions/mean_terminated_length": 139.5625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3106895983219147, "epoch": 0.08304770727188514, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013497864129021764, "kl": 0.0011589547211769968, "learning_rate": 9.833997220935618e-07, "loss": 0.0001, "num_tokens": 49288179.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1793, "step_time": 16.620586711913347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 129.625, "completions/mean_terminated_length": 129.625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3716931715607643, "epoch": 0.08309402501157943, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017326937522739172, "kl": 0.0013969821447972208, "learning_rate": 9.83390458545623e-07, "loss": 0.0001, "num_tokens": 49309981.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1794, "step_time": 15.223750609904528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 109.5625, "completions/mean_terminated_length": 109.5625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.23468219861388206, "epoch": 0.08314034275127374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020196361001580954, "kl": 0.0012948633375344798, "learning_rate": 9.83381194997684e-07, "loss": 0.0001, "num_tokens": 49330166.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1795, "step_time": 12.192474193871021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 164.1875, "completions/mean_terminated_length": 164.1875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3503687083721161, "epoch": 0.08318666049096804, "frac_reward_zero_std": 0.0, "grad_norm": 0.12645870447158813, "kl": 0.002182353870011866, "learning_rate": 9.833719314497452e-07, "loss": 0.0114, "num_tokens": 49351945.0, "reward": 0.5515605807304382, "reward_std": 0.44124847650527954, "rewards/reward_func/mean": 0.5515605807304382, "rewards/reward_func/std": 0.44124844670295715, "step": 1796, "step_time": 16.90958084538579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 174.8125, "completions/mean_terminated_length": 174.8125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.3956194296479225, "epoch": 0.08323297823066235, "frac_reward_zero_std": 1.0, "grad_norm": 0.001963954418897629, "kl": 0.0018403020512778312, "learning_rate": 9.833626679018063e-07, "loss": 0.0001, "num_tokens": 49383446.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1797, "step_time": 20.44994032010436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 214.25, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.39586982131004333, "epoch": 0.08327929597035665, "frac_reward_zero_std": 0.0, "grad_norm": 0.10813822597265244, "kl": 0.0018154153076466173, "learning_rate": 9.833534043538674e-07, "loss": 0.0989, "num_tokens": 49418698.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 1798, "step_time": 28.728622019290924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 196.875, "completions/mean_terminated_length": 196.875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.42988988757133484, "epoch": 0.08332561371005096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023001835215836763, "kl": 0.0018978688749484718, "learning_rate": 9.833441408059286e-07, "loss": 0.0001, "num_tokens": 49444376.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1799, "step_time": 20.286078292876482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 121.3125, "completions/mean_terminated_length": 121.3125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.24906935542821884, "epoch": 0.08337193144974525, "frac_reward_zero_std": 1.0, "grad_norm": 0.005079444032162428, "kl": 0.0016205398133024573, "learning_rate": 9.833348772579899e-07, "loss": 0.0001, "num_tokens": 49464445.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1800, "step_time": 13.23555477336049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 149.875, "completions/mean_terminated_length": 149.875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.23199346661567688, "epoch": 0.08341824918943956, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012887542834505439, "kl": 0.0011629058717517182, "learning_rate": 9.833256137100508e-07, "loss": 0.0001, "num_tokens": 49491275.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 1801, "step_time": 17.76983129605651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 154.875, "completions/mean_terminated_length": 154.875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.20500538870692253, "epoch": 0.08346456692913386, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009976846631616354, "kl": 0.000767384612117894, "learning_rate": 9.83316350162112e-07, "loss": 0.0, "num_tokens": 49513049.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 1802, "step_time": 16.451257165521383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 218.875, "completions/mean_terminated_length": 218.875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.39179253578186035, "epoch": 0.08351088466882817, "frac_reward_zero_std": 0.0, "grad_norm": 0.10531281679868698, "kl": 0.0025745042366907, "learning_rate": 9.833070866141733e-07, "loss": 0.1, "num_tokens": 49543719.0, "reward": 0.6902567148208618, "reward_std": 0.4121498465538025, "rewards/reward_func/mean": 0.6902567148208618, "rewards/reward_func/std": 0.4121498465538025, "step": 1803, "step_time": 26.545985084027052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 165.5625, "completions/mean_terminated_length": 165.5625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.43202464282512665, "epoch": 0.08355720240852246, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007357713766396046, "kl": 0.0014624310715589672, "learning_rate": 9.832978230662344e-07, "loss": 0.0001, "num_tokens": 49580416.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1804, "step_time": 20.909253243356943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 140.3125, "completions/mean_terminated_length": 140.3125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.27327684313058853, "epoch": 0.08360352014821677, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028917582239955664, "kl": 0.001670055149588734, "learning_rate": 9.832885595182955e-07, "loss": 0.0001, "num_tokens": 49600805.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1805, "step_time": 14.909344926476479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 191.875, "completions/mean_terminated_length": 191.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.24407634884119034, "epoch": 0.08364983788791107, "frac_reward_zero_std": 0.0, "grad_norm": 0.08293452858924866, "kl": 0.0010697099642129615, "learning_rate": 9.832792959703566e-07, "loss": 0.0338, "num_tokens": 49622739.0, "reward": 0.9848532676696777, "reward_std": 0.02709529735147953, "rewards/reward_func/mean": 0.9848532676696777, "rewards/reward_func/std": 0.02709529921412468, "step": 1806, "step_time": 19.296129278838634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 196.125, "completions/mean_terminated_length": 196.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.28344016522169113, "epoch": 0.08369615562760538, "frac_reward_zero_std": 1.0, "grad_norm": 0.003905265359207988, "kl": 0.002277866209624335, "learning_rate": 9.832700324224178e-07, "loss": 0.0001, "num_tokens": 49644709.0, "reward": 0.26359713077545166, "reward_std": 0.0, "rewards/reward_func/mean": 0.26359713077545166, "rewards/reward_func/std": 0.0, "step": 1807, "step_time": 19.552649281919003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 167.125, "completions/mean_terminated_length": 167.125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.17779069393873215, "epoch": 0.08374247336729967, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017649864312261343, "kl": 0.001114297832828015, "learning_rate": 9.832607688744789e-07, "loss": 0.0001, "num_tokens": 49670839.0, "reward": 0.5468458533287048, "reward_std": 0.0, "rewards/reward_func/mean": 0.5468458533287048, "rewards/reward_func/std": 0.0, "step": 1808, "step_time": 18.213044803589582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 135.5625, "completions/mean_terminated_length": 135.5625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2523210905492306, "epoch": 0.08378879110699398, "frac_reward_zero_std": 0.0, "grad_norm": 0.15321528911590576, "kl": 0.002367090666666627, "learning_rate": 9.8325150532654e-07, "loss": -0.0064, "num_tokens": 49691536.0, "reward": 0.9750396013259888, "reward_std": 0.0536632239818573, "rewards/reward_func/mean": 0.9750396013259888, "rewards/reward_func/std": 0.0536632314324379, "step": 1809, "step_time": 16.44727297499776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 244.125, "completions/mean_terminated_length": 244.125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.43200092017650604, "epoch": 0.08383510884668828, "frac_reward_zero_std": 0.0, "grad_norm": 0.07732279598712921, "kl": 0.0023723530175630003, "learning_rate": 9.832422417786011e-07, "loss": -0.0417, "num_tokens": 49713954.0, "reward": 0.050845880061388016, "reward_std": 0.2019171267747879, "rewards/reward_func/mean": 0.050845880061388016, "rewards/reward_func/std": 0.2019171416759491, "step": 1810, "step_time": 35.61111123859882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 318.625, "completions/mean_terminated_length": 271.6000061035156, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.17735937237739563, "epoch": 0.08388142658638259, "frac_reward_zero_std": 0.0, "grad_norm": 0.07135229557752609, "kl": 0.00859279406722635, "learning_rate": 9.832329782306623e-07, "loss": 0.5534, "num_tokens": 49746076.0, "reward": 0.804602324962616, "reward_std": 0.21490788459777832, "rewards/reward_func/mean": 0.804602324962616, "rewards/reward_func/std": 0.21490789949893951, "step": 1811, "step_time": 78.7587280869484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 193.0625, "completions/mean_terminated_length": 193.0625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.19942530244588852, "epoch": 0.08392774432607689, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008088955073617399, "kl": 0.0008911041804822162, "learning_rate": 9.832237146827234e-07, "loss": 0.0, "num_tokens": 49771997.0, "reward": 0.7195215821266174, "reward_std": 0.0, "rewards/reward_func/mean": 0.7195215821266174, "rewards/reward_func/std": 0.0, "step": 1812, "step_time": 18.957622949033976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 180.4375, "completions/mean_terminated_length": 180.4375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.21608738601207733, "epoch": 0.0839740620657712, "frac_reward_zero_std": 0.0, "grad_norm": 0.09081307053565979, "kl": 0.0011739842884708196, "learning_rate": 9.832144511347847e-07, "loss": 0.0007, "num_tokens": 49794340.0, "reward": 0.9288549423217773, "reward_std": 0.02620236761868, "rewards/reward_func/mean": 0.9288549423217773, "rewards/reward_func/std": 0.026202375069260597, "step": 1813, "step_time": 18.37841545045376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 183.1875, "completions/mean_terminated_length": 183.1875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.23088476434350014, "epoch": 0.08402037980546549, "frac_reward_zero_std": 1.0, "grad_norm": 0.002391039626672864, "kl": 0.001544103433843702, "learning_rate": 9.832051875868456e-07, "loss": 0.0001, "num_tokens": 49817207.0, "reward": 0.9428731203079224, "reward_std": 0.0, "rewards/reward_func/mean": 0.9428731203079224, "rewards/reward_func/std": 0.0, "step": 1814, "step_time": 21.5418782196939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 122.6875, "completions/mean_terminated_length": 122.6875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.25327617675065994, "epoch": 0.0840666975451598, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012999364407733083, "kl": 0.0011234616104047745, "learning_rate": 9.831959240389068e-07, "loss": 0.0001, "num_tokens": 49836754.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1815, "step_time": 12.833026364445686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 210.4375, "completions/mean_terminated_length": 210.4375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.31768694519996643, "epoch": 0.0841130152848541, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025341857690364122, "kl": 0.001781856408342719, "learning_rate": 9.831866604909679e-07, "loss": 0.0001, "num_tokens": 49867865.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1816, "step_time": 23.116089649498463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 130.1875, "completions/mean_terminated_length": 130.1875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.33896253257989883, "epoch": 0.08415933302454841, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016798991709947586, "kl": 0.001647378463530913, "learning_rate": 9.831773969430292e-07, "loss": 0.0001, "num_tokens": 49889596.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1817, "step_time": 14.299078464508057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 303.9375, "completions/mean_terminated_length": 303.9375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.24887847900390625, "epoch": 0.0842056507642427, "frac_reward_zero_std": 0.0, "grad_norm": 0.05676943436264992, "kl": 0.0022791285591665655, "learning_rate": 9.831681333950904e-07, "loss": -0.0944, "num_tokens": 49930059.0, "reward": 0.8556371331214905, "reward_std": 0.33811211585998535, "rewards/reward_func/mean": 0.8556371331214905, "rewards/reward_func/std": 0.33811214566230774, "step": 1818, "step_time": 34.08150742575526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 154.875, "completions/mean_terminated_length": 154.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.19956690818071365, "epoch": 0.08425196850393701, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015095311682671309, "kl": 0.0009476547274971381, "learning_rate": 9.831588698471515e-07, "loss": 0.0, "num_tokens": 49955289.0, "reward": 0.9131007194519043, "reward_std": 0.0, "rewards/reward_func/mean": 0.9131007194519043, "rewards/reward_func/std": 0.0, "step": 1819, "step_time": 16.97852310538292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 136.625, "completions/mean_terminated_length": 136.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.20373685285449028, "epoch": 0.08429828624363131, "frac_reward_zero_std": 1.0, "grad_norm": 0.001349732163362205, "kl": 0.0008416222117375582, "learning_rate": 9.831496062992126e-07, "loss": 0.0, "num_tokens": 49975955.0, "reward": 0.8824968934059143, "reward_std": 0.0, "rewards/reward_func/mean": 0.8824968934059143, "rewards/reward_func/std": 0.0, "step": 1820, "step_time": 15.104047987610102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 167.0625, "completions/mean_terminated_length": 167.0625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.32213493436574936, "epoch": 0.08434460398332562, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021454528905451298, "kl": 0.0014302792551461607, "learning_rate": 9.831403427512737e-07, "loss": 0.0001, "num_tokens": 49999492.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1821, "step_time": 19.009277906268835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 140.3125, "completions/mean_terminated_length": 140.3125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2914526090025902, "epoch": 0.08439092172301992, "frac_reward_zero_std": 1.0, "grad_norm": 0.00561274541541934, "kl": 0.002711360401008278, "learning_rate": 9.831310792033349e-07, "loss": 0.0001, "num_tokens": 50022953.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1822, "step_time": 16.8443075530231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 174.0, "completions/mean_terminated_length": 174.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.2728447914123535, "epoch": 0.08443723946271423, "frac_reward_zero_std": 1.0, "grad_norm": 0.002339413855224848, "kl": 0.0017294083663728088, "learning_rate": 9.83121815655396e-07, "loss": 0.0001, "num_tokens": 50047817.0, "reward": 0.6339495778083801, "reward_std": 0.0, "rewards/reward_func/mean": 0.6339495778083801, "rewards/reward_func/std": 0.0, "step": 1823, "step_time": 18.795628257095814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 134.1875, "completions/mean_terminated_length": 134.1875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3065073862671852, "epoch": 0.08448355720240852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018273607129231095, "kl": 0.0014974308433011174, "learning_rate": 9.83112552107457e-07, "loss": 0.0001, "num_tokens": 50070364.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1824, "step_time": 14.868289861828089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 169.5625, "completions/mean_terminated_length": 169.5625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.43381329625844955, "epoch": 0.08452987494210283, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021011296194046736, "kl": 0.002246720192488283, "learning_rate": 9.831032885595182e-07, "loss": 0.0001, "num_tokens": 50109557.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1825, "step_time": 22.53801593557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 119.4375, "completions/mean_terminated_length": 119.4375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.28671035915613174, "epoch": 0.08457619268179713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020305654034018517, "kl": 0.0018016251851804554, "learning_rate": 9.830940250115794e-07, "loss": 0.0001, "num_tokens": 50130828.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1826, "step_time": 13.542615745216608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 132.5625, "completions/mean_terminated_length": 132.5625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.25577178224921227, "epoch": 0.08462251042149144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032878646161407232, "kl": 0.001347804325632751, "learning_rate": 9.830847614636405e-07, "loss": 0.0001, "num_tokens": 50156117.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1827, "step_time": 14.89745607599616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 203.5625, "completions/mean_terminated_length": 203.5625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.4408046752214432, "epoch": 0.08466882816118573, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015493660466745496, "kl": 0.0016584981349296868, "learning_rate": 9.830754979157016e-07, "loss": 0.0001, "num_tokens": 50182782.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1828, "step_time": 22.523534949868917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 182.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.4187343046069145, "epoch": 0.08471514590088004, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015243644593283534, "kl": 0.00160211167531088, "learning_rate": 9.830662343677627e-07, "loss": 0.0001, "num_tokens": 50215558.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1829, "step_time": 22.589669562876225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 132.875, "completions/mean_terminated_length": 132.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3287733271718025, "epoch": 0.08476146364057434, "frac_reward_zero_std": 1.0, "grad_norm": 0.002134723588824272, "kl": 0.0015161820920184255, "learning_rate": 9.83056970819824e-07, "loss": 0.0001, "num_tokens": 50248996.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1830, "step_time": 17.67187250033021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 138.25, "completions/mean_terminated_length": 138.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.21432293578982353, "epoch": 0.08480778138026865, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010669506154954433, "kl": 0.001128709947806783, "learning_rate": 9.830477072718852e-07, "loss": 0.0001, "num_tokens": 50274424.0, "reward": 0.815539538860321, "reward_std": 0.0, "rewards/reward_func/mean": 0.815539538860321, "rewards/reward_func/std": 0.0, "step": 1831, "step_time": 15.700519874691963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 129.375, "completions/mean_terminated_length": 129.375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.1663511097431183, "epoch": 0.08485409911996294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014463907573372126, "kl": 0.0009139027533819899, "learning_rate": 9.83038443723946e-07, "loss": 0.0, "num_tokens": 50295518.0, "reward": 0.904837429523468, "reward_std": 0.0, "rewards/reward_func/mean": 0.904837429523468, "rewards/reward_func/std": 0.0, "step": 1832, "step_time": 14.905838370323181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 158.0, "completions/mean_terminated_length": 158.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.356958732008934, "epoch": 0.08490041685965725, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013619877863675356, "kl": 0.0015395651280414313, "learning_rate": 9.830291801760074e-07, "loss": 0.0001, "num_tokens": 50315998.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1833, "step_time": 16.306883040815592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 160.8125, "completions/mean_terminated_length": 160.8125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.13665926083922386, "epoch": 0.08494673459935155, "frac_reward_zero_std": 0.0, "grad_norm": 0.10321447998285294, "kl": 0.0006671329028904438, "learning_rate": 9.830199166280686e-07, "loss": -0.0015, "num_tokens": 50348251.0, "reward": 0.8971847295761108, "reward_std": 0.040134936571121216, "rewards/reward_func/mean": 0.8971847295761108, "rewards/reward_func/std": 0.04013495147228241, "step": 1834, "step_time": 18.512251127511263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 137.0625, "completions/mean_terminated_length": 137.0625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.2695477083325386, "epoch": 0.08499305233904586, "frac_reward_zero_std": 1.0, "grad_norm": 0.001143561559729278, "kl": 0.0010611762409098446, "learning_rate": 9.830106530801297e-07, "loss": 0.0001, "num_tokens": 50370988.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1835, "step_time": 14.397206574678421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 194.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.1885693371295929, "epoch": 0.08503937007874016, "frac_reward_zero_std": 1.0, "grad_norm": 0.002070471178740263, "kl": 0.0013506181130651385, "learning_rate": 9.830013895321908e-07, "loss": 0.0001, "num_tokens": 50408556.0, "reward": 0.9091564416885376, "reward_std": 0.0, "rewards/reward_func/mean": 0.9091564416885376, "rewards/reward_func/std": 0.0, "step": 1836, "step_time": 23.369936358183622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 115.6875, "completions/mean_terminated_length": 115.6875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.20968084037303925, "epoch": 0.08508568781843447, "frac_reward_zero_std": 1.0, "grad_norm": 0.002231639577075839, "kl": 0.001264735299628228, "learning_rate": 9.82992125984252e-07, "loss": 0.0001, "num_tokens": 50428055.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1837, "step_time": 12.542578887194395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 127.75, "completions/mean_terminated_length": 127.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.27801789343357086, "epoch": 0.08513200555812876, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013946460094302893, "kl": 0.0012275190965738147, "learning_rate": 9.82982862436313e-07, "loss": 0.0001, "num_tokens": 50450131.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1838, "step_time": 14.32071528956294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 205.5625, "completions/mean_terminated_length": 205.5625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.2687208354473114, "epoch": 0.08517832329782307, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024656786117702723, "kl": 0.0018901612784247845, "learning_rate": 9.829735988883742e-07, "loss": 0.0001, "num_tokens": 50471852.0, "reward": 0.9574533700942993, "reward_std": 0.0, "rewards/reward_func/mean": 0.9574533700942993, "rewards/reward_func/std": 0.0, "step": 1839, "step_time": 19.37217130884528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 261.625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.28042303770780563, "epoch": 0.08522464103751737, "frac_reward_zero_std": 1.0, "grad_norm": 0.001550883986055851, "kl": 0.001388590142596513, "learning_rate": 9.829643353404353e-07, "loss": 0.0001, "num_tokens": 50500022.0, "reward": 0.8511605262756348, "reward_std": 0.0, "rewards/reward_func/mean": 0.8511605262756348, "rewards/reward_func/std": 0.0, "step": 1840, "step_time": 26.087206903845072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 165.625, "completions/mean_terminated_length": 165.625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.40595898032188416, "epoch": 0.08527095877721168, "frac_reward_zero_std": 0.0, "grad_norm": 0.12976838648319244, "kl": 0.0017887320136651397, "learning_rate": 9.829550717924964e-07, "loss": -0.1958, "num_tokens": 50535424.0, "reward": 0.014202741906046867, "reward_std": 0.03880927711725235, "rewards/reward_func/mean": 0.014202741906046867, "rewards/reward_func/std": 0.03880928084254265, "step": 1841, "step_time": 25.53868832066655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 346.0, "completions/mean_terminated_length": 346.0, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "entropy": 0.172083992511034, "epoch": 0.08531727651690597, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006974312709644437, "kl": 0.0009179186017718166, "learning_rate": 9.829458082445576e-07, "loss": 0.0, "num_tokens": 50564816.0, "reward": 0.9813933372497559, "reward_std": 0.0, "rewards/reward_func/mean": 0.9813933372497559, "rewards/reward_func/std": 0.0, "step": 1842, "step_time": 36.786931216716766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.19916684553027153, "epoch": 0.08536359425660028, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010334254475310445, "kl": 0.0009760814864421263, "learning_rate": 9.82936544696619e-07, "loss": 0.0, "num_tokens": 50587026.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1843, "step_time": 16.807015921920538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 174.875, "completions/mean_terminated_length": 174.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.4466772973537445, "epoch": 0.08540991199629458, "frac_reward_zero_std": 1.0, "grad_norm": 0.001840939512476325, "kl": 0.0020238936704117805, "learning_rate": 9.8292728114868e-07, "loss": 0.0001, "num_tokens": 50609648.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1844, "step_time": 18.286206517368555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 125.9375, "completions/mean_terminated_length": 125.9375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.276754766702652, "epoch": 0.08545622973598889, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015949427615851164, "kl": 0.00127955008065328, "learning_rate": 9.82918017600741e-07, "loss": 0.0001, "num_tokens": 50630399.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1845, "step_time": 14.9763044975698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 137.1875, "completions/mean_terminated_length": 137.1875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3048417717218399, "epoch": 0.08550254747568319, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011707213707268238, "kl": 0.0013620333629660308, "learning_rate": 9.82908754052802e-07, "loss": 0.0001, "num_tokens": 50652818.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1846, "step_time": 15.950649853795767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 156.875, "completions/mean_terminated_length": 156.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3699372261762619, "epoch": 0.0855488652153775, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014884648844599724, "kl": 0.001510350644821301, "learning_rate": 9.828994905048634e-07, "loss": 0.0001, "num_tokens": 50696720.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1847, "step_time": 22.3307176977396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 128.0625, "completions/mean_terminated_length": 128.0625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2906503230333328, "epoch": 0.08559518295507179, "frac_reward_zero_std": 1.0, "grad_norm": 0.007628771010786295, "kl": 0.0023074908240232617, "learning_rate": 9.828902269569245e-07, "loss": 0.0001, "num_tokens": 50716273.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1848, "step_time": 13.887910101562738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 115.0625, "completions/mean_terminated_length": 115.0625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.27160170674324036, "epoch": 0.0856415006947661, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013438882306218147, "kl": 0.001252780290087685, "learning_rate": 9.828809634089856e-07, "loss": 0.0001, "num_tokens": 50737218.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1849, "step_time": 13.047743078321218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 191.3125, "completions/mean_terminated_length": 191.3125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4259258583188057, "epoch": 0.0856878184344604, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011495755752548575, "kl": 0.001411606790497899, "learning_rate": 9.828716998610468e-07, "loss": 0.0001, "num_tokens": 50767639.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1850, "step_time": 23.015502981841564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 142.1875, "completions/mean_terminated_length": 142.1875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3776135966181755, "epoch": 0.08573413617415471, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032343179918825626, "kl": 0.0027543975738808513, "learning_rate": 9.82862436313108e-07, "loss": 0.0001, "num_tokens": 50821786.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1851, "step_time": 23.467766117304564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 154.5625, "completions/mean_terminated_length": 154.5625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3044525012373924, "epoch": 0.085780453913849, "frac_reward_zero_std": 0.0, "grad_norm": 0.10032334923744202, "kl": 0.002875492617022246, "learning_rate": 9.82853172765169e-07, "loss": 0.0337, "num_tokens": 50845203.0, "reward": 0.01100965216755867, "reward_std": 0.002935907104983926, "rewards/reward_func/mean": 0.01100965216755867, "rewards/reward_func/std": 0.002935907104983926, "step": 1852, "step_time": 16.856601383537054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 200.75, "completions/mean_terminated_length": 200.75, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3908821791410446, "epoch": 0.08582677165354331, "frac_reward_zero_std": 0.0, "grad_norm": 0.06613165140151978, "kl": 0.0034208440338261425, "learning_rate": 9.828439092172301e-07, "loss": 0.0073, "num_tokens": 50872767.0, "reward": 0.058713316917419434, "reward_std": 0.23485326766967773, "rewards/reward_func/mean": 0.058713316917419434, "rewards/reward_func/std": 0.23485328257083893, "step": 1853, "step_time": 22.533325608819723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 165.5, "completions/mean_terminated_length": 165.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.32102739065885544, "epoch": 0.08587308939323761, "frac_reward_zero_std": 0.0, "grad_norm": 0.10941273719072342, "kl": 0.0024127698270604014, "learning_rate": 9.828346456692913e-07, "loss": 0.0219, "num_tokens": 50895383.0, "reward": 0.8727484941482544, "reward_std": 0.23336170613765717, "rewards/reward_func/mean": 0.8727484941482544, "rewards/reward_func/std": 0.23336170613765717, "step": 1854, "step_time": 18.159001354128122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 151.9375, "completions/mean_terminated_length": 151.9375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.39959168434143066, "epoch": 0.08591940713293192, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016449117101728916, "kl": 0.0015864612068980932, "learning_rate": 9.828253821213524e-07, "loss": 0.0001, "num_tokens": 50926134.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1855, "step_time": 19.35863560438156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 133.375, "completions/mean_terminated_length": 133.375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3179458901286125, "epoch": 0.08596572487262621, "frac_reward_zero_std": 1.0, "grad_norm": 0.001405479502864182, "kl": 0.0012443141895346344, "learning_rate": 9.828161185734135e-07, "loss": 0.0001, "num_tokens": 50949100.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1856, "step_time": 15.826786436140537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 124.6875, "completions/mean_terminated_length": 124.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3403223231434822, "epoch": 0.08601204261232052, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024394665379077196, "kl": 0.0016991851734928787, "learning_rate": 9.828068550254746e-07, "loss": 0.0001, "num_tokens": 50982167.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1857, "step_time": 17.94947485253215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.397172674536705, "epoch": 0.08605836035201482, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017737135058268905, "kl": 0.0019284598529338837, "learning_rate": 9.827975914775358e-07, "loss": 0.0001, "num_tokens": 51023523.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1858, "step_time": 22.791668850928545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 185.125, "completions/mean_terminated_length": 185.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.3660402297973633, "epoch": 0.08610467809170913, "frac_reward_zero_std": 1.0, "grad_norm": 0.00417954009026289, "kl": 0.002777126559522003, "learning_rate": 9.82788327929597e-07, "loss": 0.0001, "num_tokens": 51054341.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1859, "step_time": 22.992428425699472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 178.875, "completions/mean_terminated_length": 178.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.35621071606874466, "epoch": 0.08615099583140343, "frac_reward_zero_std": 1.0, "grad_norm": 0.002203872427344322, "kl": 0.0016685849404893816, "learning_rate": 9.827790643816582e-07, "loss": 0.0001, "num_tokens": 51091475.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1860, "step_time": 22.436206620186567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 162.875, "completions/mean_terminated_length": 162.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.17797373235225677, "epoch": 0.08619731357109774, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014392499579116702, "kl": 0.0008662991021992639, "learning_rate": 9.827698008337194e-07, "loss": 0.0, "num_tokens": 51113137.0, "reward": 0.8464817404747009, "reward_std": 0.0, "rewards/reward_func/mean": 0.8464817404747009, "rewards/reward_func/std": 0.0, "step": 1861, "step_time": 17.213813357055187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 154.1875, "completions/mean_terminated_length": 154.1875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.37707026302814484, "epoch": 0.08624363131079203, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014551484491676092, "kl": 0.0017600449791643769, "learning_rate": 9.827605372857805e-07, "loss": 0.0001, "num_tokens": 51147428.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1862, "step_time": 19.349631626158953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 119.4375, "completions/mean_terminated_length": 119.4375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.27240651845932007, "epoch": 0.08628994905048634, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020565385930240154, "kl": 0.0012519661395344883, "learning_rate": 9.827512737378416e-07, "loss": 0.0001, "num_tokens": 51169275.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1863, "step_time": 13.438467107713223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 145.5625, "completions/mean_terminated_length": 145.5625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.24237633123993874, "epoch": 0.08633626679018064, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011182187590748072, "kl": 0.00101482545142062, "learning_rate": 9.827420101899027e-07, "loss": 0.0001, "num_tokens": 51189268.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1864, "step_time": 15.13896419852972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 119.125, "completions/mean_terminated_length": 119.125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2867928221821785, "epoch": 0.08638258452987495, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015448889462277293, "kl": 0.0012244154931977391, "learning_rate": 9.827327466419639e-07, "loss": 0.0001, "num_tokens": 51209190.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1865, "step_time": 12.89440918713808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 149.5625, "completions/mean_terminated_length": 149.5625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4366995394229889, "epoch": 0.08642890226956924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009921262972056866, "kl": 0.0016550397267565131, "learning_rate": 9.82723483094025e-07, "loss": 0.0001, "num_tokens": 51267759.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1866, "step_time": 25.876544449478388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 114.875, "completions/mean_terminated_length": 114.875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2834022119641304, "epoch": 0.08647522000926355, "frac_reward_zero_std": 1.0, "grad_norm": 0.001233416609466076, "kl": 0.0010649955947883427, "learning_rate": 9.827142195460861e-07, "loss": 0.0001, "num_tokens": 51288109.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1867, "step_time": 12.502612922340631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 226.25, "completions/mean_terminated_length": 226.25, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.3381047397851944, "epoch": 0.08652153774895785, "frac_reward_zero_std": 0.0, "grad_norm": 0.08101329952478409, "kl": 0.00206389746745117, "learning_rate": 9.827049559981472e-07, "loss": 0.0059, "num_tokens": 51326033.0, "reward": 0.6728014945983887, "reward_std": 0.3186631500720978, "rewards/reward_func/mean": 0.6728014945983887, "rewards/reward_func/std": 0.3186631500720978, "step": 1868, "step_time": 25.663867883384228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 247.125, "completions/mean_terminated_length": 247.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.2553657218813896, "epoch": 0.08656785548865216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012497804127633572, "kl": 0.0013065116945654154, "learning_rate": 9.826956924502084e-07, "loss": 0.0001, "num_tokens": 51356083.0, "reward": 0.9381646513938904, "reward_std": 0.0, "rewards/reward_func/mean": 0.9381646513938904, "rewards/reward_func/std": 0.0, "step": 1869, "step_time": 25.252639766782522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 151.0, "completions/mean_terminated_length": 151.0, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.4294079840183258, "epoch": 0.08661417322834646, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015133166452869773, "kl": 0.0019314551609568298, "learning_rate": 9.826864289022695e-07, "loss": 0.0001, "num_tokens": 51400691.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1870, "step_time": 21.71081743016839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 244.1875, "completions/mean_terminated_length": 244.1875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.368765652179718, "epoch": 0.08666049096804077, "frac_reward_zero_std": 0.0, "grad_norm": 0.08822564780712128, "kl": 0.00198890981846489, "learning_rate": 9.826771653543306e-07, "loss": -0.1854, "num_tokens": 51440966.0, "reward": 0.12633299827575684, "reward_std": 0.22599133849143982, "rewards/reward_func/mean": 0.12633299827575684, "rewards/reward_func/std": 0.22599133849143982, "step": 1871, "step_time": 31.792429622262716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 191.0625, "completions/mean_terminated_length": 191.0625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.46957267075777054, "epoch": 0.08670680870773506, "frac_reward_zero_std": 1.0, "grad_norm": 0.003613095497712493, "kl": 0.00231377431191504, "learning_rate": 9.826679018063917e-07, "loss": 0.0001, "num_tokens": 51467159.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1872, "step_time": 21.5230940207839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 205.125, "completions/mean_terminated_length": 205.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.15072263777256012, "epoch": 0.08675312644742937, "frac_reward_zero_std": 0.0, "grad_norm": 0.06673221290111542, "kl": 0.0023850490106269717, "learning_rate": 9.82658638258453e-07, "loss": -0.0581, "num_tokens": 51498025.0, "reward": 0.9263890385627747, "reward_std": 0.20114344358444214, "rewards/reward_func/mean": 0.9263890385627747, "rewards/reward_func/std": 0.20114347338676453, "step": 1873, "step_time": 21.987693183124065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 155.0625, "completions/mean_terminated_length": 155.0625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3341098949313164, "epoch": 0.08679944418712367, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017773177241906524, "kl": 0.001814171380829066, "learning_rate": 9.826493747105142e-07, "loss": 0.0001, "num_tokens": 51524970.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1874, "step_time": 17.683572709560394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 195.6875, "completions/mean_terminated_length": 195.6875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.18182585015892982, "epoch": 0.08684576192681798, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016797479474917054, "kl": 0.0011587960179895163, "learning_rate": 9.82640111162575e-07, "loss": 0.0001, "num_tokens": 51549061.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1875, "step_time": 19.15440022945404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 141.9375, "completions/mean_terminated_length": 141.9375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.39947322756052017, "epoch": 0.08689207966651227, "frac_reward_zero_std": 1.0, "grad_norm": 0.001396923209540546, "kl": 0.0016080295317806304, "learning_rate": 9.826308476146362e-07, "loss": 0.0001, "num_tokens": 51591284.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1876, "step_time": 19.90982525423169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 174.5625, "completions/mean_terminated_length": 174.5625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3752344325184822, "epoch": 0.08693839740620658, "frac_reward_zero_std": 1.0, "grad_norm": 0.001198168029077351, "kl": 0.0013058011536486447, "learning_rate": 9.826215840666976e-07, "loss": 0.0001, "num_tokens": 51629037.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1877, "step_time": 22.050659473985434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 177.5, "completions/mean_terminated_length": 177.5, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.3675825148820877, "epoch": 0.08698471514590088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015190679114311934, "kl": 0.0016339565336238593, "learning_rate": 9.826123205187587e-07, "loss": 0.0001, "num_tokens": 51679925.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1878, "step_time": 24.92166867107153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 174.25, "completions/mean_terminated_length": 174.25, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.28735554218292236, "epoch": 0.08703103288559519, "frac_reward_zero_std": 0.0, "grad_norm": 0.11670393496751785, "kl": 0.0024552375252824277, "learning_rate": 9.826030569708198e-07, "loss": -0.0396, "num_tokens": 51704297.0, "reward": 0.6030696630477905, "reward_std": 0.3176274597644806, "rewards/reward_func/mean": 0.6030696630477905, "rewards/reward_func/std": 0.3176274597644806, "step": 1879, "step_time": 18.444391392171383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 117.5625, "completions/mean_terminated_length": 117.5625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3189704567193985, "epoch": 0.08707735062528948, "frac_reward_zero_std": 1.0, "grad_norm": 0.002227914985269308, "kl": 0.002039935643551871, "learning_rate": 9.82593793422881e-07, "loss": 0.0001, "num_tokens": 51726002.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1880, "step_time": 13.512067291885614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 130.75, "completions/mean_terminated_length": 130.75, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.23120297119021416, "epoch": 0.0871236683649838, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009425807511433959, "kl": 0.001209792186273262, "learning_rate": 9.82584529874942e-07, "loss": 0.0001, "num_tokens": 51745678.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1881, "step_time": 14.72817013412714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 211.0, "completions/mean_terminated_length": 211.0, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.1960393637418747, "epoch": 0.08716998610467809, "frac_reward_zero_std": 0.0, "grad_norm": 0.08713279664516449, "kl": 0.0014232159592211246, "learning_rate": 9.825752663270032e-07, "loss": 0.009, "num_tokens": 51771950.0, "reward": 0.9066835641860962, "reward_std": 0.11560893803834915, "rewards/reward_func/mean": 0.9066835641860962, "rewards/reward_func/std": 0.11560893803834915, "step": 1882, "step_time": 21.761763382703066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 225.75, "completions/mean_terminated_length": 225.75, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.18618855625391006, "epoch": 0.0872163038443724, "frac_reward_zero_std": 0.0, "grad_norm": 0.08309055864810944, "kl": 0.0009894871473079547, "learning_rate": 9.825660027790643e-07, "loss": -0.0953, "num_tokens": 51797306.0, "reward": 0.5495471954345703, "reward_std": 0.07476285099983215, "rewards/reward_func/mean": 0.5495471954345703, "rewards/reward_func/std": 0.07476283609867096, "step": 1883, "step_time": 27.02977531775832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 182.75, "completions/mean_terminated_length": 182.75, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.2454192265868187, "epoch": 0.0872626215840667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016377604333683848, "kl": 0.001226855645654723, "learning_rate": 9.825567392311254e-07, "loss": 0.0001, "num_tokens": 51822870.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1884, "step_time": 19.167727533727884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 193.875, "completions/mean_terminated_length": 193.875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.2673642858862877, "epoch": 0.087308939323761, "frac_reward_zero_std": 0.0, "grad_norm": 0.08692874759435654, "kl": 0.0021311097079887986, "learning_rate": 9.825474756831866e-07, "loss": 0.0604, "num_tokens": 51846660.0, "reward": 0.8378630876541138, "reward_std": 0.16086310148239136, "rewards/reward_func/mean": 0.8378630876541138, "rewards/reward_func/std": 0.16086310148239136, "step": 1885, "step_time": 22.522730112075806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 190.3125, "completions/mean_terminated_length": 190.3125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.38384315371513367, "epoch": 0.0873552570634553, "frac_reward_zero_std": 0.0, "grad_norm": 0.08401807397603989, "kl": 0.0019377712160348892, "learning_rate": 9.825382121352477e-07, "loss": -0.0218, "num_tokens": 51879145.0, "reward": 0.0322435162961483, "reward_std": 0.1289740651845932, "rewards/reward_func/mean": 0.0322435162961483, "rewards/reward_func/std": 0.1289740651845932, "step": 1886, "step_time": 21.635551754385233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 174.4375, "completions/mean_terminated_length": 174.4375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3883207440376282, "epoch": 0.08740157480314961, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027886752504855394, "kl": 0.001899563183542341, "learning_rate": 9.82528948587309e-07, "loss": 0.0001, "num_tokens": 51913808.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1887, "step_time": 21.102830704301596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 138.875, "completions/mean_terminated_length": 138.875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.288997083902359, "epoch": 0.08744789254284391, "frac_reward_zero_std": 1.0, "grad_norm": 0.002589646028354764, "kl": 0.0016991238808259368, "learning_rate": 9.8251968503937e-07, "loss": 0.0001, "num_tokens": 51936622.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1888, "step_time": 16.819790691137314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 138.875, "completions/mean_terminated_length": 138.875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2332070954144001, "epoch": 0.08749421028253822, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015839972766116261, "kl": 0.0013473039434757084, "learning_rate": 9.82510421491431e-07, "loss": 0.0001, "num_tokens": 51956380.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1889, "step_time": 15.471843719482422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 114.5, "completions/mean_terminated_length": 114.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.25283853337168694, "epoch": 0.08754052802223251, "frac_reward_zero_std": 1.0, "grad_norm": 0.002629344817250967, "kl": 0.0012974267883691937, "learning_rate": 9.825011579434924e-07, "loss": 0.0001, "num_tokens": 51975812.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1890, "step_time": 12.317603267729282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 224.25, "completions/mean_terminated_length": 224.25, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.30954907089471817, "epoch": 0.08758684576192682, "frac_reward_zero_std": 0.0, "grad_norm": 0.09644221514463425, "kl": 0.001653652056120336, "learning_rate": 9.824918943955535e-07, "loss": 0.0767, "num_tokens": 52005224.0, "reward": 0.913185715675354, "reward_std": 0.18401484191417694, "rewards/reward_func/mean": 0.913185715675354, "rewards/reward_func/std": 0.18401482701301575, "step": 1891, "step_time": 26.022328063845634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 221.4375, "completions/mean_terminated_length": 221.4375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.16434165090322495, "epoch": 0.08763316350162112, "frac_reward_zero_std": 1.0, "grad_norm": 0.004602053668349981, "kl": 0.00152591013466008, "learning_rate": 9.824826308476147e-07, "loss": 0.0001, "num_tokens": 52043983.0, "reward": 0.9793821573257446, "reward_std": 0.0, "rewards/reward_func/mean": 0.9793821573257446, "rewards/reward_func/std": 0.0, "step": 1892, "step_time": 24.3629249073565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 171.4375, "completions/mean_terminated_length": 171.4375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.20935287326574326, "epoch": 0.08767948124131543, "frac_reward_zero_std": 0.0, "grad_norm": 0.089290551841259, "kl": 0.0008470645552733913, "learning_rate": 9.824733672996758e-07, "loss": 0.0032, "num_tokens": 52088694.0, "reward": 0.9019652009010315, "reward_std": 0.06826267391443253, "rewards/reward_func/mean": 0.9019652009010315, "rewards/reward_func/std": 0.06826266646385193, "step": 1893, "step_time": 22.164428021758795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 212.25, "completions/mean_terminated_length": 212.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.4360707327723503, "epoch": 0.08772579898100973, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016040436457842588, "kl": 0.0023941476247273386, "learning_rate": 9.82464103751737e-07, "loss": 0.0001, "num_tokens": 52137178.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1894, "step_time": 26.880092922598124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 129.625, "completions/mean_terminated_length": 129.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.30159368366003036, "epoch": 0.08777211672070404, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015431154752150178, "kl": 0.0015250979049596936, "learning_rate": 9.82454840203798e-07, "loss": 0.0001, "num_tokens": 52157956.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1895, "step_time": 15.323430735617876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 223.0, "completions/mean_terminated_length": 223.0, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.35118141025304794, "epoch": 0.08781843446039833, "frac_reward_zero_std": 0.0, "grad_norm": 0.0921139344573021, "kl": 0.002168814418837428, "learning_rate": 9.824455766558592e-07, "loss": -0.0817, "num_tokens": 52182068.0, "reward": 0.0625, "reward_std": 0.25, "rewards/reward_func/mean": 0.0625, "rewards/reward_func/std": 0.25, "step": 1896, "step_time": 25.05549620091915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 192.9375, "completions/mean_terminated_length": 192.9375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.4070845916867256, "epoch": 0.08786475220009264, "frac_reward_zero_std": 0.0, "grad_norm": 0.10707610100507736, "kl": 0.0032461085938848555, "learning_rate": 9.824363131079203e-07, "loss": 0.0357, "num_tokens": 52204435.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 1897, "step_time": 19.98173614963889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 159.4375, "completions/mean_terminated_length": 159.4375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.26376891881227493, "epoch": 0.08791106993978694, "frac_reward_zero_std": 0.0, "grad_norm": 0.18185175955295563, "kl": 0.0015802593261469156, "learning_rate": 9.824270495599814e-07, "loss": -0.0206, "num_tokens": 52227754.0, "reward": 0.9534019827842712, "reward_std": 0.10018271207809448, "rewards/reward_func/mean": 0.9534019827842712, "rewards/reward_func/std": 0.10018270462751389, "step": 1898, "step_time": 18.17505842074752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 248.1875, "completions/mean_terminated_length": 248.1875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.24741164222359657, "epoch": 0.08795738767948125, "frac_reward_zero_std": 0.0, "grad_norm": 0.10063128918409348, "kl": 0.001562878693221137, "learning_rate": 9.824177860120425e-07, "loss": -0.1509, "num_tokens": 52267965.0, "reward": 0.3122842609882355, "reward_std": 0.25194114446640015, "rewards/reward_func/mean": 0.3122842609882355, "rewards/reward_func/std": 0.25194114446640015, "step": 1899, "step_time": 30.92778167501092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 170.4375, "completions/mean_terminated_length": 170.4375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.36519360542297363, "epoch": 0.08800370541917554, "frac_reward_zero_std": 0.0, "grad_norm": 0.1244279220700264, "kl": 0.003008928440976888, "learning_rate": 9.824085224641037e-07, "loss": -0.1722, "num_tokens": 52290932.0, "reward": 0.051292113959789276, "reward_std": 0.2051684558391571, "rewards/reward_func/mean": 0.051292113959789276, "rewards/reward_func/std": 0.2051684558391571, "step": 1900, "step_time": 24.453903168439865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 131.4375, "completions/mean_terminated_length": 131.4375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.24329349771142006, "epoch": 0.08805002315886985, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010272827930748463, "kl": 0.0010608525190036744, "learning_rate": 9.823992589161648e-07, "loss": 0.0001, "num_tokens": 52314043.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1901, "step_time": 14.853858038783073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 135.0625, "completions/mean_terminated_length": 135.0625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.31072814017534256, "epoch": 0.08809634089856415, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011184071190655231, "kl": 0.0013472106656990945, "learning_rate": 9.82389995368226e-07, "loss": 0.0001, "num_tokens": 52339868.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1902, "step_time": 16.587781220674515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 248.1875, "completions/mean_terminated_length": 248.1875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.22742655128240585, "epoch": 0.08814265863825846, "frac_reward_zero_std": 1.0, "grad_norm": 0.0039713154546916485, "kl": 0.0021700568904634565, "learning_rate": 9.823807318202872e-07, "loss": 0.0001, "num_tokens": 52363871.0, "reward": 0.8531438708305359, "reward_std": 0.0, "rewards/reward_func/mean": 0.8531438708305359, "rewards/reward_func/std": 0.0, "step": 1903, "step_time": 23.454675372689962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.32779307663440704, "epoch": 0.08818897637795275, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012335549108684063, "kl": 0.001041513794916682, "learning_rate": 9.823714682723484e-07, "loss": 0.0001, "num_tokens": 52399965.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1904, "step_time": 19.586184982210398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 192.6875, "completions/mean_terminated_length": 192.6875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.40551455318927765, "epoch": 0.08823529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022247496526688337, "kl": 0.00166975031606853, "learning_rate": 9.823622047244095e-07, "loss": 0.0001, "num_tokens": 52428360.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1905, "step_time": 21.36198526248336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 189.9375, "completions/mean_terminated_length": 189.9375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.3530169874429703, "epoch": 0.08828161185734136, "frac_reward_zero_std": 1.0, "grad_norm": 0.002043097745627165, "kl": 0.0015477327397093177, "learning_rate": 9.823529411764704e-07, "loss": 0.0001, "num_tokens": 52457927.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1906, "step_time": 20.78644258901477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 135.0, "completions/mean_terminated_length": 135.0, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.27532320097088814, "epoch": 0.08832792959703567, "frac_reward_zero_std": 1.0, "grad_norm": 0.013013739138841629, "kl": 0.0031569500570185483, "learning_rate": 9.823436776285317e-07, "loss": 0.0002, "num_tokens": 52478743.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1907, "step_time": 15.326287671923637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 161.3125, "completions/mean_terminated_length": 161.3125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3997081443667412, "epoch": 0.08837424733672997, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022086037788540125, "kl": 0.0018891174113377929, "learning_rate": 9.823344140805929e-07, "loss": 0.0001, "num_tokens": 52501484.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1908, "step_time": 17.333568029105663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 137.75, "completions/mean_terminated_length": 137.75, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2683362513780594, "epoch": 0.08842056507642428, "frac_reward_zero_std": 1.0, "grad_norm": 0.005174525082111359, "kl": 0.0018115075945388526, "learning_rate": 9.82325150532654e-07, "loss": 0.0001, "num_tokens": 52521368.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1909, "step_time": 15.31071873754263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 176.6875, "completions/mean_terminated_length": 176.6875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.2102694883942604, "epoch": 0.08846688281611857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028989873826503754, "kl": 0.0015588774112984538, "learning_rate": 9.823158869847151e-07, "loss": 0.0001, "num_tokens": 52550115.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1910, "step_time": 19.70367395877838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 114.5, "completions/mean_terminated_length": 114.5, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.24982880800962448, "epoch": 0.08851320055581288, "frac_reward_zero_std": 1.0, "grad_norm": 0.001907306956127286, "kl": 0.0017034209740813822, "learning_rate": 9.823066234367762e-07, "loss": 0.0001, "num_tokens": 52569371.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1911, "step_time": 13.268001470714808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 154.9375, "completions/mean_terminated_length": 154.9375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.17168958857655525, "epoch": 0.08855951829550718, "frac_reward_zero_std": 0.0, "grad_norm": 0.12762132287025452, "kl": 0.006979975383728743, "learning_rate": 9.822973598888374e-07, "loss": -0.0495, "num_tokens": 52589898.0, "reward": 0.597527027130127, "reward_std": 0.2399885058403015, "rewards/reward_func/mean": 0.597527027130127, "rewards/reward_func/std": 0.2399885058403015, "step": 1912, "step_time": 15.9775386787951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 203.5625, "completions/mean_terminated_length": 203.5625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.19557374715805054, "epoch": 0.08860583603520149, "frac_reward_zero_std": 1.0, "grad_norm": 0.001692920457571745, "kl": 0.0012704057735390961, "learning_rate": 9.822880963408985e-07, "loss": 0.0001, "num_tokens": 52613491.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1913, "step_time": 20.589644316583872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 166.3125, "completions/mean_terminated_length": 166.3125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.3764125630259514, "epoch": 0.08865215377489578, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025116396136581898, "kl": 0.0018544631893746555, "learning_rate": 9.822788327929596e-07, "loss": 0.0001, "num_tokens": 52650296.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1914, "step_time": 21.7999594733119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 179.0, "completions/mean_terminated_length": 179.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.22333737090229988, "epoch": 0.0886984715145901, "frac_reward_zero_std": 0.0, "grad_norm": 0.09571599215269089, "kl": 0.00227964308578521, "learning_rate": 9.822695692450207e-07, "loss": 0.0318, "num_tokens": 52671096.0, "reward": 0.9386385679244995, "reward_std": 0.06337377429008484, "rewards/reward_func/mean": 0.9386385679244995, "rewards/reward_func/std": 0.06337378919124603, "step": 1915, "step_time": 18.661055110394955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.22838079929351807, "epoch": 0.08874478925428439, "frac_reward_zero_std": 1.0, "grad_norm": 0.000901050167158246, "kl": 0.001069504301995039, "learning_rate": 9.822603056970819e-07, "loss": 0.0001, "num_tokens": 52692876.0, "reward": 0.951229453086853, "reward_std": 0.0, "rewards/reward_func/mean": 0.951229453086853, "rewards/reward_func/std": 0.0, "step": 1916, "step_time": 19.146745320409536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 141.5625, "completions/mean_terminated_length": 141.5625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.35518868267536163, "epoch": 0.0887911069939787, "frac_reward_zero_std": 1.0, "grad_norm": 0.001423563458956778, "kl": 0.0015517770953010768, "learning_rate": 9.822510421491432e-07, "loss": 0.0001, "num_tokens": 52726997.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1917, "step_time": 18.057892087846994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 132.4375, "completions/mean_terminated_length": 132.4375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.24251199886202812, "epoch": 0.088837424733673, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012752824695780873, "kl": 0.0012560687900986522, "learning_rate": 9.822417786012041e-07, "loss": 0.0001, "num_tokens": 52748476.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1918, "step_time": 15.007533088326454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 113.625, "completions/mean_terminated_length": 113.625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2651001140475273, "epoch": 0.0888837424733673, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037908037193119526, "kl": 0.002068093483103439, "learning_rate": 9.822325150532652e-07, "loss": 0.0001, "num_tokens": 52767878.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1919, "step_time": 12.267887149006128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 175.875, "completions/mean_terminated_length": 175.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.19665367901325226, "epoch": 0.0889300602130616, "frac_reward_zero_std": 0.0, "grad_norm": 0.1086895689368248, "kl": 0.002230938378488645, "learning_rate": 9.822232515053266e-07, "loss": -0.0204, "num_tokens": 52790884.0, "reward": 0.9019302129745483, "reward_std": 0.03052354045212269, "rewards/reward_func/mean": 0.9019302129745483, "rewards/reward_func/std": 0.03052353300154209, "step": 1920, "step_time": 17.681687232106924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 181.0, "completions/mean_terminated_length": 181.0, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.410856731235981, "epoch": 0.08897637795275591, "frac_reward_zero_std": 1.0, "grad_norm": 0.00221324828453362, "kl": 0.0020306374062784016, "learning_rate": 9.822139879573877e-07, "loss": 0.0001, "num_tokens": 52812244.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1921, "step_time": 18.97685321420431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 179.125, "completions/mean_terminated_length": 179.125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3178200200200081, "epoch": 0.08902269569245021, "frac_reward_zero_std": 0.0, "grad_norm": 0.18764638900756836, "kl": 0.002159391442546621, "learning_rate": 9.822047244094488e-07, "loss": -0.0269, "num_tokens": 52840230.0, "reward": 0.050670865923166275, "reward_std": 0.0035351975820958614, "rewards/reward_func/mean": 0.050670865923166275, "rewards/reward_func/std": 0.0035351980477571487, "step": 1922, "step_time": 19.952056918293238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 203.5625, "completions/mean_terminated_length": 203.5625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.3301963210105896, "epoch": 0.08906901343214452, "frac_reward_zero_std": 0.0, "grad_norm": 0.11845042556524277, "kl": 0.003240281599573791, "learning_rate": 9.8219546086151e-07, "loss": -0.2876, "num_tokens": 52863231.0, "reward": 0.011592301540076733, "reward_std": 0.024922698736190796, "rewards/reward_func/mean": 0.011592301540076733, "rewards/reward_func/std": 0.024922700598835945, "step": 1923, "step_time": 28.92820466682315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 170.625, "completions/mean_terminated_length": 170.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.42731931060552597, "epoch": 0.08911533117183881, "frac_reward_zero_std": 1.0, "grad_norm": 0.002207790268585086, "kl": 0.0017887434223666787, "learning_rate": 9.82186197313571e-07, "loss": 0.0001, "num_tokens": 52884665.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1924, "step_time": 18.699849113821983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 337.375, "completions/mean_terminated_length": 337.375, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "entropy": 0.14161300286650658, "epoch": 0.08916164891153312, "frac_reward_zero_std": 0.0, "grad_norm": 0.05392554774880409, "kl": 0.0006552610284416005, "learning_rate": 9.821769337656322e-07, "loss": 0.0032, "num_tokens": 52927935.0, "reward": 0.28361839056015015, "reward_std": 0.012069177813827991, "rewards/reward_func/mean": 0.28361839056015015, "rewards/reward_func/std": 0.012069173157215118, "step": 1925, "step_time": 35.5646690428257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 333.1875, "completions/mean_terminated_length": 333.1875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.19576743617653847, "epoch": 0.08920796665122742, "frac_reward_zero_std": 0.0, "grad_norm": 0.05160801112651825, "kl": 0.0010797584836836904, "learning_rate": 9.821676702176933e-07, "loss": -0.0007, "num_tokens": 52969106.0, "reward": 0.9488431215286255, "reward_std": 0.06820911169052124, "rewards/reward_func/mean": 0.9488431215286255, "rewards/reward_func/std": 0.06820911169052124, "step": 1926, "step_time": 33.76814239099622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 312.9375, "completions/mean_terminated_length": 312.9375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "entropy": 0.27120207250118256, "epoch": 0.08925428439092173, "frac_reward_zero_std": 0.0, "grad_norm": 0.07882967591285706, "kl": 0.0014733055722899735, "learning_rate": 9.821584066697544e-07, "loss": 0.0231, "num_tokens": 53009457.0, "reward": 0.7049738764762878, "reward_std": 0.3606720566749573, "rewards/reward_func/mean": 0.7049738764762878, "rewards/reward_func/std": 0.36067211627960205, "step": 1927, "step_time": 32.67887997999787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 157.875, "completions/mean_terminated_length": 157.875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.2225835844874382, "epoch": 0.08930060213061602, "frac_reward_zero_std": 0.0, "grad_norm": 0.13789547979831696, "kl": 0.00188472552690655, "learning_rate": 9.821491431218156e-07, "loss": -0.0418, "num_tokens": 53034111.0, "reward": 0.4438591003417969, "reward_std": 0.030712326988577843, "rewards/reward_func/mean": 0.4438591003417969, "rewards/reward_func/std": 0.030712325125932693, "step": 1928, "step_time": 16.75002347677946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 179.25, "completions/mean_terminated_length": 179.25, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.25280067324638367, "epoch": 0.08934691987031033, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014815613394603133, "kl": 0.0012445047905202955, "learning_rate": 9.821398795738767e-07, "loss": 0.0001, "num_tokens": 53057283.0, "reward": 0.951229453086853, "reward_std": 0.0, "rewards/reward_func/mean": 0.951229453086853, "rewards/reward_func/std": 0.0, "step": 1929, "step_time": 21.23060030862689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 175.9375, "completions/mean_terminated_length": 175.9375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.21397727355360985, "epoch": 0.08939323761000463, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034141899086534977, "kl": 0.0017794068262446672, "learning_rate": 9.82130616025938e-07, "loss": 0.0001, "num_tokens": 53080754.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1930, "step_time": 18.86830211430788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 226.875, "completions/mean_terminated_length": 226.875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.3228232190012932, "epoch": 0.08943955534969894, "frac_reward_zero_std": 0.0, "grad_norm": 0.09571702033281326, "kl": 0.003071622224524617, "learning_rate": 9.82121352477999e-07, "loss": -0.0601, "num_tokens": 53104832.0, "reward": 0.9278337955474854, "reward_std": 0.19719551503658295, "rewards/reward_func/mean": 0.9278337955474854, "rewards/reward_func/std": 0.19719550013542175, "step": 1931, "step_time": 22.584304578602314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 144.9375, "completions/mean_terminated_length": 144.9375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.30679020285606384, "epoch": 0.08948587308939324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016486950917169452, "kl": 0.0012764577113557607, "learning_rate": 9.8211208893006e-07, "loss": 0.0001, "num_tokens": 53127343.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1932, "step_time": 15.787466999143362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 193.125, "completions/mean_terminated_length": 193.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.2917402759194374, "epoch": 0.08953219082908755, "frac_reward_zero_std": 0.0, "grad_norm": 0.09777598083019257, "kl": 0.0016675287624821067, "learning_rate": 9.821028253821214e-07, "loss": -0.0091, "num_tokens": 53152129.0, "reward": 0.5691647529602051, "reward_std": 0.07495664060115814, "rewards/reward_func/mean": 0.5691647529602051, "rewards/reward_func/std": 0.07495662569999695, "step": 1933, "step_time": 20.581589695066214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 192.3125, "completions/mean_terminated_length": 192.3125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.3954869881272316, "epoch": 0.08957850856878184, "frac_reward_zero_std": 0.0, "grad_norm": 0.0983789712190628, "kl": 0.002569092670455575, "learning_rate": 9.820935618341825e-07, "loss": -0.1034, "num_tokens": 53175238.0, "reward": 0.05313796177506447, "reward_std": 0.21255184710025787, "rewards/reward_func/mean": 0.05313796177506447, "rewards/reward_func/std": 0.21255184710025787, "step": 1934, "step_time": 24.094012692570686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 165.75, "completions/mean_terminated_length": 165.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.159866314381361, "epoch": 0.08962482630847615, "frac_reward_zero_std": 0.0, "grad_norm": 0.08072976022958755, "kl": 0.001008115039439872, "learning_rate": 9.820842982862437e-07, "loss": 0.0209, "num_tokens": 53200530.0, "reward": 0.8739733695983887, "reward_std": 0.0009956677677109838, "rewards/reward_func/mean": 0.8739733695983887, "rewards/reward_func/std": 0.0009956508874893188, "step": 1935, "step_time": 19.25437581166625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 149.625, "completions/mean_terminated_length": 149.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.37753038108348846, "epoch": 0.08967114404817045, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014955189544707537, "kl": 0.0015476090193260461, "learning_rate": 9.820750347383048e-07, "loss": 0.0001, "num_tokens": 53230268.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1936, "step_time": 17.72052463889122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 120.375, "completions/mean_terminated_length": 120.375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.31560005247592926, "epoch": 0.08971746178786476, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018230763962492347, "kl": 0.001572958513861522, "learning_rate": 9.82065771190366e-07, "loss": 0.0001, "num_tokens": 53252482.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1937, "step_time": 13.774052310734987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 164.125, "completions/mean_terminated_length": 164.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.23324602842330933, "epoch": 0.08976377952755905, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029678151477128267, "kl": 0.002326177229406312, "learning_rate": 9.82056507642427e-07, "loss": 0.0001, "num_tokens": 53279348.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1938, "step_time": 17.855856452137232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 167.1875, "completions/mean_terminated_length": 167.1875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.40140706300735474, "epoch": 0.08981009726725336, "frac_reward_zero_std": 1.0, "grad_norm": 0.002848844276741147, "kl": 0.0018949246150441468, "learning_rate": 9.820472440944882e-07, "loss": 0.0001, "num_tokens": 53310023.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1939, "step_time": 18.78799507766962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 207.6875, "completions/mean_terminated_length": 207.6875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.2529890611767769, "epoch": 0.08985641500694766, "frac_reward_zero_std": 0.0, "grad_norm": 0.103531114757061, "kl": 0.0018216021126136184, "learning_rate": 9.820379805465493e-07, "loss": -0.0469, "num_tokens": 53357458.0, "reward": 0.6555301547050476, "reward_std": 0.4592931568622589, "rewards/reward_func/mean": 0.6555301547050476, "rewards/reward_func/std": 0.4592931568622589, "step": 1940, "step_time": 28.453382831066847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 124.625, "completions/mean_terminated_length": 124.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.24574385583400726, "epoch": 0.08990273274664197, "frac_reward_zero_std": 1.0, "grad_norm": 0.001512282993644476, "kl": 0.0012296900094952434, "learning_rate": 9.820287169986104e-07, "loss": 0.0001, "num_tokens": 53379116.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1941, "step_time": 14.23377349972725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 200.125, "completions/mean_terminated_length": 200.125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.32386118918657303, "epoch": 0.08994905048633627, "frac_reward_zero_std": 0.0, "grad_norm": 0.12431399524211884, "kl": 0.0027555939159356058, "learning_rate": 9.820194534506715e-07, "loss": -0.1047, "num_tokens": 53410830.0, "reward": 0.1294839084148407, "reward_std": 0.23305761814117432, "rewards/reward_func/mean": 0.1294839084148407, "rewards/reward_func/std": 0.23305761814117432, "step": 1942, "step_time": 26.170398607850075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 228.25, "completions/mean_terminated_length": 228.25, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.43460363894701004, "epoch": 0.08999536822603058, "frac_reward_zero_std": 0.0, "grad_norm": 0.07519284635782242, "kl": 0.0021609098184853792, "learning_rate": 9.820101899027327e-07, "loss": -0.0258, "num_tokens": 53441378.0, "reward": 0.005189642775803804, "reward_std": 0.020758571103215218, "rewards/reward_func/mean": 0.005189642775803804, "rewards/reward_func/std": 0.020758571103215218, "step": 1943, "step_time": 25.71977098658681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 143.25, "completions/mean_terminated_length": 143.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.31021514534950256, "epoch": 0.09004168596572487, "frac_reward_zero_std": 1.0, "grad_norm": 0.003140147076919675, "kl": 0.002087398723233491, "learning_rate": 9.820009263547938e-07, "loss": 0.0001, "num_tokens": 53462358.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1944, "step_time": 16.813846472650766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 142.6875, "completions/mean_terminated_length": 142.6875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3413543477654457, "epoch": 0.09008800370541918, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014058761298656464, "kl": 0.0013815262645948678, "learning_rate": 9.81991662806855e-07, "loss": 0.0001, "num_tokens": 53487681.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1945, "step_time": 17.097448244690895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 106.0625, "completions/mean_terminated_length": 106.0625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.22293299436569214, "epoch": 0.09013432144511348, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013684408040717244, "kl": 0.0012961396569153294, "learning_rate": 9.81982399258916e-07, "loss": 0.0001, "num_tokens": 53508018.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1946, "step_time": 12.469510365277529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 120.4375, "completions/mean_terminated_length": 120.4375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2610037550330162, "epoch": 0.09018063918480779, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009769543539732695, "kl": 0.0012118838203605264, "learning_rate": 9.819731357109774e-07, "loss": 0.0001, "num_tokens": 53528985.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1947, "step_time": 13.232079800218344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 148.6875, "completions/mean_terminated_length": 148.6875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.16816206276416779, "epoch": 0.09022695692450208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010869316756725311, "kl": 0.0008072914788499475, "learning_rate": 9.819638721630385e-07, "loss": 0.0, "num_tokens": 53550564.0, "reward": 0.24659696221351624, "reward_std": 0.0, "rewards/reward_func/mean": 0.24659696221351624, "rewards/reward_func/std": 0.0, "step": 1948, "step_time": 16.953522082418203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 182.0, "completions/mean_terminated_length": 182.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.35463452339172363, "epoch": 0.09027327466419639, "frac_reward_zero_std": 0.0, "grad_norm": 0.12420963495969772, "kl": 0.003538883465807885, "learning_rate": 9.819546086150994e-07, "loss": -0.1319, "num_tokens": 53591716.0, "reward": 0.25, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.44721361994743347, "step": 1949, "step_time": 25.467616628855467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 203.5625, "completions/mean_terminated_length": 203.5625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.4176323562860489, "epoch": 0.09031959240389069, "frac_reward_zero_std": 1.0, "grad_norm": 0.003623420372605324, "kl": 0.0024453586665913463, "learning_rate": 9.819453450671607e-07, "loss": 0.0001, "num_tokens": 53622317.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1950, "step_time": 25.065615337342024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 147.8125, "completions/mean_terminated_length": 147.8125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.18409983441233635, "epoch": 0.090365910143585, "frac_reward_zero_std": 0.0, "grad_norm": 0.08783780038356781, "kl": 0.0014407682756427675, "learning_rate": 9.819360815192219e-07, "loss": 0.0017, "num_tokens": 53644554.0, "reward": 0.116437628865242, "reward_std": 0.008144183084368706, "rewards/reward_func/mean": 0.116437628865242, "rewards/reward_func/std": 0.008144183084368706, "step": 1951, "step_time": 17.291895169764757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 142.0625, "completions/mean_terminated_length": 142.0625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.21944910287857056, "epoch": 0.0904122278832793, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021472317166626453, "kl": 0.0013448151585180312, "learning_rate": 9.81926817971283e-07, "loss": 0.0001, "num_tokens": 53664331.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1952, "step_time": 14.36643573269248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 116.625, "completions/mean_terminated_length": 116.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.28818097710609436, "epoch": 0.0904585456229736, "frac_reward_zero_std": 1.0, "grad_norm": 0.002776138950139284, "kl": 0.0015259020728990436, "learning_rate": 9.819175544233441e-07, "loss": 0.0001, "num_tokens": 53686277.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1953, "step_time": 13.262791015207767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 170.4375, "completions/mean_terminated_length": 170.4375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.35676541924476624, "epoch": 0.0905048633626679, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019279010593891144, "kl": 0.001689134689513594, "learning_rate": 9.819082908754052e-07, "loss": 0.0001, "num_tokens": 53707836.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1954, "step_time": 18.223545279353857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 125.375, "completions/mean_terminated_length": 125.375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.25817713141441345, "epoch": 0.09055118110236221, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030821990221738815, "kl": 0.0015234506863635033, "learning_rate": 9.818990273274664e-07, "loss": 0.0001, "num_tokens": 53729122.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1955, "step_time": 13.691711734980345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 179.75, "completions/mean_terminated_length": 179.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.19519327953457832, "epoch": 0.0905974988420565, "frac_reward_zero_std": 0.0, "grad_norm": 0.08650083094835281, "kl": 0.0010984998807543889, "learning_rate": 9.818897637795275e-07, "loss": -0.0561, "num_tokens": 53755326.0, "reward": 0.6673440337181091, "reward_std": 0.011904047802090645, "rewards/reward_func/mean": 0.6673440337181091, "rewards/reward_func/std": 0.01190404687076807, "step": 1956, "step_time": 19.144754018634558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 197.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.13502133265137672, "epoch": 0.09064381658175082, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012273151660338044, "kl": 0.0008055913640419021, "learning_rate": 9.818805002315886e-07, "loss": 0.0, "num_tokens": 53777434.0, "reward": 0.9555630087852478, "reward_std": 0.0, "rewards/reward_func/mean": 0.9555630087852478, "rewards/reward_func/std": 0.0, "step": 1957, "step_time": 20.97706549987197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 300.5, "completions/mean_terminated_length": 300.5, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "entropy": 0.23473021015524864, "epoch": 0.09069013432144511, "frac_reward_zero_std": 0.0, "grad_norm": 0.09171711653470993, "kl": 0.0016145984409376979, "learning_rate": 9.818712366836497e-07, "loss": -0.021, "num_tokens": 53818418.0, "reward": 0.7842717170715332, "reward_std": 0.14640463888645172, "rewards/reward_func/mean": 0.7842717170715332, "rewards/reward_func/std": 0.14640463888645172, "step": 1958, "step_time": 33.88829968124628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 167.0625, "completions/mean_terminated_length": 167.0625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.41302254796028137, "epoch": 0.09073645206113942, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017865417758002877, "kl": 0.0017920045065693557, "learning_rate": 9.818619731357109e-07, "loss": 0.0001, "num_tokens": 53848755.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1959, "step_time": 19.698135547339916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 193.6875, "completions/mean_terminated_length": 193.6875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.28678224980831146, "epoch": 0.09078276980083372, "frac_reward_zero_std": 0.0, "grad_norm": 0.11251778155565262, "kl": 0.0018669200071599334, "learning_rate": 9.818527095877722e-07, "loss": -0.0109, "num_tokens": 53877726.0, "reward": 0.9071022272109985, "reward_std": 0.24333836138248444, "rewards/reward_func/mean": 0.9071022272109985, "rewards/reward_func/std": 0.24333836138248444, "step": 1960, "step_time": 21.394634574651718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 204.6875, "completions/mean_terminated_length": 204.6875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.2744165435433388, "epoch": 0.09082908754052803, "frac_reward_zero_std": 0.0, "grad_norm": 0.12224402278661728, "kl": 0.0029156223754398525, "learning_rate": 9.818434460398331e-07, "loss": -0.034, "num_tokens": 53904585.0, "reward": 0.5987157821655273, "reward_std": 0.1857866793870926, "rewards/reward_func/mean": 0.5987157821655273, "rewards/reward_func/std": 0.1857866793870926, "step": 1961, "step_time": 21.730876356363297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 171.125, "completions/mean_terminated_length": 171.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.23464681208133698, "epoch": 0.09087540528022232, "frac_reward_zero_std": 0.0, "grad_norm": 0.13402652740478516, "kl": 0.0016827634535729885, "learning_rate": 9.818341824918942e-07, "loss": -0.0029, "num_tokens": 53925691.0, "reward": 0.16108980774879456, "reward_std": 0.05650884658098221, "rewards/reward_func/mean": 0.16108980774879456, "rewards/reward_func/std": 0.05650884658098221, "step": 1962, "step_time": 18.06282015517354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 169.375, "completions/mean_terminated_length": 169.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.36945798993110657, "epoch": 0.09092172301991663, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030708175618201494, "kl": 0.0022847213549539447, "learning_rate": 9.818249189439554e-07, "loss": 0.0001, "num_tokens": 53946689.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1963, "step_time": 20.316118702292442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 208.9375, "completions/mean_terminated_length": 208.9375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.30894362181425095, "epoch": 0.09096804075961093, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015511916717514396, "kl": 0.0016433202545158565, "learning_rate": 9.818156553960167e-07, "loss": 0.0001, "num_tokens": 53973664.0, "reward": 0.5795782804489136, "reward_std": 0.0, "rewards/reward_func/mean": 0.5795782804489136, "rewards/reward_func/std": 0.0, "step": 1964, "step_time": 22.227172508835793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 119.3125, "completions/mean_terminated_length": 119.3125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2962167412042618, "epoch": 0.09101435849930524, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014858448412269354, "kl": 0.0013716779358219355, "learning_rate": 9.818063918480778e-07, "loss": 0.0001, "num_tokens": 53993429.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1965, "step_time": 14.127327963709831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 200.375, "completions/mean_terminated_length": 200.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.2774844169616699, "epoch": 0.09106067623899954, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011752174468711019, "kl": 0.0011603307939367369, "learning_rate": 9.81797128300139e-07, "loss": 0.0001, "num_tokens": 54019979.0, "reward": 0.4029351770877838, "reward_std": 0.0, "rewards/reward_func/mean": 0.4029351770877838, "rewards/reward_func/std": 0.0, "step": 1966, "step_time": 21.35994939506054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 159.25, "completions/mean_terminated_length": 159.25, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.24713463708758354, "epoch": 0.09110699397869385, "frac_reward_zero_std": 0.0, "grad_norm": 0.0783422440290451, "kl": 0.0015384795842692256, "learning_rate": 9.817878647522e-07, "loss": -0.0317, "num_tokens": 54041263.0, "reward": 0.8954063653945923, "reward_std": 0.03804173320531845, "rewards/reward_func/mean": 0.8954063653945923, "rewards/reward_func/std": 0.038041744381189346, "step": 1967, "step_time": 18.747099719941616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 108.125, "completions/mean_terminated_length": 108.125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.3539968729019165, "epoch": 0.09115331171838814, "frac_reward_zero_std": 1.0, "grad_norm": 0.004241200629621744, "kl": 0.0021811652986798435, "learning_rate": 9.817786012042612e-07, "loss": 0.0001, "num_tokens": 54063969.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1968, "step_time": 13.496718242764473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 167.5625, "completions/mean_terminated_length": 167.5625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.35350824892520905, "epoch": 0.09119962945808245, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015196891035884619, "kl": 0.0016085615498013794, "learning_rate": 9.817693376563223e-07, "loss": 0.0001, "num_tokens": 54086602.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1969, "step_time": 19.123491693288088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 173.625, "completions/mean_terminated_length": 173.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.1599022075533867, "epoch": 0.09124594719777675, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008308873511850834, "kl": 0.0009892965463222936, "learning_rate": 9.817600741083835e-07, "loss": 0.0, "num_tokens": 54112068.0, "reward": 0.894839346408844, "reward_std": 0.0, "rewards/reward_func/mean": 0.894839346408844, "rewards/reward_func/std": 0.0, "step": 1970, "step_time": 19.56661333888769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 116.3125, "completions/mean_terminated_length": 116.3125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.30716241151094437, "epoch": 0.09129226493747106, "frac_reward_zero_std": 1.0, "grad_norm": 0.005867612082511187, "kl": 0.0022151133161969483, "learning_rate": 9.817508105604446e-07, "loss": 0.0001, "num_tokens": 54132921.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1971, "step_time": 13.333259463310242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 188.625, "completions/mean_terminated_length": 188.625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.24652868881821632, "epoch": 0.09133858267716535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011445103446021676, "kl": 0.001014224108075723, "learning_rate": 9.817415470125057e-07, "loss": 0.0001, "num_tokens": 54161907.0, "reward": 0.13406634330749512, "reward_std": 0.0, "rewards/reward_func/mean": 0.13406634330749512, "rewards/reward_func/std": 0.0, "step": 1972, "step_time": 22.42594589293003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 134.0625, "completions/mean_terminated_length": 134.0625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.28645943105220795, "epoch": 0.09138490041685966, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019106051186099648, "kl": 0.0014066417643334717, "learning_rate": 9.81732283464567e-07, "loss": 0.0001, "num_tokens": 54183604.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1973, "step_time": 14.717470478266478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 171.3125, "completions/mean_terminated_length": 171.3125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.38657428324222565, "epoch": 0.09143121815655396, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011994995875284076, "kl": 0.0013932413421571255, "learning_rate": 9.81723019916628e-07, "loss": 0.0001, "num_tokens": 54211769.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1974, "step_time": 19.652615182101727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 199.0625, "completions/mean_terminated_length": 199.0625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.40833304822444916, "epoch": 0.09147753589624827, "frac_reward_zero_std": 1.0, "grad_norm": 0.002428983571007848, "kl": 0.0019206492870580405, "learning_rate": 9.81713756368689e-07, "loss": 0.0001, "num_tokens": 54236826.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1975, "step_time": 24.90197415649891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 167.8125, "completions/mean_terminated_length": 167.8125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4385068714618683, "epoch": 0.09152385363594256, "frac_reward_zero_std": 1.0, "grad_norm": 0.005276212468743324, "kl": 0.0030843618442304432, "learning_rate": 9.817044928207502e-07, "loss": 0.0002, "num_tokens": 54279607.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1976, "step_time": 24.52617084607482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 269.0625, "completions/mean_terminated_length": 269.0625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.24865829199552536, "epoch": 0.09157017137563687, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015469096833840013, "kl": 0.0015839740808587521, "learning_rate": 9.816952292728115e-07, "loss": 0.0001, "num_tokens": 54306488.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1977, "step_time": 26.93454695865512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 193.3125, "completions/mean_terminated_length": 193.3125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.2284526750445366, "epoch": 0.09161648911533117, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019630505703389645, "kl": 0.0016949988203123212, "learning_rate": 9.816859657248727e-07, "loss": 0.0001, "num_tokens": 54334221.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1978, "step_time": 20.993782050907612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 161.375, "completions/mean_terminated_length": 161.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.39297013729810715, "epoch": 0.09166280685502548, "frac_reward_zero_std": 1.0, "grad_norm": 0.00118962861597538, "kl": 0.0014068300661165267, "learning_rate": 9.816767021769338e-07, "loss": 0.0001, "num_tokens": 54371091.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1979, "step_time": 20.093108519911766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 124.125, "completions/mean_terminated_length": 124.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.27579325065016747, "epoch": 0.09170912459471978, "frac_reward_zero_std": 1.0, "grad_norm": 0.003379854140803218, "kl": 0.001728180272039026, "learning_rate": 9.81667438628995e-07, "loss": 0.0001, "num_tokens": 54391109.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1980, "step_time": 13.318691533058882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 167.6875, "completions/mean_terminated_length": 167.6875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.1334142778068781, "epoch": 0.09175544233441409, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008702704217284918, "kl": 0.0007795136043569073, "learning_rate": 9.81658175081056e-07, "loss": 0.0, "num_tokens": 54426864.0, "reward": 0.8507331609725952, "reward_std": 0.0, "rewards/reward_func/mean": 0.8507331609725952, "rewards/reward_func/std": 0.0, "step": 1981, "step_time": 20.51024015620351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 175.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.33453385531902313, "epoch": 0.09180176007410838, "frac_reward_zero_std": 1.0, "grad_norm": 0.004719714168459177, "kl": 0.0031589337158948183, "learning_rate": 9.816489115331172e-07, "loss": 0.0002, "num_tokens": 54448624.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 1982, "step_time": 18.334271013736725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 117.5, "completions/mean_terminated_length": 117.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2871744707226753, "epoch": 0.09184807781380269, "frac_reward_zero_std": 1.0, "grad_norm": 0.010424979031085968, "kl": 0.0027162292681168765, "learning_rate": 9.816396479851783e-07, "loss": 0.0001, "num_tokens": 54468872.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1983, "step_time": 13.565622244030237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 188.0, "completions/mean_terminated_length": 188.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.3009023070335388, "epoch": 0.09189439555349699, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019037388265132904, "kl": 0.001530302397441119, "learning_rate": 9.816303844372394e-07, "loss": 0.0001, "num_tokens": 54495240.0, "reward": 0.9428731203079224, "reward_std": 0.0, "rewards/reward_func/mean": 0.9428731203079224, "rewards/reward_func/std": 0.0, "step": 1984, "step_time": 20.63035625964403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 198.4375, "completions/mean_terminated_length": 198.4375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.4101143106818199, "epoch": 0.0919407132931913, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029017613269388676, "kl": 0.0018669950077310205, "learning_rate": 9.816211208893005e-07, "loss": 0.0001, "num_tokens": 54521071.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1985, "step_time": 22.225163273513317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 132.375, "completions/mean_terminated_length": 132.375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2674574702978134, "epoch": 0.0919870310328856, "frac_reward_zero_std": 1.0, "grad_norm": 0.003091532038524747, "kl": 0.002030002244282514, "learning_rate": 9.816118573413617e-07, "loss": 0.0001, "num_tokens": 54540581.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1986, "step_time": 14.761275552213192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 151.375, "completions/mean_terminated_length": 151.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.18644894286990166, "epoch": 0.0920333487725799, "frac_reward_zero_std": 1.0, "grad_norm": 0.00401868112385273, "kl": 0.0021452106011565775, "learning_rate": 9.816025937934228e-07, "loss": 0.0001, "num_tokens": 54561739.0, "reward": 0.9131007194519043, "reward_std": 0.0, "rewards/reward_func/mean": 0.9131007194519043, "rewards/reward_func/std": 0.0, "step": 1987, "step_time": 17.11887515336275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 174.0625, "completions/mean_terminated_length": 174.0625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.1855168156325817, "epoch": 0.0920796665122742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026354107540100813, "kl": 0.0016058009350672364, "learning_rate": 9.81593330245484e-07, "loss": 0.0001, "num_tokens": 54583148.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1988, "step_time": 18.430659186095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 113.0, "completions/mean_terminated_length": 113.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.22687219083309174, "epoch": 0.09212598425196851, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023138225078582764, "kl": 0.0012974188139196485, "learning_rate": 9.81584066697545e-07, "loss": 0.0001, "num_tokens": 54603756.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1989, "step_time": 12.908747110515833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 153.9375, "completions/mean_terminated_length": 153.9375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.408561073243618, "epoch": 0.0921723019916628, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014644854236394167, "kl": 0.0019203444826416671, "learning_rate": 9.815748031496064e-07, "loss": 0.0001, "num_tokens": 54652683.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1990, "step_time": 23.003245670348406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 197.125, "completions/mean_terminated_length": 197.125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.22864800691604614, "epoch": 0.09221861973135712, "frac_reward_zero_std": 0.0, "grad_norm": 0.11659272015094757, "kl": 0.0027175315481144935, "learning_rate": 9.815655396016675e-07, "loss": -0.0223, "num_tokens": 54677293.0, "reward": 0.38041621446609497, "reward_std": 0.4716108739376068, "rewards/reward_func/mean": 0.38041621446609497, "rewards/reward_func/std": 0.4716108739376068, "step": 1991, "step_time": 19.50218654796481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 217.0625, "completions/mean_terminated_length": 217.0625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.19894057139754295, "epoch": 0.09226493747105141, "frac_reward_zero_std": 1.0, "grad_norm": 0.006900215987116098, "kl": 0.005521606304682791, "learning_rate": 9.815562760537284e-07, "loss": 0.0003, "num_tokens": 54709758.0, "reward": 0.7708956003189087, "reward_std": 0.0, "rewards/reward_func/mean": 0.7708956003189087, "rewards/reward_func/std": 0.0, "step": 1992, "step_time": 24.340532917529345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 173.625, "completions/mean_terminated_length": 173.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.17384374886751175, "epoch": 0.09231125521074572, "frac_reward_zero_std": 0.0, "grad_norm": 0.08830630779266357, "kl": 0.0021812051709275693, "learning_rate": 9.815470125057895e-07, "loss": -0.0088, "num_tokens": 54734440.0, "reward": 0.7111106514930725, "reward_std": 0.003023663302883506, "rewards/reward_func/mean": 0.7111106514930725, "rewards/reward_func/std": 0.003023657714948058, "step": 1993, "step_time": 18.362381052225828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 117.0, "completions/mean_terminated_length": 117.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.23678594082593918, "epoch": 0.09235757295044002, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025700826663523912, "kl": 0.001432108401786536, "learning_rate": 9.815377489578509e-07, "loss": 0.0001, "num_tokens": 54753720.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1994, "step_time": 13.111877344548702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 171.375, "completions/mean_terminated_length": 171.375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.15514958649873734, "epoch": 0.09240389069013433, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011596870608627796, "kl": 0.0009256289049517363, "learning_rate": 9.81528485409912e-07, "loss": 0.0, "num_tokens": 54799118.0, "reward": 0.9428731203079224, "reward_std": 0.0, "rewards/reward_func/mean": 0.9428731203079224, "rewards/reward_func/std": 0.0, "step": 1995, "step_time": 24.76950002834201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 132.5, "completions/mean_terminated_length": 132.5, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.253715917468071, "epoch": 0.09245020842982862, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013802764005959034, "kl": 0.0013290940260048956, "learning_rate": 9.815192218619731e-07, "loss": 0.0001, "num_tokens": 54820118.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1996, "step_time": 14.5656126588583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 143.9375, "completions/mean_terminated_length": 143.9375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.33426566421985626, "epoch": 0.09249652616952293, "frac_reward_zero_std": 1.0, "grad_norm": 0.002717754105105996, "kl": 0.0019190639141015708, "learning_rate": 9.815099583140342e-07, "loss": 0.0001, "num_tokens": 54841749.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1997, "step_time": 16.373453199863434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 136.375, "completions/mean_terminated_length": 136.375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.26538507640361786, "epoch": 0.09254284390921723, "frac_reward_zero_std": 1.0, "grad_norm": 0.002579397289082408, "kl": 0.0014253585104597732, "learning_rate": 9.815006947660954e-07, "loss": 0.0001, "num_tokens": 54863499.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1998, "step_time": 14.984016232192516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 143.5625, "completions/mean_terminated_length": 143.5625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2968463897705078, "epoch": 0.09258916164891154, "frac_reward_zero_std": 1.0, "grad_norm": 0.002322679152712226, "kl": 0.0015304088883567601, "learning_rate": 9.814914312181565e-07, "loss": 0.0001, "num_tokens": 54885268.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 1999, "step_time": 15.738748639822006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 154.8125, "completions/mean_terminated_length": 154.8125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.42615479975938797, "epoch": 0.09263547938860583, "frac_reward_zero_std": 1.0, "grad_norm": 0.001267561805434525, "kl": 0.0015495356929022819, "learning_rate": 9.814821676702176e-07, "loss": 0.0001, "num_tokens": 54915249.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2000, "step_time": 18.564595259726048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 106.6875, "completions/mean_terminated_length": 106.6875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.24032869935035706, "epoch": 0.09268179712830014, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011372538283467293, "kl": 0.0010614956117933616, "learning_rate": 9.814729041222787e-07, "loss": 0.0001, "num_tokens": 54935500.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2001, "step_time": 13.356164246797562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 204.75, "completions/mean_terminated_length": 204.75, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.2812078222632408, "epoch": 0.09272811486799444, "frac_reward_zero_std": 0.0, "grad_norm": 0.13774681091308594, "kl": 0.003768278518691659, "learning_rate": 9.814636405743399e-07, "loss": 0.0101, "num_tokens": 54973784.0, "reward": 0.7076573371887207, "reward_std": 0.0030280218925327063, "rewards/reward_func/mean": 0.7076573371887207, "rewards/reward_func/std": 0.0030280244536697865, "step": 2002, "step_time": 25.842976734042168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 150.0, "completions/mean_terminated_length": 150.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3747788146138191, "epoch": 0.09277443260768875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008774223388172686, "kl": 0.0013073801819700748, "learning_rate": 9.814543770264012e-07, "loss": 0.0001, "num_tokens": 55006136.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2003, "step_time": 18.33067310601473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 140.25, "completions/mean_terminated_length": 140.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.22896815836429596, "epoch": 0.09282075034738305, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037381534930318594, "kl": 0.0016159997030626982, "learning_rate": 9.814451134784623e-07, "loss": 0.0001, "num_tokens": 55026828.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2004, "step_time": 15.401339266449213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 193.75, "completions/mean_terminated_length": 193.75, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.4159994348883629, "epoch": 0.09286706808707736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026936079375445843, "kl": 0.0023532802588306367, "learning_rate": 9.814358499305232e-07, "loss": 0.0001, "num_tokens": 55051096.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2005, "step_time": 19.05104163661599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.16379571333527565, "epoch": 0.09291338582677165, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007707058684900403, "kl": 0.0007123186805984005, "learning_rate": 9.814265863825844e-07, "loss": 0.0, "num_tokens": 55083994.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 2006, "step_time": 21.624755449593067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 155.3125, "completions/mean_terminated_length": 155.3125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.419580914080143, "epoch": 0.09295970356646596, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015239104395732284, "kl": 0.002229607925983146, "learning_rate": 9.814173228346457e-07, "loss": 0.0001, "num_tokens": 55132879.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2007, "step_time": 23.54218227788806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 166.1875, "completions/mean_terminated_length": 166.1875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.2905442714691162, "epoch": 0.09300602130616026, "frac_reward_zero_std": 0.0, "grad_norm": 0.15490469336509705, "kl": 0.003280369914136827, "learning_rate": 9.814080592867068e-07, "loss": 0.0328, "num_tokens": 55154722.0, "reward": 0.9940523505210876, "reward_std": 0.023790646344423294, "rewards/reward_func/mean": 0.9940523505210876, "rewards/reward_func/std": 0.023790642619132996, "step": 2008, "step_time": 19.209634006023407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 282.9375, "completions/mean_terminated_length": 282.9375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "entropy": 0.2532089799642563, "epoch": 0.09305233904585457, "frac_reward_zero_std": 1.0, "grad_norm": 0.001870440086349845, "kl": 0.0015394982474390417, "learning_rate": 9.81398795738768e-07, "loss": 0.0001, "num_tokens": 55190353.0, "reward": 0.8887742161750793, "reward_std": 0.0, "rewards/reward_func/mean": 0.8887742161750793, "rewards/reward_func/std": 0.0, "step": 2009, "step_time": 29.07831295952201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 170.6875, "completions/mean_terminated_length": 170.6875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.21067959442734718, "epoch": 0.09309865678554886, "frac_reward_zero_std": 0.0, "grad_norm": 0.14396819472312927, "kl": 0.003258177952375263, "learning_rate": 9.81389532190829e-07, "loss": -0.0437, "num_tokens": 55213900.0, "reward": 0.8580853939056396, "reward_std": 0.1954721212387085, "rewards/reward_func/mean": 0.8580853939056396, "rewards/reward_func/std": 0.1954721212387085, "step": 2010, "step_time": 18.470594085752964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 163.25, "completions/mean_terminated_length": 163.25, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.2205018289387226, "epoch": 0.09314497452524317, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013772568199783564, "kl": 0.0012428172049112618, "learning_rate": 9.813802686428902e-07, "loss": 0.0001, "num_tokens": 55245296.0, "reward": 0.9487294554710388, "reward_std": 0.0, "rewards/reward_func/mean": 0.9487294554710388, "rewards/reward_func/std": 0.0, "step": 2011, "step_time": 19.049249719828367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 177.1875, "completions/mean_terminated_length": 177.1875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.174911018460989, "epoch": 0.09319129226493747, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011804018868133426, "kl": 0.0010020088957389817, "learning_rate": 9.813710050949513e-07, "loss": 0.0, "num_tokens": 55266467.0, "reward": 0.747017502784729, "reward_std": 0.0, "rewards/reward_func/mean": 0.747017502784729, "rewards/reward_func/std": 0.0, "step": 2012, "step_time": 17.20403290167451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 172.875, "completions/mean_terminated_length": 172.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.16459906101226807, "epoch": 0.09323761000463178, "frac_reward_zero_std": 0.0, "grad_norm": 0.16722524166107178, "kl": 0.004344968969235197, "learning_rate": 9.813617415470125e-07, "loss": -0.1407, "num_tokens": 55287713.0, "reward": 0.32027894258499146, "reward_std": 0.3841894567012787, "rewards/reward_func/mean": 0.32027894258499146, "rewards/reward_func/std": 0.3841894567012787, "step": 2013, "step_time": 19.53892307356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 206.5, "completions/mean_terminated_length": 206.5, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.16332386806607246, "epoch": 0.09328392774432608, "frac_reward_zero_std": 0.0, "grad_norm": 0.08634932339191437, "kl": 0.001094053266569972, "learning_rate": 9.813524779990736e-07, "loss": -0.0198, "num_tokens": 55311961.0, "reward": 0.9664702415466309, "reward_std": 0.023347126320004463, "rewards/reward_func/mean": 0.9664702415466309, "rewards/reward_func/std": 0.023347120732069016, "step": 2014, "step_time": 20.719892770051956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 266.75, "completions/mean_terminated_length": 266.75, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.2889692559838295, "epoch": 0.09333024548402039, "frac_reward_zero_std": 0.0, "grad_norm": 0.0823928490281105, "kl": 0.0021612555137835443, "learning_rate": 9.813432144511347e-07, "loss": -0.0869, "num_tokens": 55344389.0, "reward": 0.5739880800247192, "reward_std": 0.196987584233284, "rewards/reward_func/mean": 0.5739880800247192, "rewards/reward_func/std": 0.196987584233284, "step": 2015, "step_time": 27.850818529725075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 232.4375, "completions/mean_terminated_length": 232.4375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.20757009834051132, "epoch": 0.09337656322371468, "frac_reward_zero_std": 0.0, "grad_norm": 0.0711066871881485, "kl": 0.002094621246214956, "learning_rate": 9.813339509031958e-07, "loss": -0.0323, "num_tokens": 55369884.0, "reward": 0.7875936031341553, "reward_std": 0.10538309067487717, "rewards/reward_func/mean": 0.7875936031341553, "rewards/reward_func/std": 0.10538309812545776, "step": 2016, "step_time": 23.083560083061457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 196.3125, "completions/mean_terminated_length": 196.3125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.331020712852478, "epoch": 0.09342288096340899, "frac_reward_zero_std": 0.0, "grad_norm": 0.11179359257221222, "kl": 0.004218329850118607, "learning_rate": 9.81324687355257e-07, "loss": -0.0371, "num_tokens": 55407265.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 2017, "step_time": 24.803230065852404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 183.375, "completions/mean_terminated_length": 183.375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.40553393214941025, "epoch": 0.09346919870310329, "frac_reward_zero_std": 1.0, "grad_norm": 0.002806166186928749, "kl": 0.001957463100552559, "learning_rate": 9.81315423807318e-07, "loss": 0.0001, "num_tokens": 55433991.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2018, "step_time": 19.76413144916296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 173.4375, "completions/mean_terminated_length": 173.4375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.3953156992793083, "epoch": 0.0935155164427976, "frac_reward_zero_std": 1.0, "grad_norm": 0.001481954357586801, "kl": 0.0014554686786141247, "learning_rate": 9.813061602593792e-07, "loss": 0.0001, "num_tokens": 55458078.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2019, "step_time": 20.22554812580347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 193.0625, "completions/mean_terminated_length": 193.0625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4021492078900337, "epoch": 0.0935618341824919, "frac_reward_zero_std": 1.0, "grad_norm": 0.005300453864037991, "kl": 0.003082145005464554, "learning_rate": 9.812968967114405e-07, "loss": 0.0002, "num_tokens": 55481983.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2020, "step_time": 20.60504487901926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 191.4375, "completions/mean_terminated_length": 191.4375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.40075282752513885, "epoch": 0.0936081519221862, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013799435691908002, "kl": 0.0014147369656711817, "learning_rate": 9.812876331635017e-07, "loss": 0.0001, "num_tokens": 55512950.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2021, "step_time": 21.51679801568389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 197.125, "completions/mean_terminated_length": 197.125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.18907545879483223, "epoch": 0.0936544696618805, "frac_reward_zero_std": 0.0, "grad_norm": 0.12522315979003906, "kl": 0.0015172976854955778, "learning_rate": 9.812783696155628e-07, "loss": 0.035, "num_tokens": 55534296.0, "reward": 0.9603575468063354, "reward_std": 0.05285662040114403, "rewards/reward_func/mean": 0.9603575468063354, "rewards/reward_func/std": 0.052856624126434326, "step": 2022, "step_time": 20.253864627331495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 124.1875, "completions/mean_terminated_length": 124.1875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.34351880848407745, "epoch": 0.09370078740157481, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025619296357035637, "kl": 0.0018770191818475723, "learning_rate": 9.812691060676237e-07, "loss": 0.0001, "num_tokens": 55558219.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2023, "step_time": 14.198850486427546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 185.125, "completions/mean_terminated_length": 185.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.4475076347589493, "epoch": 0.0937471051412691, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033801370300352573, "kl": 0.002179990289732814, "learning_rate": 9.81259842519685e-07, "loss": 0.0001, "num_tokens": 55590413.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2024, "step_time": 29.04584624245763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 218.9375, "completions/mean_terminated_length": 218.9375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.23236701637506485, "epoch": 0.09379342288096341, "frac_reward_zero_std": 0.0, "grad_norm": 0.09577567130327225, "kl": 0.0035261387238278985, "learning_rate": 9.812505789717462e-07, "loss": -0.037, "num_tokens": 55613724.0, "reward": 0.7402232885360718, "reward_std": 0.2639003396034241, "rewards/reward_func/mean": 0.7402232885360718, "rewards/reward_func/std": 0.2639003396034241, "step": 2025, "step_time": 21.337419539690018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 134.75, "completions/mean_terminated_length": 134.75, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.33088279515504837, "epoch": 0.09383974062065771, "frac_reward_zero_std": 1.0, "grad_norm": 0.004115608055144548, "kl": 0.0020482465624809265, "learning_rate": 9.812413154238073e-07, "loss": 0.0001, "num_tokens": 55639240.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2026, "step_time": 15.747435353696346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 201.5625, "completions/mean_terminated_length": 201.5625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.4111289530992508, "epoch": 0.09388605836035202, "frac_reward_zero_std": 0.0, "grad_norm": 0.10355306416749954, "kl": 0.00337825994938612, "learning_rate": 9.812320518758684e-07, "loss": 0.0917, "num_tokens": 55674609.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 2027, "step_time": 26.33951948583126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 244.0625, "completions/mean_terminated_length": 244.0625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.3604082316160202, "epoch": 0.09393237610004632, "frac_reward_zero_std": 0.0, "grad_norm": 0.08023695647716522, "kl": 0.0027463238802738488, "learning_rate": 9.812227883279295e-07, "loss": 0.0023, "num_tokens": 55712722.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 2028, "step_time": 27.899784050881863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 180.375, "completions/mean_terminated_length": 180.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.31885142624378204, "epoch": 0.09397869383974063, "frac_reward_zero_std": 0.0, "grad_norm": 0.11050109565258026, "kl": 0.002334500750293955, "learning_rate": 9.812135247799907e-07, "loss": -0.0501, "num_tokens": 55736184.0, "reward": 0.6500093936920166, "reward_std": 0.32208919525146484, "rewards/reward_func/mean": 0.6500093936920166, "rewards/reward_func/std": 0.32208922505378723, "step": 2029, "step_time": 18.724287275224924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 187.125, "completions/mean_terminated_length": 187.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.3281235545873642, "epoch": 0.09402501157943492, "frac_reward_zero_std": 0.0, "grad_norm": 0.12246488779783249, "kl": 0.005524210748262703, "learning_rate": 9.812042612320518e-07, "loss": -0.0006, "num_tokens": 55757050.0, "reward": 0.8068915009498596, "reward_std": 0.21836881339550018, "rewards/reward_func/mean": 0.8068915009498596, "rewards/reward_func/std": 0.21836881339550018, "step": 2030, "step_time": 20.11217812821269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 202.875, "completions/mean_terminated_length": 202.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.24049953371286392, "epoch": 0.09407132931912923, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015947173815220594, "kl": 0.0012396015226840973, "learning_rate": 9.81194997684113e-07, "loss": 0.0001, "num_tokens": 55783256.0, "reward": 0.9161604642868042, "reward_std": 0.0, "rewards/reward_func/mean": 0.9161604642868042, "rewards/reward_func/std": 0.0, "step": 2031, "step_time": 21.03316890448332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 198.6875, "completions/mean_terminated_length": 198.6875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.2781083658337593, "epoch": 0.09411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.09110790491104126, "kl": 0.001261789002455771, "learning_rate": 9.81185734136174e-07, "loss": 0.1084, "num_tokens": 55824419.0, "reward": 0.7620233297348022, "reward_std": 0.2957080006599426, "rewards/reward_func/mean": 0.7620233297348022, "rewards/reward_func/std": 0.2957080006599426, "step": 2032, "step_time": 26.385674338787794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 191.3125, "completions/mean_terminated_length": 191.3125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.20272637158632278, "epoch": 0.09416396479851784, "frac_reward_zero_std": 0.0, "grad_norm": 0.07322783023118973, "kl": 0.0009164645161945373, "learning_rate": 9.811764705882352e-07, "loss": 0.0114, "num_tokens": 55860968.0, "reward": 0.8565701246261597, "reward_std": 0.002278505591675639, "rewards/reward_func/mean": 0.8565701246261597, "rewards/reward_func/std": 0.0022785027977079153, "step": 2033, "step_time": 24.84770367667079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 170.5625, "completions/mean_terminated_length": 170.5625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.39616280794143677, "epoch": 0.09421028253821213, "frac_reward_zero_std": 1.0, "grad_norm": 0.0078053586184978485, "kl": 0.002060381375486031, "learning_rate": 9.811672070402965e-07, "loss": 0.0001, "num_tokens": 55911825.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2034, "step_time": 24.6925237365067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 236.5, "completions/mean_terminated_length": 236.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.3319804519414902, "epoch": 0.09425660027790644, "frac_reward_zero_std": 0.0, "grad_norm": 0.10468065738677979, "kl": 0.0035973970079794526, "learning_rate": 9.811579434923574e-07, "loss": -0.0125, "num_tokens": 55945785.0, "reward": 0.625, "reward_std": 0.5, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5, "step": 2035, "step_time": 24.75483187288046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 203.0625, "completions/mean_terminated_length": 203.0625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.43262146413326263, "epoch": 0.09430291801760074, "frac_reward_zero_std": 0.0, "grad_norm": 0.0937405377626419, "kl": 0.0019505124655552208, "learning_rate": 9.811486799444185e-07, "loss": 0.1206, "num_tokens": 55968362.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 2036, "step_time": 25.007275737822056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 200.25, "completions/mean_terminated_length": 200.25, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.15093117579817772, "epoch": 0.09434923575729505, "frac_reward_zero_std": 0.0, "grad_norm": 0.08309942483901978, "kl": 0.0011663798068184406, "learning_rate": 9.811394163964799e-07, "loss": 0.0106, "num_tokens": 55999070.0, "reward": 0.9607253074645996, "reward_std": 0.02734741009771824, "rewards/reward_func/mean": 0.9607253074645996, "rewards/reward_func/std": 0.027347413823008537, "step": 2037, "step_time": 21.52350740507245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 160.375, "completions/mean_terminated_length": 160.375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.37679579854011536, "epoch": 0.09439555349698935, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013516127364709973, "kl": 0.0016926408861763775, "learning_rate": 9.81130152848541e-07, "loss": 0.0001, "num_tokens": 56034324.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2038, "step_time": 20.581428475677967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 152.6875, "completions/mean_terminated_length": 152.6875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.36912456154823303, "epoch": 0.09444187123668366, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032714083790779114, "kl": 0.0026178762200288475, "learning_rate": 9.811208893006021e-07, "loss": 0.0001, "num_tokens": 56079519.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2039, "step_time": 21.89334412664175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 185.375, "completions/mean_terminated_length": 185.375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.3762866109609604, "epoch": 0.09448818897637795, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023587944451719522, "kl": 0.0018823669233825058, "learning_rate": 9.811116257526633e-07, "loss": 0.0001, "num_tokens": 56112933.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2040, "step_time": 21.799803376197815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 146.5625, "completions/mean_terminated_length": 146.5625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.26305900514125824, "epoch": 0.09453450671607226, "frac_reward_zero_std": 1.0, "grad_norm": 0.00414580712094903, "kl": 0.0029761113109998405, "learning_rate": 9.811023622047244e-07, "loss": 0.0001, "num_tokens": 56132894.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2041, "step_time": 16.481983814388514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 123.9375, "completions/mean_terminated_length": 123.9375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2890935614705086, "epoch": 0.09458082445576656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013513185549527407, "kl": 0.001297875598538667, "learning_rate": 9.810930986567855e-07, "loss": 0.0001, "num_tokens": 56152541.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2042, "step_time": 13.403200272470713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 135.1875, "completions/mean_terminated_length": 135.1875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3202582746744156, "epoch": 0.09462714219546087, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020696830470114946, "kl": 0.0016303090669680387, "learning_rate": 9.810838351088466e-07, "loss": 0.0001, "num_tokens": 56182896.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2043, "step_time": 17.72435461357236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 130.5, "completions/mean_terminated_length": 130.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.35065240412950516, "epoch": 0.09467345993515516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010780496522784233, "kl": 0.0011586417676880956, "learning_rate": 9.810745715609078e-07, "loss": 0.0001, "num_tokens": 56206344.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2044, "step_time": 15.890535064041615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 166.0625, "completions/mean_terminated_length": 166.0625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3423704281449318, "epoch": 0.09471977767484947, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012414716184139252, "kl": 0.0013426708756014705, "learning_rate": 9.810653080129689e-07, "loss": 0.0001, "num_tokens": 56236041.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2045, "step_time": 19.825654160231352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 168.25, "completions/mean_terminated_length": 168.25, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.2755350135266781, "epoch": 0.09476609541454377, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035304257180541754, "kl": 0.0024147378862835467, "learning_rate": 9.8105604446503e-07, "loss": 0.0001, "num_tokens": 56257853.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2046, "step_time": 18.540200740098953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 127.25, "completions/mean_terminated_length": 127.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.34416310489177704, "epoch": 0.09481241315423808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022126396652311087, "kl": 0.0016846568905748427, "learning_rate": 9.810467809170913e-07, "loss": 0.0001, "num_tokens": 56283969.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2047, "step_time": 17.07555378228426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 311.6875, "completions/mean_terminated_length": 311.6875, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "entropy": 0.22475888207554817, "epoch": 0.09485873089393237, "frac_reward_zero_std": 0.0, "grad_norm": 0.07333046942949295, "kl": 0.002175509522203356, "learning_rate": 9.810375173691523e-07, "loss": -0.0462, "num_tokens": 56324796.0, "reward": 0.8022526502609253, "reward_std": 0.1691083461046219, "rewards/reward_func/mean": 0.8022526502609253, "rewards/reward_func/std": 0.1691083461046219, "step": 2048, "step_time": 33.69538462534547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 141.8125, "completions/mean_terminated_length": 141.8125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.37307263165712357, "epoch": 0.09490504863362668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0072221108712255955, "kl": 0.004154925467446446, "learning_rate": 9.810282538212134e-07, "loss": 0.0002, "num_tokens": 56345609.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2049, "step_time": 15.079968519508839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.304686039686203, "epoch": 0.09495136637332098, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034532558638602495, "kl": 0.0017407748964615166, "learning_rate": 9.810189902732747e-07, "loss": 0.0001, "num_tokens": 56365893.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2050, "step_time": 14.859419286251068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 125.6875, "completions/mean_terminated_length": 125.6875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2114924117922783, "epoch": 0.09499768411301529, "frac_reward_zero_std": 1.0, "grad_norm": 0.02498413622379303, "kl": 0.004018052568426356, "learning_rate": 9.810097267253358e-07, "loss": 0.0002, "num_tokens": 56385392.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2051, "step_time": 13.88862270489335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 197.0625, "completions/mean_terminated_length": 197.0625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.4702746123075485, "epoch": 0.09504400185270959, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017258214065805078, "kl": 0.0020685320487245917, "learning_rate": 9.81000463177397e-07, "loss": 0.0001, "num_tokens": 56414881.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2052, "step_time": 23.93411250412464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 166.125, "completions/mean_terminated_length": 166.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.35899273306131363, "epoch": 0.0950903195924039, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014317986788228154, "kl": 0.0013684245059266686, "learning_rate": 9.80991199629458e-07, "loss": 0.0001, "num_tokens": 56437347.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2053, "step_time": 17.814839605242014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 138.6875, "completions/mean_terminated_length": 138.6875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.31955157220363617, "epoch": 0.09513663733209819, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013007436646148562, "kl": 0.0011292924464214593, "learning_rate": 9.809819360815192e-07, "loss": 0.0001, "num_tokens": 56464094.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2054, "step_time": 16.677923016250134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 157.75, "completions/mean_terminated_length": 157.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3834758996963501, "epoch": 0.0951829550717925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012701147934421897, "kl": 0.0014297603629529476, "learning_rate": 9.809726725335803e-07, "loss": 0.0001, "num_tokens": 56499690.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2055, "step_time": 20.412053678184748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 149.375, "completions/mean_terminated_length": 149.375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3550326004624367, "epoch": 0.0952292728114868, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015697143971920013, "kl": 0.0015536470455117524, "learning_rate": 9.809634089856415e-07, "loss": 0.0001, "num_tokens": 56525216.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2056, "step_time": 18.424117360264063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 137.875, "completions/mean_terminated_length": 137.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.30019643902778625, "epoch": 0.09527559055118111, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014886199496686459, "kl": 0.0015041444275993854, "learning_rate": 9.809541454377026e-07, "loss": 0.0001, "num_tokens": 56547150.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2057, "step_time": 16.6746284365654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 257.9375, "completions/mean_terminated_length": 257.9375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.337487168610096, "epoch": 0.0953219082908754, "frac_reward_zero_std": 0.0, "grad_norm": 0.07086921483278275, "kl": 0.0030414532811846584, "learning_rate": 9.809448818897637e-07, "loss": -0.1511, "num_tokens": 56575741.0, "reward": 0.5, "reward_std": 0.5163977742195129, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5163977742195129, "step": 2058, "step_time": 30.399008128792048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 118.75, "completions/mean_terminated_length": 118.75, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.23380812257528305, "epoch": 0.09536822603056971, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008936121012084186, "kl": 0.0009265767293982208, "learning_rate": 9.809356183418248e-07, "loss": 0.0, "num_tokens": 56595721.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2059, "step_time": 13.686347719281912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 159.1875, "completions/mean_terminated_length": 159.1875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.25653551146388054, "epoch": 0.09541454377026401, "frac_reward_zero_std": 1.0, "grad_norm": 0.008221469819545746, "kl": 0.003456158301560208, "learning_rate": 9.80926354793886e-07, "loss": 0.0002, "num_tokens": 56619692.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 2060, "step_time": 17.168593287467957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 194.625, "completions/mean_terminated_length": 194.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.1649177223443985, "epoch": 0.09546086150995832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010986204724758863, "kl": 0.001024660756229423, "learning_rate": 9.80917091245947e-07, "loss": 0.0001, "num_tokens": 56650406.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 2061, "step_time": 21.216629676520824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 223.5625, "completions/mean_terminated_length": 223.5625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.30768416076898575, "epoch": 0.09550717924965262, "frac_reward_zero_std": 0.0, "grad_norm": 0.07383064180612564, "kl": 0.003608456638175994, "learning_rate": 9.809078276980082e-07, "loss": -0.0686, "num_tokens": 56672831.0, "reward": 0.3277292549610138, "reward_std": 0.38379722833633423, "rewards/reward_func/mean": 0.3277292549610138, "rewards/reward_func/std": 0.38379722833633423, "step": 2062, "step_time": 25.07394739612937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 236.1875, "completions/mean_terminated_length": 236.1875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.184597447514534, "epoch": 0.09555349698934693, "frac_reward_zero_std": 0.0, "grad_norm": 0.11104031652212143, "kl": 0.0008925462025217712, "learning_rate": 9.808985641500693e-07, "loss": 0.0104, "num_tokens": 56707810.0, "reward": 0.9944300055503845, "reward_std": 0.022280026227235794, "rewards/reward_func/mean": 0.9944300055503845, "rewards/reward_func/std": 0.022280022501945496, "step": 2063, "step_time": 26.623209707438946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 169.1875, "completions/mean_terminated_length": 169.1875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.14013603702187538, "epoch": 0.09559981472904122, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011042552068829536, "kl": 0.0006901939486851916, "learning_rate": 9.808893006021307e-07, "loss": 0.0, "num_tokens": 56745749.0, "reward": 0.8890097737312317, "reward_std": 0.0, "rewards/reward_func/mean": 0.8890097737312317, "rewards/reward_func/std": 0.0, "step": 2064, "step_time": 21.75394108146429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 131.125, "completions/mean_terminated_length": 131.125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.35413843393325806, "epoch": 0.09564613246873553, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026031648740172386, "kl": 0.002009792427998036, "learning_rate": 9.808800370541918e-07, "loss": 0.0001, "num_tokens": 56766775.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2065, "step_time": 14.743171103298664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 134.25, "completions/mean_terminated_length": 134.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.29190149903297424, "epoch": 0.09569245020842983, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034215168561786413, "kl": 0.0023275664425455034, "learning_rate": 9.808707735062527e-07, "loss": 0.0001, "num_tokens": 56796123.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2066, "step_time": 16.022603794932365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 202.5, "completions/mean_terminated_length": 202.5, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.4320342019200325, "epoch": 0.09573876794812414, "frac_reward_zero_std": 1.0, "grad_norm": 0.002367663662880659, "kl": 0.0018327873258385807, "learning_rate": 9.80861509958314e-07, "loss": 0.0001, "num_tokens": 56823603.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2067, "step_time": 22.358512055128813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 139.1875, "completions/mean_terminated_length": 139.1875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2488016001880169, "epoch": 0.09578508568781843, "frac_reward_zero_std": 0.0, "grad_norm": 0.17417486011981964, "kl": 0.004339932158472948, "learning_rate": 9.808522464103752e-07, "loss": -0.0446, "num_tokens": 56850166.0, "reward": 0.9193795919418335, "reward_std": 0.15853877365589142, "rewards/reward_func/mean": 0.9193795919418335, "rewards/reward_func/std": 0.15853877365589142, "step": 2068, "step_time": 16.58432499691844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 180.8125, "completions/mean_terminated_length": 180.8125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.23132160678505898, "epoch": 0.09583140342751274, "frac_reward_zero_std": 0.0, "grad_norm": 0.10905469954013824, "kl": 0.001088093180442229, "learning_rate": 9.808429828624363e-07, "loss": 0.0067, "num_tokens": 56889859.0, "reward": 0.8267015218734741, "reward_std": 0.021780284121632576, "rewards/reward_func/mean": 0.8267015218734741, "rewards/reward_func/std": 0.02178027853369713, "step": 2069, "step_time": 23.202710587531328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 220.3125, "completions/mean_terminated_length": 220.3125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.2583886757493019, "epoch": 0.09587772116720704, "frac_reward_zero_std": 1.0, "grad_norm": 0.002973136492073536, "kl": 0.0019069083500653505, "learning_rate": 9.808337193144974e-07, "loss": 0.0001, "num_tokens": 56915464.0, "reward": 0.7105904221534729, "reward_std": 0.0, "rewards/reward_func/mean": 0.7105904221534729, "rewards/reward_func/std": 0.0, "step": 2070, "step_time": 23.26581984013319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 190.3125, "completions/mean_terminated_length": 190.3125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.40494538098573685, "epoch": 0.09592403890690135, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037858253344893456, "kl": 0.002681231824681163, "learning_rate": 9.808244557665585e-07, "loss": 0.0001, "num_tokens": 56940269.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2071, "step_time": 21.089487422257662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 130.0, "completions/mean_terminated_length": 130.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.21866579353809357, "epoch": 0.09597035664659564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031161820515990257, "kl": 0.001894166081910953, "learning_rate": 9.808151922186197e-07, "loss": 0.0001, "num_tokens": 56959869.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2072, "step_time": 13.873397447168827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 125.25, "completions/mean_terminated_length": 125.25, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.22946714982390404, "epoch": 0.09601667438628995, "frac_reward_zero_std": 1.0, "grad_norm": 0.005955255590379238, "kl": 0.0018666211108211428, "learning_rate": 9.808059286706808e-07, "loss": 0.0001, "num_tokens": 56979249.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2073, "step_time": 13.370041579008102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 163.8125, "completions/mean_terminated_length": 163.8125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.3609740734100342, "epoch": 0.09606299212598425, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018750651506707072, "kl": 0.001974839164176956, "learning_rate": 9.80796665122742e-07, "loss": 0.0001, "num_tokens": 57000014.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2074, "step_time": 17.314895667135715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 137.25, "completions/mean_terminated_length": 137.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.341608501970768, "epoch": 0.09610930986567856, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021006171591579914, "kl": 0.0016438520688097924, "learning_rate": 9.80787401574803e-07, "loss": 0.0001, "num_tokens": 57023938.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2075, "step_time": 15.84898490831256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 138.4375, "completions/mean_terminated_length": 138.4375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.20013650879263878, "epoch": 0.09615562760537286, "frac_reward_zero_std": 1.0, "grad_norm": 0.005079299211502075, "kl": 0.002640404738485813, "learning_rate": 9.807781380268642e-07, "loss": 0.0001, "num_tokens": 57043641.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2076, "step_time": 14.947165336459875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.21334562450647354, "epoch": 0.09620194534506717, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010591655736789107, "kl": 0.0010537790512898937, "learning_rate": 9.807688744789255e-07, "loss": 0.0001, "num_tokens": 57072295.0, "reward": 0.7425271272659302, "reward_std": 0.0, "rewards/reward_func/mean": 0.7425271272659302, "rewards/reward_func/std": 0.0, "step": 2077, "step_time": 16.897052317857742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 136.5, "completions/mean_terminated_length": 136.5, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3212684392929077, "epoch": 0.09624826308476146, "frac_reward_zero_std": 1.0, "grad_norm": 0.001615766086615622, "kl": 0.0015198698383755982, "learning_rate": 9.807596109309864e-07, "loss": 0.0001, "num_tokens": 57097791.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2078, "step_time": 16.013172037899494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 170.625, "completions/mean_terminated_length": 170.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.16729550063610077, "epoch": 0.09629458082445577, "frac_reward_zero_std": 1.0, "grad_norm": 0.001133670681156218, "kl": 0.0008856083150021732, "learning_rate": 9.807503473830475e-07, "loss": 0.0, "num_tokens": 57121849.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 2079, "step_time": 17.570480413734913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.1947532631456852, "epoch": 0.09634089856415007, "frac_reward_zero_std": 1.0, "grad_norm": 0.004643620457500219, "kl": 0.002088828943669796, "learning_rate": 9.807410838351089e-07, "loss": 0.0001, "num_tokens": 57147743.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 2080, "step_time": 16.442312948405743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 139.9375, "completions/mean_terminated_length": 139.9375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3246047720313072, "epoch": 0.09638721630384438, "frac_reward_zero_std": 1.0, "grad_norm": 0.006020332686603069, "kl": 0.003244896070100367, "learning_rate": 9.8073182028717e-07, "loss": 0.0002, "num_tokens": 57169326.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2081, "step_time": 16.652198139578104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 198.4375, "completions/mean_terminated_length": 198.4375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.22148482128977776, "epoch": 0.09643353404353867, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012353778583928943, "kl": 0.001151223696069792, "learning_rate": 9.807225567392311e-07, "loss": 0.0001, "num_tokens": 57205141.0, "reward": 0.5623413324356079, "reward_std": 0.0, "rewards/reward_func/mean": 0.5623413324356079, "rewards/reward_func/std": 0.0, "step": 2082, "step_time": 22.177756395190954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 181.625, "completions/mean_terminated_length": 181.625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.21318692713975906, "epoch": 0.09647985178323298, "frac_reward_zero_std": 0.0, "grad_norm": 0.11407241225242615, "kl": 0.0032745692878961563, "learning_rate": 9.807132931912923e-07, "loss": -0.0443, "num_tokens": 57230063.0, "reward": 0.7006678581237793, "reward_std": 0.07982190698385239, "rewards/reward_func/mean": 0.7006678581237793, "rewards/reward_func/std": 0.07982189953327179, "step": 2083, "step_time": 20.12787677720189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 123.125, "completions/mean_terminated_length": 123.125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2635095790028572, "epoch": 0.09652616952292728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025622595567256212, "kl": 0.0018459637358319014, "learning_rate": 9.807040296433534e-07, "loss": 0.0001, "num_tokens": 57252145.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2084, "step_time": 13.830753143876791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 158.1875, "completions/mean_terminated_length": 158.1875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3883274719119072, "epoch": 0.09657248726262159, "frac_reward_zero_std": 1.0, "grad_norm": 0.00164109468460083, "kl": 0.001676585408858955, "learning_rate": 9.806947660954145e-07, "loss": 0.0001, "num_tokens": 57285220.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2085, "step_time": 19.91367145255208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 190.1875, "completions/mean_terminated_length": 190.1875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.4454917311668396, "epoch": 0.09661880500231589, "frac_reward_zero_std": 1.0, "grad_norm": 0.00665110070258379, "kl": 0.004151255008764565, "learning_rate": 9.806855025474756e-07, "loss": 0.0002, "num_tokens": 57313207.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2086, "step_time": 24.15814983472228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 190.25, "completions/mean_terminated_length": 190.25, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.24852854758501053, "epoch": 0.0966651227420102, "frac_reward_zero_std": 0.0, "grad_norm": 0.06248871237039566, "kl": 0.002544258371926844, "learning_rate": 9.806762389995368e-07, "loss": 0.0221, "num_tokens": 57342507.0, "reward": 0.9785215258598328, "reward_std": 0.08591390401124954, "rewards/reward_func/mean": 0.9785215258598328, "rewards/reward_func/std": 0.08591391146183014, "step": 2087, "step_time": 20.447678916156292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 178.5, "completions/mean_terminated_length": 178.5, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.19578471034765244, "epoch": 0.09671144048170449, "frac_reward_zero_std": 0.0, "grad_norm": 0.08643905818462372, "kl": 0.0011518751416588202, "learning_rate": 9.806669754515979e-07, "loss": -0.0566, "num_tokens": 57372995.0, "reward": 0.5361264944076538, "reward_std": 0.09083743393421173, "rewards/reward_func/mean": 0.5361264944076538, "rewards/reward_func/std": 0.09083743393421173, "step": 2088, "step_time": 22.06103541329503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 166.5625, "completions/mean_terminated_length": 166.5625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.20414895564317703, "epoch": 0.0967577582213988, "frac_reward_zero_std": 0.0, "grad_norm": 0.13867773115634918, "kl": 0.0043305178405717015, "learning_rate": 9.80657711903659e-07, "loss": 0.0415, "num_tokens": 57397628.0, "reward": 0.9576616287231445, "reward_std": 0.049273598939180374, "rewards/reward_func/mean": 0.9576616287231445, "rewards/reward_func/std": 0.049273598939180374, "step": 2089, "step_time": 19.180107697844505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 214.125, "completions/mean_terminated_length": 214.125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.4542301818728447, "epoch": 0.0968040759610931, "frac_reward_zero_std": 0.0, "grad_norm": 0.09310434758663177, "kl": 0.0026618808042258024, "learning_rate": 9.806484483557203e-07, "loss": 0.0406, "num_tokens": 57430206.0, "reward": 0.625, "reward_std": 0.5, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5, "step": 2090, "step_time": 25.198974158614874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 133.1875, "completions/mean_terminated_length": 133.1875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2857926934957504, "epoch": 0.09685039370078741, "frac_reward_zero_std": 1.0, "grad_norm": 0.002290284726768732, "kl": 0.0015625466767232865, "learning_rate": 9.806391848077813e-07, "loss": 0.0001, "num_tokens": 57465521.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2091, "step_time": 18.497051488608122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 165.75, "completions/mean_terminated_length": 165.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.2870178297162056, "epoch": 0.0968967114404817, "frac_reward_zero_std": 0.0, "grad_norm": 0.0984499603509903, "kl": 0.001998687395825982, "learning_rate": 9.806299212598424e-07, "loss": -0.0126, "num_tokens": 57486301.0, "reward": 0.8423806428909302, "reward_std": 0.09095054119825363, "rewards/reward_func/mean": 0.8423806428909302, "rewards/reward_func/std": 0.09095054864883423, "step": 2092, "step_time": 16.793691530823708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 185.0625, "completions/mean_terminated_length": 185.0625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.16687441617250443, "epoch": 0.09694302918017601, "frac_reward_zero_std": 1.0, "grad_norm": 0.001002144650556147, "kl": 0.000947712775086984, "learning_rate": 9.806206577119035e-07, "loss": 0.0, "num_tokens": 57508446.0, "reward": 0.9091564416885376, "reward_std": 0.0, "rewards/reward_func/mean": 0.9091564416885376, "rewards/reward_func/std": 0.0, "step": 2093, "step_time": 18.088635966181755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 166.9375, "completions/mean_terminated_length": 166.9375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.338656447827816, "epoch": 0.09698934691987031, "frac_reward_zero_std": 1.0, "grad_norm": 0.003697863081470132, "kl": 0.0021888002811465412, "learning_rate": 9.806113941639648e-07, "loss": 0.0001, "num_tokens": 57530093.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2094, "step_time": 18.40073474869132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 158.125, "completions/mean_terminated_length": 158.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.373967744410038, "epoch": 0.09703566465956462, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010357302380725741, "kl": 0.001299672672757879, "learning_rate": 9.80602130616026e-07, "loss": 0.0001, "num_tokens": 57567359.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2095, "step_time": 19.8071150444448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 157.6875, "completions/mean_terminated_length": 157.6875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.37355881184339523, "epoch": 0.09708198239925891, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012508081272244453, "kl": 0.0016345091571565717, "learning_rate": 9.80592867068087e-07, "loss": 0.0001, "num_tokens": 57601226.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2096, "step_time": 19.480534825474024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 173.4375, "completions/mean_terminated_length": 173.4375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.21006585657596588, "epoch": 0.09712830013895322, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028879523742944, "kl": 0.0015600161132169887, "learning_rate": 9.805836035201482e-07, "loss": 0.0001, "num_tokens": 57622353.0, "reward": 0.39511775970458984, "reward_std": 0.0, "rewards/reward_func/mean": 0.39511775970458984, "rewards/reward_func/std": 0.0, "step": 2097, "step_time": 17.613510336726904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 195.6875, "completions/mean_terminated_length": 195.6875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.4354572370648384, "epoch": 0.09717461787864752, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025400533340871334, "kl": 0.0021850342745892704, "learning_rate": 9.805743399722093e-07, "loss": 0.0001, "num_tokens": 57645788.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2098, "step_time": 20.207767341285944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 180.8125, "completions/mean_terminated_length": 180.8125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.21231921389698982, "epoch": 0.09722093561834183, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016797012649476528, "kl": 0.0012572874256875366, "learning_rate": 9.805650764242705e-07, "loss": 0.0001, "num_tokens": 57675609.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2099, "step_time": 19.69765767455101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 213.625, "completions/mean_terminated_length": 213.625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.48515453189611435, "epoch": 0.09726725335803613, "frac_reward_zero_std": 1.0, "grad_norm": 0.002514895284548402, "kl": 0.002511514292564243, "learning_rate": 9.805558128763316e-07, "loss": 0.0001, "num_tokens": 57704979.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2100, "step_time": 23.956448070704937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 180.75, "completions/mean_terminated_length": 180.75, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.2419266775250435, "epoch": 0.09731357109773044, "frac_reward_zero_std": 1.0, "grad_norm": 0.002808332908898592, "kl": 0.0017441484960727394, "learning_rate": 9.805465493283927e-07, "loss": 0.0001, "num_tokens": 57727503.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 2101, "step_time": 18.251241214573383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 115.875, "completions/mean_terminated_length": 115.875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2840619161725044, "epoch": 0.09735988883742473, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017955612856894732, "kl": 0.0016842714103404433, "learning_rate": 9.805372857804538e-07, "loss": 0.0001, "num_tokens": 57750957.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2102, "step_time": 13.67229737713933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 138.25, "completions/mean_terminated_length": 138.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.32540467381477356, "epoch": 0.09740620657711904, "frac_reward_zero_std": 1.0, "grad_norm": 0.001574921770952642, "kl": 0.0017012866446748376, "learning_rate": 9.80528022232515e-07, "loss": 0.0001, "num_tokens": 57787057.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2103, "step_time": 18.993708673864603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 140.6875, "completions/mean_terminated_length": 140.6875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2592414878308773, "epoch": 0.09745252431681334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021899163257330656, "kl": 0.0015125124482437968, "learning_rate": 9.80518758684576e-07, "loss": 0.0001, "num_tokens": 57809964.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2104, "step_time": 15.311320420354605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 141.4375, "completions/mean_terminated_length": 141.4375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.23829853907227516, "epoch": 0.09749884205650765, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032185029704123735, "kl": 0.0018298894865438342, "learning_rate": 9.805094951366372e-07, "loss": 0.0001, "num_tokens": 57829875.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2105, "step_time": 15.38297138735652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 122.75, "completions/mean_terminated_length": 122.75, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2016875445842743, "epoch": 0.09754515979620194, "frac_reward_zero_std": 1.0, "grad_norm": 0.001584141282364726, "kl": 0.0011248220544075593, "learning_rate": 9.805002315886983e-07, "loss": 0.0001, "num_tokens": 57849327.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2106, "step_time": 13.799412783235312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 231.5625, "completions/mean_terminated_length": 231.5625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.37778548151254654, "epoch": 0.09759147753589625, "frac_reward_zero_std": 0.0, "grad_norm": 0.09862415492534637, "kl": 0.002680773555766791, "learning_rate": 9.804909680407597e-07, "loss": -0.0603, "num_tokens": 57884040.0, "reward": 0.0003452743694651872, "reward_std": 0.00024041820142883807, "rewards/reward_func/mean": 0.0003452743694651872, "rewards/reward_func/std": 0.0002404182159807533, "step": 2107, "step_time": 30.234281316399574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 112.5, "completions/mean_terminated_length": 112.5, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2959787994623184, "epoch": 0.09763779527559055, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025102982763201, "kl": 0.0020582875586114824, "learning_rate": 9.804817044928208e-07, "loss": 0.0001, "num_tokens": 57907264.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2108, "step_time": 13.531257309019566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 154.6875, "completions/mean_terminated_length": 154.6875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.33832044154405594, "epoch": 0.09768411301528486, "frac_reward_zero_std": 1.0, "grad_norm": 0.001770894741639495, "kl": 0.001571035390952602, "learning_rate": 9.804724409448817e-07, "loss": 0.0001, "num_tokens": 57933659.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2109, "step_time": 17.286038760095835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 176.5, "completions/mean_terminated_length": 176.5, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.2194627821445465, "epoch": 0.09773043075497916, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010604043491184711, "kl": 0.0009090398234548047, "learning_rate": 9.80463177396943e-07, "loss": 0.0, "num_tokens": 57986179.0, "reward": 0.17782793939113617, "reward_std": 0.0, "rewards/reward_func/mean": 0.17782793939113617, "rewards/reward_func/std": 0.0, "step": 2110, "step_time": 26.853449895977974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 118.9375, "completions/mean_terminated_length": 118.9375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3218560069799423, "epoch": 0.09777674849467347, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017308102687820792, "kl": 0.0015206353273242712, "learning_rate": 9.804539138490042e-07, "loss": 0.0001, "num_tokens": 58010162.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2111, "step_time": 15.71082091704011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 127.25, "completions/mean_terminated_length": 127.25, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.36409657448530197, "epoch": 0.09782306623436776, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018410587217658758, "kl": 0.001968503464013338, "learning_rate": 9.804446503010653e-07, "loss": 0.0001, "num_tokens": 58038038.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2112, "step_time": 15.209703668951988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 150.3125, "completions/mean_terminated_length": 150.3125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.20162580534815788, "epoch": 0.09786938397406207, "frac_reward_zero_std": 1.0, "grad_norm": 0.001409210846759379, "kl": 0.0011586171895032749, "learning_rate": 9.804353867531264e-07, "loss": 0.0001, "num_tokens": 58063291.0, "reward": 0.9428731203079224, "reward_std": 0.0, "rewards/reward_func/mean": 0.9428731203079224, "rewards/reward_func/std": 0.0, "step": 2113, "step_time": 17.23585532233119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 138.5, "completions/mean_terminated_length": 138.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.31495241820812225, "epoch": 0.09791570171375637, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013494627783074975, "kl": 0.001211692811921239, "learning_rate": 9.804261232051876e-07, "loss": 0.0001, "num_tokens": 58092227.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2114, "step_time": 16.41570270061493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 154.5625, "completions/mean_terminated_length": 154.5625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.375872403383255, "epoch": 0.09796201945345068, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012647317489609122, "kl": 0.0015600319602526724, "learning_rate": 9.804168596572487e-07, "loss": 0.0001, "num_tokens": 58128988.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2115, "step_time": 20.176334507763386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 239.8125, "completions/mean_terminated_length": 239.8125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.2824728675186634, "epoch": 0.09800833719314497, "frac_reward_zero_std": 0.0, "grad_norm": 0.08006158471107483, "kl": 0.002352870360482484, "learning_rate": 9.804075961093098e-07, "loss": -0.0952, "num_tokens": 58163705.0, "reward": 0.625, "reward_std": 0.5, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5, "step": 2116, "step_time": 30.311158139258623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 194.125, "completions/mean_terminated_length": 194.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.4082041159272194, "epoch": 0.09805465493283928, "frac_reward_zero_std": 1.0, "grad_norm": 0.004847372882068157, "kl": 0.003344768425449729, "learning_rate": 9.80398332561371e-07, "loss": 0.0002, "num_tokens": 58185563.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2117, "step_time": 18.672387160360813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 152.875, "completions/mean_terminated_length": 152.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.4239012971520424, "epoch": 0.09810097267253358, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015941817546263337, "kl": 0.0015966749342624098, "learning_rate": 9.80389069013432e-07, "loss": 0.0001, "num_tokens": 58207097.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2118, "step_time": 17.626061864197254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 190.0, "completions/mean_terminated_length": 190.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.26102016121149063, "epoch": 0.09814729041222789, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017980575794354081, "kl": 0.0013494390004780143, "learning_rate": 9.803798054654932e-07, "loss": 0.0001, "num_tokens": 58229593.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2119, "step_time": 18.867047514766455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 112.875, "completions/mean_terminated_length": 112.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3516552895307541, "epoch": 0.09819360815192218, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025518988259136677, "kl": 0.0019067800021730363, "learning_rate": 9.803705419175545e-07, "loss": 0.0001, "num_tokens": 58255303.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2120, "step_time": 14.232797224074602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 118.25, "completions/mean_terminated_length": 118.25, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.23819386214017868, "epoch": 0.0982399258916165, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013898770557716489, "kl": 0.0013417064037639648, "learning_rate": 9.803612783696154e-07, "loss": 0.0001, "num_tokens": 58274619.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2121, "step_time": 13.692626401782036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 175.1875, "completions/mean_terminated_length": 175.1875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.3593531921505928, "epoch": 0.09828624363131079, "frac_reward_zero_std": 0.0, "grad_norm": 0.12910568714141846, "kl": 0.0021692859299946576, "learning_rate": 9.803520148216766e-07, "loss": 0.0109, "num_tokens": 58296686.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.3415650427341461, "step": 2122, "step_time": 19.37053521350026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 176.0625, "completions/mean_terminated_length": 176.0625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.38618531078100204, "epoch": 0.0983325613710051, "frac_reward_zero_std": 1.0, "grad_norm": 0.002144093392416835, "kl": 0.002112426533130929, "learning_rate": 9.803427512737377e-07, "loss": 0.0001, "num_tokens": 58318975.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2123, "step_time": 20.69658763706684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3645559474825859, "epoch": 0.0983788791106994, "frac_reward_zero_std": 0.0, "grad_norm": 0.1052614226937294, "kl": 0.002696045790798962, "learning_rate": 9.80333487725799e-07, "loss": -0.0076, "num_tokens": 58344139.0, "reward": 0.1696605384349823, "reward_std": 0.36474987864494324, "rewards/reward_func/mean": 0.1696605384349823, "rewards/reward_func/std": 0.36474987864494324, "step": 2124, "step_time": 18.370091810822487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 205.5, "completions/mean_terminated_length": 205.5, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.35013314336538315, "epoch": 0.0984251968503937, "frac_reward_zero_std": 0.0, "grad_norm": 0.16796258091926575, "kl": 0.0027245827950537205, "learning_rate": 9.803242241778601e-07, "loss": -0.0635, "num_tokens": 58377411.0, "reward": 0.2525527775287628, "reward_std": 0.28331121802330017, "rewards/reward_func/mean": 0.2525527775287628, "rewards/reward_func/std": 0.28331121802330017, "step": 2125, "step_time": 24.360945247113705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 167.4375, "completions/mean_terminated_length": 167.4375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.39234574884176254, "epoch": 0.098471514590088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012811715714633465, "kl": 0.0018412568606436253, "learning_rate": 9.803149606299213e-07, "loss": 0.0001, "num_tokens": 58427242.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2126, "step_time": 24.655378818511963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 217.25, "completions/mean_terminated_length": 217.25, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.3896579220890999, "epoch": 0.09851783232978231, "frac_reward_zero_std": 1.0, "grad_norm": 0.003342331387102604, "kl": 0.002888473973143846, "learning_rate": 9.803056970819824e-07, "loss": 0.0001, "num_tokens": 58465166.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2127, "step_time": 24.926264192909002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 176.0625, "completions/mean_terminated_length": 176.0625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.36444825679063797, "epoch": 0.09856415006947661, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015871605137363076, "kl": 0.001471631578169763, "learning_rate": 9.802964335340435e-07, "loss": 0.0001, "num_tokens": 58491663.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2128, "step_time": 18.645525492727757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 151.8125, "completions/mean_terminated_length": 151.8125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.40546315163373947, "epoch": 0.09861046780917092, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014846234116703272, "kl": 0.0017653919057920575, "learning_rate": 9.802871699861046e-07, "loss": 0.0001, "num_tokens": 58532572.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2129, "step_time": 20.880179658532143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 119.5, "completions/mean_terminated_length": 119.5, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2928945794701576, "epoch": 0.09865678554886521, "frac_reward_zero_std": 1.0, "grad_norm": 0.001732602366246283, "kl": 0.001616789581021294, "learning_rate": 9.802779064381658e-07, "loss": 0.0001, "num_tokens": 58553332.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2130, "step_time": 14.878775801509619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 197.0, "completions/mean_terminated_length": 197.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.3602956533432007, "epoch": 0.09870310328855952, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027457152027636766, "kl": 0.0020547359599731863, "learning_rate": 9.802686428902269e-07, "loss": 0.0001, "num_tokens": 58575828.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2131, "step_time": 20.551716335117817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 160.8125, "completions/mean_terminated_length": 160.8125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.34010016918182373, "epoch": 0.09874942102825382, "frac_reward_zero_std": 1.0, "grad_norm": 0.005504888948053122, "kl": 0.004122120910324156, "learning_rate": 9.80259379342288e-07, "loss": 0.0002, "num_tokens": 58598737.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2132, "step_time": 20.68515168502927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 149.0, "completions/mean_terminated_length": 149.0, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3857610821723938, "epoch": 0.09879573876794813, "frac_reward_zero_std": 1.0, "grad_norm": 0.002364929998293519, "kl": 0.002093592134770006, "learning_rate": 9.802501157943491e-07, "loss": 0.0001, "num_tokens": 58622017.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2133, "step_time": 16.79001769796014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 286.625, "completions/mean_terminated_length": 286.625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.3068154603242874, "epoch": 0.09884205650764243, "frac_reward_zero_std": 0.0, "grad_norm": 0.07661484181880951, "kl": 0.0024499078281223774, "learning_rate": 9.802408522464103e-07, "loss": -0.1667, "num_tokens": 58648043.0, "reward": 0.554679274559021, "reward_std": 0.4505092203617096, "rewards/reward_func/mean": 0.554679274559021, "rewards/reward_func/std": 0.4505092203617096, "step": 2134, "step_time": 31.21722112223506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 129.875, "completions/mean_terminated_length": 129.875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.28052544593811035, "epoch": 0.09888837424733674, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020063433330506086, "kl": 0.0015915047551970929, "learning_rate": 9.802315886984714e-07, "loss": 0.0001, "num_tokens": 58669193.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2135, "step_time": 13.941501632332802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 167.125, "completions/mean_terminated_length": 167.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.39567285031080246, "epoch": 0.09893469198703103, "frac_reward_zero_std": 1.0, "grad_norm": 0.001788937021046877, "kl": 0.001613630069186911, "learning_rate": 9.802223251505325e-07, "loss": 0.0001, "num_tokens": 58705195.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2136, "step_time": 20.20920692011714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 130.3125, "completions/mean_terminated_length": 130.3125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.23943090438842773, "epoch": 0.09898100972672534, "frac_reward_zero_std": 1.0, "grad_norm": 0.002178144408389926, "kl": 0.0015545105270575732, "learning_rate": 9.802130616025939e-07, "loss": 0.0001, "num_tokens": 58724768.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2137, "step_time": 15.008190114051104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 160.4375, "completions/mean_terminated_length": 160.4375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.26413238793611526, "epoch": 0.09902732746641964, "frac_reward_zero_std": 1.0, "grad_norm": 0.002063291845843196, "kl": 0.0013851181138306856, "learning_rate": 9.80203798054655e-07, "loss": 0.0001, "num_tokens": 58747271.0, "reward": 0.43171051144599915, "reward_std": 0.0, "rewards/reward_func/mean": 0.43171051144599915, "rewards/reward_func/std": 0.0, "step": 2138, "step_time": 16.399781592190266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 164.875, "completions/mean_terminated_length": 164.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.230569027364254, "epoch": 0.09907364520611395, "frac_reward_zero_std": 0.0, "grad_norm": 0.13090035319328308, "kl": 0.002730335865635425, "learning_rate": 9.80194534506716e-07, "loss": 0.0891, "num_tokens": 58768757.0, "reward": 0.4019976258277893, "reward_std": 0.1569238007068634, "rewards/reward_func/mean": 0.4019976258277893, "rewards/reward_func/std": 0.1569238007068634, "step": 2139, "step_time": 19.543237898498774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 143.9375, "completions/mean_terminated_length": 143.9375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3291706144809723, "epoch": 0.09911996294580824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019901259802281857, "kl": 0.0017054046329576522, "learning_rate": 9.801852709587772e-07, "loss": 0.0001, "num_tokens": 58789108.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2140, "step_time": 14.700423073023558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 207.75, "completions/mean_terminated_length": 207.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.41266487538814545, "epoch": 0.09916628068550255, "frac_reward_zero_std": 1.0, "grad_norm": 0.006549373734742403, "kl": 0.0029919970547780395, "learning_rate": 9.801760074108383e-07, "loss": 0.0002, "num_tokens": 58813376.0, "reward": 4.5816336835535765e-11, "reward_std": 1.2519406344946304e-10, "rewards/reward_func/mean": 4.5816336835535765e-11, "rewards/reward_func/std": 1.2519406344946304e-10, "step": 2141, "step_time": 26.165336951613426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 168.5625, "completions/mean_terminated_length": 168.5625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.17884428054094315, "epoch": 0.09921259842519685, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014924416318535805, "kl": 0.0013298338162712753, "learning_rate": 9.801667438628995e-07, "loss": 0.0001, "num_tokens": 58850441.0, "reward": 0.7093939781188965, "reward_std": 0.0, "rewards/reward_func/mean": 0.7093939781188965, "rewards/reward_func/std": 0.0, "step": 2142, "step_time": 20.494751326739788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 137.4375, "completions/mean_terminated_length": 137.4375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.32688628137111664, "epoch": 0.09925891616489116, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018485600594431162, "kl": 0.0015059587894938886, "learning_rate": 9.801574803149606e-07, "loss": 0.0001, "num_tokens": 58873120.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2143, "step_time": 16.893532820045948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 132.5, "completions/mean_terminated_length": 132.5, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2655008025467396, "epoch": 0.09930523390458545, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015479301800951362, "kl": 0.0011959699622821063, "learning_rate": 9.801482167670217e-07, "loss": 0.0001, "num_tokens": 58895320.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2144, "step_time": 14.535428643226624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 201.8125, "completions/mean_terminated_length": 201.8125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.419305719435215, "epoch": 0.09935155164427976, "frac_reward_zero_std": 0.0, "grad_norm": 0.10994252562522888, "kl": 0.0047153522609733045, "learning_rate": 9.801389532190828e-07, "loss": -0.0247, "num_tokens": 58920725.0, "reward": 0.4375, "reward_std": 0.5123475193977356, "rewards/reward_func/mean": 0.4375, "rewards/reward_func/std": 0.5123475790023804, "step": 2145, "step_time": 21.881717685610056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 141.4375, "completions/mean_terminated_length": 141.4375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.21117480844259262, "epoch": 0.09939786938397406, "frac_reward_zero_std": 1.0, "grad_norm": 0.002725240308791399, "kl": 0.0016288287588395178, "learning_rate": 9.80129689671144e-07, "loss": 0.0001, "num_tokens": 58943564.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2146, "step_time": 16.320057708770037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 123.5625, "completions/mean_terminated_length": 123.5625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.29132601618766785, "epoch": 0.09944418712366837, "frac_reward_zero_std": 1.0, "grad_norm": 0.0051228683441877365, "kl": 0.001822701218770817, "learning_rate": 9.80120426123205e-07, "loss": 0.0001, "num_tokens": 58969445.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2147, "step_time": 15.702005561441183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 145.6875, "completions/mean_terminated_length": 145.6875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.3887370899319649, "epoch": 0.09949050486336267, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014260419411584735, "kl": 0.0014701895706821233, "learning_rate": 9.801111625752662e-07, "loss": 0.0001, "num_tokens": 59001024.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2148, "step_time": 17.672618754208088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 200.4375, "completions/mean_terminated_length": 200.4375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.21593355014920235, "epoch": 0.09953682260305698, "frac_reward_zero_std": 0.0, "grad_norm": 0.1311170905828476, "kl": 0.006177579518407583, "learning_rate": 9.801018990273273e-07, "loss": -0.1122, "num_tokens": 59028871.0, "reward": 0.7609108090400696, "reward_std": 0.24304427206516266, "rewards/reward_func/mean": 0.7609108090400696, "rewards/reward_func/std": 0.24304430186748505, "step": 2149, "step_time": 22.516092840582132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 125.625, "completions/mean_terminated_length": 125.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3041015565395355, "epoch": 0.09958314034275127, "frac_reward_zero_std": 1.0, "grad_norm": 0.008619067259132862, "kl": 0.0031587775447405875, "learning_rate": 9.800926354793887e-07, "loss": 0.0002, "num_tokens": 59050113.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2150, "step_time": 14.643218837678432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 178.4375, "completions/mean_terminated_length": 178.4375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.22522944957017899, "epoch": 0.09962945808244558, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015865567838773131, "kl": 0.0014014854386914521, "learning_rate": 9.800833719314498e-07, "loss": 0.0001, "num_tokens": 59072808.0, "reward": 0.4274149239063263, "reward_std": 0.0, "rewards/reward_func/mean": 0.4274149239063263, "rewards/reward_func/std": 0.0, "step": 2151, "step_time": 17.842246294021606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 194.875, "completions/mean_terminated_length": 194.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.3555251806974411, "epoch": 0.09967577582213988, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021308145951479673, "kl": 0.0017405890685040504, "learning_rate": 9.800741083835107e-07, "loss": 0.0001, "num_tokens": 59112998.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2152, "step_time": 24.436119481921196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 145.0625, "completions/mean_terminated_length": 145.0625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.4170515313744545, "epoch": 0.09972209356183419, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023248870857059956, "kl": 0.0018290946027264, "learning_rate": 9.800648448355718e-07, "loss": 0.0001, "num_tokens": 59146151.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2153, "step_time": 18.82840597257018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 135.9375, "completions/mean_terminated_length": 135.9375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.15384923294186592, "epoch": 0.09976841130152848, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012263598619028926, "kl": 0.0007851967820897698, "learning_rate": 9.800555812876332e-07, "loss": 0.0, "num_tokens": 59180150.0, "reward": 0.3678794503211975, "reward_std": 0.0, "rewards/reward_func/mean": 0.3678794503211975, "rewards/reward_func/std": 0.0, "step": 2154, "step_time": 17.61545640602708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 177.75, "completions/mean_terminated_length": 177.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.33582911640405655, "epoch": 0.0998147290412228, "frac_reward_zero_std": 0.0, "grad_norm": 0.4987824261188507, "kl": 0.021140100667253137, "learning_rate": 9.800463177396943e-07, "loss": 0.0232, "num_tokens": 59203602.0, "reward": 0.5846918821334839, "reward_std": 0.46775349974632263, "rewards/reward_func/mean": 0.5846918821334839, "rewards/reward_func/std": 0.46775349974632263, "step": 2155, "step_time": 18.694602459669113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 194.625, "completions/mean_terminated_length": 194.625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.28729238361120224, "epoch": 0.09986104678091709, "frac_reward_zero_std": 1.0, "grad_norm": 0.002293812809512019, "kl": 0.0019049131078645587, "learning_rate": 9.800370541917554e-07, "loss": 0.0001, "num_tokens": 59224876.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2156, "step_time": 20.50570261478424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 196.6875, "completions/mean_terminated_length": 196.6875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.19378679618239403, "epoch": 0.0999073645206114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017660867888480425, "kl": 0.0011897815275005996, "learning_rate": 9.800277906438166e-07, "loss": 0.0001, "num_tokens": 59248519.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2157, "step_time": 19.057425145059824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 98.5, "completions/mean_terminated_length": 98.5, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2795220613479614, "epoch": 0.0999536822603057, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019364472245797515, "kl": 0.0016187586006708443, "learning_rate": 9.800185270958777e-07, "loss": 0.0001, "num_tokens": 59268815.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2158, "step_time": 11.523467421531677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 360.75, "completions/mean_terminated_length": 360.75, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "entropy": 0.19120676815509796, "epoch": 0.1, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012357404921203852, "kl": 0.0008733440190553665, "learning_rate": 9.800092635479388e-07, "loss": 0.0, "num_tokens": 59297211.0, "reward": 0.9834010601043701, "reward_std": 0.0, "rewards/reward_func/mean": 0.9834010601043701, "rewards/reward_func/std": 0.0, "step": 2159, "step_time": 34.786151614040136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 165.6875, "completions/mean_terminated_length": 165.6875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.33776474744081497, "epoch": 0.1000463177396943, "frac_reward_zero_std": 1.0, "grad_norm": 0.00268212310038507, "kl": 0.0019901118939742446, "learning_rate": 9.8e-07, "loss": 0.0001, "num_tokens": 59317766.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2160, "step_time": 16.260948821902275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 169.25, "completions/mean_terminated_length": 169.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3459314554929733, "epoch": 0.10009263547938861, "frac_reward_zero_std": 1.0, "grad_norm": 0.001920197973959148, "kl": 0.0013784721959382296, "learning_rate": 9.79990736452061e-07, "loss": 0.0001, "num_tokens": 59345258.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2161, "step_time": 18.700891856104136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 283.5, "completions/mean_terminated_length": 283.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.4073880463838577, "epoch": 0.10013895321908291, "frac_reward_zero_std": 0.0, "grad_norm": 0.06136735528707504, "kl": 0.0022889088722877204, "learning_rate": 9.799814729041222e-07, "loss": 0.1389, "num_tokens": 59374130.0, "reward": 0.6875, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.6875, "rewards/reward_func/std": 0.4787135720252991, "step": 2162, "step_time": 36.16574815660715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 163.0625, "completions/mean_terminated_length": 163.0625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.35263369232416153, "epoch": 0.10018527095877722, "frac_reward_zero_std": 1.0, "grad_norm": 0.003628405509516597, "kl": 0.002199364302214235, "learning_rate": 9.799722093561833e-07, "loss": 0.0001, "num_tokens": 59394419.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2163, "step_time": 19.95395029336214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 222.625, "completions/mean_terminated_length": 222.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.44233647733926773, "epoch": 0.10023158869847151, "frac_reward_zero_std": 1.0, "grad_norm": 0.004055425524711609, "kl": 0.0034508295357227325, "learning_rate": 9.799629458082446e-07, "loss": 0.0002, "num_tokens": 59428541.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2164, "step_time": 26.92737577110529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 188.375, "completions/mean_terminated_length": 188.375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3089473396539688, "epoch": 0.10027790643816582, "frac_reward_zero_std": 0.0, "grad_norm": 0.0905863493680954, "kl": 0.0030467250617220998, "learning_rate": 9.799536822603056e-07, "loss": 0.1127, "num_tokens": 59453907.0, "reward": 0.2638552188873291, "reward_std": 0.21108420193195343, "rewards/reward_func/mean": 0.2638552188873291, "rewards/reward_func/std": 0.21108418703079224, "step": 2165, "step_time": 21.577270343899727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 169.5625, "completions/mean_terminated_length": 169.5625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.38580887019634247, "epoch": 0.10032422417786012, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025594322942197323, "kl": 0.0018513394461479038, "learning_rate": 9.799444187123667e-07, "loss": 0.0001, "num_tokens": 59485260.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2166, "step_time": 22.639272842556238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 159.9375, "completions/mean_terminated_length": 159.9375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.1861037164926529, "epoch": 0.10037054191755443, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021256774198263884, "kl": 0.0013731769286096096, "learning_rate": 9.79935155164428e-07, "loss": 0.0001, "num_tokens": 59505931.0, "reward": 0.9394130706787109, "reward_std": 0.0, "rewards/reward_func/mean": 0.9394130706787109, "rewards/reward_func/std": 0.0, "step": 2167, "step_time": 18.174105010926723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 179.1875, "completions/mean_terminated_length": 179.1875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.4112344831228256, "epoch": 0.10041685965724872, "frac_reward_zero_std": 1.0, "grad_norm": 0.011156733147799969, "kl": 0.005695650819689035, "learning_rate": 9.799258916164891e-07, "loss": 0.0003, "num_tokens": 59543166.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2168, "step_time": 23.43252919241786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 166.5, "completions/mean_terminated_length": 166.5, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.317965492606163, "epoch": 0.10046317739694303, "frac_reward_zero_std": 0.0, "grad_norm": 0.10522262752056122, "kl": 0.0018635388696566224, "learning_rate": 9.799166280685503e-07, "loss": -0.035, "num_tokens": 59570502.0, "reward": 0.6956884860992432, "reward_std": 0.41583725810050964, "rewards/reward_func/mean": 0.6956884860992432, "rewards/reward_func/std": 0.41583725810050964, "step": 2169, "step_time": 20.067191254347563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 134.375, "completions/mean_terminated_length": 134.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2777117192745209, "epoch": 0.10050949513663733, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017494413768872619, "kl": 0.0015478904824703932, "learning_rate": 9.799073645206114e-07, "loss": 0.0001, "num_tokens": 59599740.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2170, "step_time": 16.83437930420041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 321.625, "completions/mean_terminated_length": 321.625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "entropy": 0.23447756096720695, "epoch": 0.10055581287633164, "frac_reward_zero_std": 0.0, "grad_norm": 0.06996208429336548, "kl": 0.002022105356445536, "learning_rate": 9.798981009726725e-07, "loss": -0.0823, "num_tokens": 59641062.0, "reward": 0.6876200437545776, "reward_std": 0.2851105332374573, "rewards/reward_func/mean": 0.6876200437545776, "rewards/reward_func/std": 0.2851105332374573, "step": 2171, "step_time": 34.55266473069787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 128.875, "completions/mean_terminated_length": 128.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2668868377804756, "epoch": 0.10060213061602594, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011811316944658756, "kl": 0.0010578126821201295, "learning_rate": 9.798888374247336e-07, "loss": 0.0001, "num_tokens": 59662116.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2172, "step_time": 14.070418328046799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 173.4375, "completions/mean_terminated_length": 173.4375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.39442313462495804, "epoch": 0.10064844835572025, "frac_reward_zero_std": 1.0, "grad_norm": 0.001506642671301961, "kl": 0.0018411995843052864, "learning_rate": 9.798795738767948e-07, "loss": 0.0001, "num_tokens": 59697579.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2173, "step_time": 20.27152444422245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 157.5, "completions/mean_terminated_length": 157.5, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3756882771849632, "epoch": 0.10069476609541454, "frac_reward_zero_std": 1.0, "grad_norm": 0.01110369898378849, "kl": 0.003480234125163406, "learning_rate": 9.79870310328856e-07, "loss": 0.0002, "num_tokens": 59751123.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2174, "step_time": 25.346865888684988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 194.75, "completions/mean_terminated_length": 194.75, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.23016562312841415, "epoch": 0.10074108383510885, "frac_reward_zero_std": 0.0, "grad_norm": 0.10656745731830597, "kl": 0.002969473891425878, "learning_rate": 9.79861046780917e-07, "loss": -0.0385, "num_tokens": 59772895.0, "reward": 0.9022549390792847, "reward_std": 0.2546529173851013, "rewards/reward_func/mean": 0.9022549390792847, "rewards/reward_func/std": 0.2546529173851013, "step": 2175, "step_time": 18.87780975922942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 195.4375, "completions/mean_terminated_length": 195.4375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.19872033596038818, "epoch": 0.10078740157480315, "frac_reward_zero_std": 1.0, "grad_norm": 0.005258552264422178, "kl": 0.003601659002015367, "learning_rate": 9.798517832329781e-07, "loss": 0.0002, "num_tokens": 59797558.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2176, "step_time": 19.6141157746315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 161.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.32640138268470764, "epoch": 0.10083371931449746, "frac_reward_zero_std": 1.0, "grad_norm": 0.005216421093791723, "kl": 0.0028714233194477856, "learning_rate": 9.798425196850393e-07, "loss": 0.0001, "num_tokens": 59818170.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2177, "step_time": 16.826649986207485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 120.25, "completions/mean_terminated_length": 120.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.28045129776000977, "epoch": 0.10088003705419175, "frac_reward_zero_std": 1.0, "grad_norm": 0.001770458067767322, "kl": 0.0015245918766595423, "learning_rate": 9.798332561371004e-07, "loss": 0.0001, "num_tokens": 59839822.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2178, "step_time": 14.084755450487137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 141.5, "completions/mean_terminated_length": 141.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3563677594065666, "epoch": 0.10092635479388606, "frac_reward_zero_std": 1.0, "grad_norm": 0.001462252694182098, "kl": 0.0015968038060236722, "learning_rate": 9.798239925891615e-07, "loss": 0.0001, "num_tokens": 59875830.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2179, "step_time": 18.74991489201784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 140.5, "completions/mean_terminated_length": 140.5, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.22664383053779602, "epoch": 0.10097267253358036, "frac_reward_zero_std": 1.0, "grad_norm": 0.006523195654153824, "kl": 0.0024224047956522554, "learning_rate": 9.798147290412229e-07, "loss": 0.0001, "num_tokens": 59895566.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2180, "step_time": 15.39057507738471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 136.6875, "completions/mean_terminated_length": 136.6875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.4017426148056984, "epoch": 0.10101899027327467, "frac_reward_zero_std": 1.0, "grad_norm": 0.003585459664463997, "kl": 0.002988903783261776, "learning_rate": 9.79805465493284e-07, "loss": 0.0001, "num_tokens": 59948393.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2181, "step_time": 23.900904923677444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 170.625, "completions/mean_terminated_length": 170.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.4459299221634865, "epoch": 0.10106530801296897, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018644469091668725, "kl": 0.001935254200361669, "learning_rate": 9.79796201945345e-07, "loss": 0.0001, "num_tokens": 59986643.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2182, "step_time": 23.580065827816725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 183.625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3162192851305008, "epoch": 0.10111162575266328, "frac_reward_zero_std": 0.0, "grad_norm": 0.09607032686471939, "kl": 0.002582930203061551, "learning_rate": 9.79786938397406e-07, "loss": -0.0505, "num_tokens": 60009949.0, "reward": 0.7515207529067993, "reward_std": 0.37357062101364136, "rewards/reward_func/mean": 0.7515207529067993, "rewards/reward_func/std": 0.37357065081596375, "step": 2183, "step_time": 19.198546521365643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 250.5, "completions/mean_terminated_length": 250.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.3329693451523781, "epoch": 0.10115794349235757, "frac_reward_zero_std": 0.0, "grad_norm": 0.10372719913721085, "kl": 0.0024032650981098413, "learning_rate": 9.797776748494674e-07, "loss": -0.0148, "num_tokens": 60044453.0, "reward": 0.9002240896224976, "reward_std": 0.05049748346209526, "rewards/reward_func/mean": 0.9002240896224976, "rewards/reward_func/std": 0.050497494637966156, "step": 2184, "step_time": 28.069841776043177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 121.5, "completions/mean_terminated_length": 121.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2763189375400543, "epoch": 0.10120426123205188, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017113815993070602, "kl": 0.0017008509603329003, "learning_rate": 9.797684113015285e-07, "loss": 0.0001, "num_tokens": 60064269.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2185, "step_time": 13.093166135251522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 130.625, "completions/mean_terminated_length": 130.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.29085466265678406, "epoch": 0.10125057897174618, "frac_reward_zero_std": 1.0, "grad_norm": 0.00194756337441504, "kl": 0.0017149012710433453, "learning_rate": 9.797591477535896e-07, "loss": 0.0001, "num_tokens": 60085303.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2186, "step_time": 14.907429192215204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 139.875, "completions/mean_terminated_length": 139.875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.28259269148111343, "epoch": 0.10129689671144049, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034371220972388983, "kl": 0.0021036481484770775, "learning_rate": 9.797498842056507e-07, "loss": 0.0001, "num_tokens": 60110469.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2187, "step_time": 15.739911269396544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 172.0, "completions/mean_terminated_length": 172.0, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.42096367478370667, "epoch": 0.10134321445113478, "frac_reward_zero_std": 1.0, "grad_norm": 0.002669628243893385, "kl": 0.002495112828910351, "learning_rate": 9.797406206577119e-07, "loss": 0.0001, "num_tokens": 60162709.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2188, "step_time": 25.90372997522354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 184.125, "completions/mean_terminated_length": 184.125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.20531976968050003, "epoch": 0.1013895321908291, "frac_reward_zero_std": 0.0, "grad_norm": 0.14281108975410461, "kl": 0.002182638010708615, "learning_rate": 9.79731357109773e-07, "loss": -0.0356, "num_tokens": 60197575.0, "reward": 0.8571484088897705, "reward_std": 0.22958418726921082, "rewards/reward_func/mean": 0.8571484088897705, "rewards/reward_func/std": 0.22958418726921082, "step": 2189, "step_time": 25.103964366018772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 167.5, "completions/mean_terminated_length": 167.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3038442134857178, "epoch": 0.10143584993052339, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028399541042745113, "kl": 0.0025349673815071583, "learning_rate": 9.79722093561834e-07, "loss": 0.0001, "num_tokens": 60235167.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2190, "step_time": 24.349001679569483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 136.25, "completions/mean_terminated_length": 136.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.32105864584445953, "epoch": 0.1014821676702177, "frac_reward_zero_std": 1.0, "grad_norm": 0.001613329048268497, "kl": 0.0014620284782722592, "learning_rate": 9.797128300138952e-07, "loss": 0.0001, "num_tokens": 60263635.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2191, "step_time": 16.700292360037565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 230.4375, "completions/mean_terminated_length": 230.4375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.2706785500049591, "epoch": 0.101528485409912, "frac_reward_zero_std": 0.0, "grad_norm": 0.11339399218559265, "kl": 0.002971329726278782, "learning_rate": 9.797035664659564e-07, "loss": -0.0437, "num_tokens": 60290618.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 2192, "step_time": 24.38199918717146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 127.4375, "completions/mean_terminated_length": 127.4375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.25459761917591095, "epoch": 0.1015748031496063, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025886453222483397, "kl": 0.0016027696547098458, "learning_rate": 9.796943029180175e-07, "loss": 0.0001, "num_tokens": 60310017.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2193, "step_time": 13.862449113279581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 207.9375, "completions/mean_terminated_length": 207.9375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.3888969272375107, "epoch": 0.1016211208893006, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028763385489583015, "kl": 0.0023363720101770014, "learning_rate": 9.796850393700788e-07, "loss": 0.0001, "num_tokens": 60336672.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2194, "step_time": 21.99916561320424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 226.5625, "completions/mean_terminated_length": 226.5625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.3594680428504944, "epoch": 0.10166743862899491, "frac_reward_zero_std": 0.0, "grad_norm": 0.08128580451011658, "kl": 0.0030222826171666384, "learning_rate": 9.796757758221397e-07, "loss": -0.1862, "num_tokens": 60375545.0, "reward": 0.17653314769268036, "reward_std": 0.37953487038612366, "rewards/reward_func/mean": 0.17653314769268036, "rewards/reward_func/std": 0.37953487038612366, "step": 2195, "step_time": 33.69244493171573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 239.375, "completions/mean_terminated_length": 239.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.23426999151706696, "epoch": 0.1017137563686892, "frac_reward_zero_std": 0.0, "grad_norm": 0.07784437388181686, "kl": 0.0045803755056113005, "learning_rate": 9.796665122742009e-07, "loss": -0.0354, "num_tokens": 60413903.0, "reward": 0.7629547715187073, "reward_std": 0.31606027483940125, "rewards/reward_func/mean": 0.7629547715187073, "rewards/reward_func/std": 0.31606027483940125, "step": 2196, "step_time": 27.606477454304695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 144.5, "completions/mean_terminated_length": 144.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.1536317691206932, "epoch": 0.10176007410838352, "frac_reward_zero_std": 0.0, "grad_norm": 0.10756503790616989, "kl": 0.012938304804265499, "learning_rate": 9.796572487262622e-07, "loss": -0.0196, "num_tokens": 60436695.0, "reward": 0.9304066896438599, "reward_std": 0.19016513228416443, "rewards/reward_func/mean": 0.9304066896438599, "rewards/reward_func/std": 0.19016513228416443, "step": 2197, "step_time": 15.513894945383072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 232.125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.3011670559644699, "epoch": 0.10180639184807781, "frac_reward_zero_std": 0.0, "grad_norm": 0.08634068071842194, "kl": 0.004210830433294177, "learning_rate": 9.796479851783233e-07, "loss": 0.0109, "num_tokens": 60465065.0, "reward": 0.4947678744792938, "reward_std": 0.13267172873020172, "rewards/reward_func/mean": 0.4947678744792938, "rewards/reward_func/std": 0.13267171382904053, "step": 2198, "step_time": 25.829206820577383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 144.125, "completions/mean_terminated_length": 144.125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3724542334675789, "epoch": 0.10185270958777212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014387345872819424, "kl": 0.0015553278208244592, "learning_rate": 9.796387216303844e-07, "loss": 0.0001, "num_tokens": 60490875.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2199, "step_time": 16.612209875136614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 206.9375, "completions/mean_terminated_length": 206.9375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.275962233543396, "epoch": 0.10189902732746642, "frac_reward_zero_std": 0.0, "grad_norm": 0.0932672768831253, "kl": 0.005184296751394868, "learning_rate": 9.796294580824456e-07, "loss": -0.0711, "num_tokens": 60514410.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.44721361994743347, "step": 2200, "step_time": 22.264834202826023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 215.5625, "completions/mean_terminated_length": 215.5625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.27668239921331406, "epoch": 0.10194534506716073, "frac_reward_zero_std": 0.0, "grad_norm": 0.12937963008880615, "kl": 0.00448174070334062, "learning_rate": 9.796201945345067e-07, "loss": -0.1306, "num_tokens": 60550947.0, "reward": 0.5384228229522705, "reward_std": 0.21965906023979187, "rewards/reward_func/mean": 0.5384228229522705, "rewards/reward_func/std": 0.21965906023979187, "step": 2201, "step_time": 29.706990282982588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 178.875, "completions/mean_terminated_length": 178.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3306841403245926, "epoch": 0.10199166280685502, "frac_reward_zero_std": 1.0, "grad_norm": 0.006056199781596661, "kl": 0.003290728200227022, "learning_rate": 9.796109309865678e-07, "loss": 0.0002, "num_tokens": 60594753.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2202, "step_time": 24.14627345278859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.36340101063251495, "epoch": 0.10203798054654933, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016917209140956402, "kl": 0.001690033939667046, "learning_rate": 9.79601667438629e-07, "loss": 0.0001, "num_tokens": 60616251.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2203, "step_time": 19.759938970208168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 252.1875, "completions/mean_terminated_length": 252.1875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.3854904919862747, "epoch": 0.10208429828624363, "frac_reward_zero_std": 0.0, "grad_norm": 0.07628150284290314, "kl": 0.0016758107813075185, "learning_rate": 9.7959240389069e-07, "loss": 0.0077, "num_tokens": 60656286.0, "reward": 0.7000828385353088, "reward_std": 0.005255452822893858, "rewards/reward_func/mean": 0.7000828385353088, "rewards/reward_func/std": 0.005255445837974548, "step": 2204, "step_time": 28.58991450443864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 146.0625, "completions/mean_terminated_length": 146.0625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2740960121154785, "epoch": 0.10213061602593794, "frac_reward_zero_std": 1.0, "grad_norm": 0.003894086927175522, "kl": 0.0018835398368537426, "learning_rate": 9.795831403427512e-07, "loss": 0.0001, "num_tokens": 60679791.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2205, "step_time": 17.592174660414457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 113.25, "completions/mean_terminated_length": 113.25, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.23828605934977531, "epoch": 0.10217693376563224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019345965702086687, "kl": 0.0015995536523405463, "learning_rate": 9.795738767948123e-07, "loss": 0.0001, "num_tokens": 60699443.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2206, "step_time": 13.819919727742672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 183.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3812277764081955, "epoch": 0.10222325150532655, "frac_reward_zero_std": 0.0, "grad_norm": 0.10260231047868729, "kl": 0.002770874183624983, "learning_rate": 9.795646132468737e-07, "loss": 0.0356, "num_tokens": 60721533.0, "reward": 0.7803013324737549, "reward_std": 0.23197969794273376, "rewards/reward_func/mean": 0.7803013324737549, "rewards/reward_func/std": 0.23197971284389496, "step": 2207, "step_time": 19.418404404073954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 245.8125, "completions/mean_terminated_length": 245.8125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.37642689049243927, "epoch": 0.10226956924502084, "frac_reward_zero_std": 0.0, "grad_norm": 0.09306300431489944, "kl": 0.002558030537329614, "learning_rate": 9.795553496989346e-07, "loss": 0.1388, "num_tokens": 60745098.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 2208, "step_time": 29.97770418971777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 132.5, "completions/mean_terminated_length": 132.5, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.20292287319898605, "epoch": 0.10231588698471515, "frac_reward_zero_std": 1.0, "grad_norm": 0.004116197116672993, "kl": 0.0016079325578175485, "learning_rate": 9.795460861509957e-07, "loss": 0.0001, "num_tokens": 60764626.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2209, "step_time": 15.321424350142479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 171.625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.1554102674126625, "epoch": 0.10236220472440945, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011155969696119428, "kl": 0.0008762629440752789, "learning_rate": 9.79536822603057e-07, "loss": 0.0, "num_tokens": 60787372.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 2210, "step_time": 17.346901450306177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 116.125, "completions/mean_terminated_length": 116.125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.23476694524288177, "epoch": 0.10240852246410376, "frac_reward_zero_std": 1.0, "grad_norm": 0.002404983853921294, "kl": 0.0013003416825085878, "learning_rate": 9.795275590551182e-07, "loss": 0.0001, "num_tokens": 60807022.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2211, "step_time": 13.673759322613478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 286.25, "completions/mean_terminated_length": 286.25, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.24284956231713295, "epoch": 0.10245484020379805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021608276292681694, "kl": 0.001618764887098223, "learning_rate": 9.795182955071793e-07, "loss": 0.0001, "num_tokens": 60835042.0, "reward": 0.9368637800216675, "reward_std": 0.0, "rewards/reward_func/mean": 0.9368637800216675, "rewards/reward_func/std": 0.0, "step": 2212, "step_time": 30.086726807057858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 120.625, "completions/mean_terminated_length": 120.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.24929409474134445, "epoch": 0.10250115794349236, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024518666323274374, "kl": 0.0014328828256111592, "learning_rate": 9.795090319592404e-07, "loss": 0.0001, "num_tokens": 60856780.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2213, "step_time": 13.360966384410858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 168.25, "completions/mean_terminated_length": 168.25, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.38240545988082886, "epoch": 0.10254747568318666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022746773902326822, "kl": 0.002078428486129269, "learning_rate": 9.794997684113015e-07, "loss": 0.0001, "num_tokens": 60906128.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2214, "step_time": 23.992529205977917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 153.875, "completions/mean_terminated_length": 153.875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.1575748734176159, "epoch": 0.10259379342288097, "frac_reward_zero_std": 1.0, "grad_norm": 0.004232973325997591, "kl": 0.0023800735943950713, "learning_rate": 9.794905048633627e-07, "loss": 0.0001, "num_tokens": 60942366.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 2215, "step_time": 20.61464837566018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 105.375, "completions/mean_terminated_length": 105.375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.31054776161909103, "epoch": 0.10264011116257526, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016936726169660687, "kl": 0.001551642024423927, "learning_rate": 9.794812413154238e-07, "loss": 0.0001, "num_tokens": 60962292.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2216, "step_time": 13.38803456351161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 146.625, "completions/mean_terminated_length": 146.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3204849287867546, "epoch": 0.10268642890226957, "frac_reward_zero_std": 1.0, "grad_norm": 0.010563879273831844, "kl": 0.004847561183851212, "learning_rate": 9.79471977767485e-07, "loss": 0.0002, "num_tokens": 60982606.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2217, "step_time": 15.205908689647913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 205.0625, "completions/mean_terminated_length": 205.0625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.27916259318590164, "epoch": 0.10273274664196387, "frac_reward_zero_std": 1.0, "grad_norm": 0.000970926892478019, "kl": 0.0012351367040537298, "learning_rate": 9.79462714219546e-07, "loss": 0.0001, "num_tokens": 61026831.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2218, "step_time": 26.449595969170332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 121.8125, "completions/mean_terminated_length": 121.8125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.27991413325071335, "epoch": 0.10277906438165818, "frac_reward_zero_std": 1.0, "grad_norm": 0.008126037195324898, "kl": 0.003011845867149532, "learning_rate": 9.794534506716072e-07, "loss": 0.0001, "num_tokens": 61056252.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2219, "step_time": 16.91433237120509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 189.4375, "completions/mean_terminated_length": 189.4375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.4102216511964798, "epoch": 0.10282538212135248, "frac_reward_zero_std": 0.0, "grad_norm": 0.11200933903455734, "kl": 0.005165300099179149, "learning_rate": 9.794441871236683e-07, "loss": -0.052, "num_tokens": 61082083.0, "reward": 0.27963730692863464, "reward_std": 0.42837172746658325, "rewards/reward_func/mean": 0.27963730692863464, "rewards/reward_func/std": 0.42837172746658325, "step": 2220, "step_time": 24.32019878551364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 167.625, "completions/mean_terminated_length": 167.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.35585759580135345, "epoch": 0.10287169986104679, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016712772194296122, "kl": 0.0015667208936065435, "learning_rate": 9.794349235757294e-07, "loss": 0.0001, "num_tokens": 61116269.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2221, "step_time": 20.54244640469551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 122.5625, "completions/mean_terminated_length": 122.5625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.29270917922258377, "epoch": 0.10291801760074108, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013240029802545905, "kl": 0.0016460044425912201, "learning_rate": 9.794256600277905e-07, "loss": 0.0001, "num_tokens": 61136070.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2222, "step_time": 13.899787869304419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 172.625, "completions/mean_terminated_length": 172.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.23054639250040054, "epoch": 0.10296433534043539, "frac_reward_zero_std": 0.0, "grad_norm": 0.15053316950798035, "kl": 0.002577450970420614, "learning_rate": 9.794163964798516e-07, "loss": -0.0639, "num_tokens": 61163856.0, "reward": 0.909626841545105, "reward_std": 0.02409949339926243, "rewards/reward_func/mean": 0.909626841545105, "rewards/reward_func/std": 0.024099500849843025, "step": 2223, "step_time": 20.294577702879906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 174.1875, "completions/mean_terminated_length": 174.1875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.40985822677612305, "epoch": 0.10301065308012969, "frac_reward_zero_std": 1.0, "grad_norm": 0.002675432711839676, "kl": 0.0021192085405346006, "learning_rate": 9.79407132931913e-07, "loss": 0.0001, "num_tokens": 61197347.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2224, "step_time": 21.854994174093008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 236.6875, "completions/mean_terminated_length": 236.6875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.2156229093670845, "epoch": 0.103056970819824, "frac_reward_zero_std": 0.0, "grad_norm": 0.06497209519147873, "kl": 0.0016112701559904963, "learning_rate": 9.793978693839741e-07, "loss": -0.0143, "num_tokens": 61236030.0, "reward": 0.4116564095020294, "reward_std": 0.15257705748081207, "rewards/reward_func/mean": 0.4116564095020294, "rewards/reward_func/std": 0.15257705748081207, "step": 2225, "step_time": 26.80081870406866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 165.375, "completions/mean_terminated_length": 165.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.403880774974823, "epoch": 0.1031032885595183, "frac_reward_zero_std": 1.0, "grad_norm": 0.007626906502991915, "kl": 0.004239951784256846, "learning_rate": 9.79388605836035e-07, "loss": 0.0002, "num_tokens": 61284100.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2226, "step_time": 24.646736599504948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 179.375, "completions/mean_terminated_length": 179.375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3995928168296814, "epoch": 0.1031496062992126, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026067052967846394, "kl": 0.0024176547303795815, "learning_rate": 9.793793422880964e-07, "loss": 0.0001, "num_tokens": 61305754.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2227, "step_time": 19.135168179869652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 115.125, "completions/mean_terminated_length": 115.125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.299918569624424, "epoch": 0.1031959240389069, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026245375629514456, "kl": 0.0017001339292619377, "learning_rate": 9.793700787401575e-07, "loss": 0.0001, "num_tokens": 61327084.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2228, "step_time": 13.109154216945171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 132.625, "completions/mean_terminated_length": 132.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2734079398214817, "epoch": 0.10324224177860121, "frac_reward_zero_std": 1.0, "grad_norm": 0.00537524838000536, "kl": 0.002722400415223092, "learning_rate": 9.793608151922186e-07, "loss": 0.0001, "num_tokens": 61347478.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2229, "step_time": 14.749796010553837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 142.1875, "completions/mean_terminated_length": 142.1875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3564213216304779, "epoch": 0.1032885595182955, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014428168069571257, "kl": 0.0014983734581619501, "learning_rate": 9.793515516442797e-07, "loss": 0.0001, "num_tokens": 61379161.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2230, "step_time": 18.56022620201111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 247.5625, "completions/mean_terminated_length": 247.5625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.19931424036622047, "epoch": 0.10333487725798982, "frac_reward_zero_std": 0.0, "grad_norm": 0.09483068436384201, "kl": 0.002567407733295113, "learning_rate": 9.793422880963409e-07, "loss": -0.0195, "num_tokens": 61414754.0, "reward": 0.18062886595726013, "reward_std": 0.05885840579867363, "rewards/reward_func/mean": 0.18062886595726013, "rewards/reward_func/std": 0.05885840579867363, "step": 2231, "step_time": 26.85329046472907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.2816247157752514, "epoch": 0.10338119499768411, "frac_reward_zero_std": 0.0, "grad_norm": 0.11701363325119019, "kl": 0.004782899166457355, "learning_rate": 9.79333024548402e-07, "loss": -0.0859, "num_tokens": 61439506.0, "reward": 0.5287646055221558, "reward_std": 0.4860803186893463, "rewards/reward_func/mean": 0.5287646055221558, "rewards/reward_func/std": 0.4860803186893463, "step": 2232, "step_time": 23.685327105224133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 118.4375, "completions/mean_terminated_length": 118.4375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2728803679347038, "epoch": 0.10342751273737842, "frac_reward_zero_std": 1.0, "grad_norm": 0.003786456538364291, "kl": 0.0021344450942706317, "learning_rate": 9.793237610004631e-07, "loss": 0.0001, "num_tokens": 61460505.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2233, "step_time": 13.11510282382369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 158.25, "completions/mean_terminated_length": 158.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.36266087740659714, "epoch": 0.10347383047707272, "frac_reward_zero_std": 1.0, "grad_norm": 0.002360459417104721, "kl": 0.001996085455175489, "learning_rate": 9.793144974525242e-07, "loss": 0.0001, "num_tokens": 61491837.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2234, "step_time": 19.55891367048025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 192.9375, "completions/mean_terminated_length": 192.9375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.22553859278559685, "epoch": 0.10352014821676703, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029795700684189796, "kl": 0.002468937949743122, "learning_rate": 9.793052339045854e-07, "loss": 0.0001, "num_tokens": 61528524.0, "reward": 0.9661049842834473, "reward_std": 0.0, "rewards/reward_func/mean": 0.9661049842834473, "rewards/reward_func/std": 0.0, "step": 2235, "step_time": 23.35491492599249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 162.375, "completions/mean_terminated_length": 162.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.2922440990805626, "epoch": 0.10356646595646132, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036938011180609465, "kl": 0.0020365714735817164, "learning_rate": 9.792959703566465e-07, "loss": 0.0001, "num_tokens": 61548930.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2236, "step_time": 18.090626504272223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 182.5625, "completions/mean_terminated_length": 182.5625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.12921619042754173, "epoch": 0.10361278369615563, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010431903647258878, "kl": 0.0007840915495762601, "learning_rate": 9.792867068087078e-07, "loss": 0.0, "num_tokens": 61573051.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 2237, "step_time": 19.422021348029375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 137.625, "completions/mean_terminated_length": 137.625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2800588682293892, "epoch": 0.10365910143584993, "frac_reward_zero_std": 1.0, "grad_norm": 0.001400663866661489, "kl": 0.0014587149489670992, "learning_rate": 9.792774432607687e-07, "loss": 0.0001, "num_tokens": 61598869.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2238, "step_time": 17.431943271309137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 159.0, "completions/mean_terminated_length": 159.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.2698532044887543, "epoch": 0.10370541917554424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0040842462331056595, "kl": 0.0030865041771903634, "learning_rate": 9.792681797128299e-07, "loss": 0.0002, "num_tokens": 61623189.0, "reward": 0.904837429523468, "reward_std": 0.0, "rewards/reward_func/mean": 0.904837429523468, "rewards/reward_func/std": 0.0, "step": 2239, "step_time": 17.110507179051638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 160.5625, "completions/mean_terminated_length": 160.5625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3302495777606964, "epoch": 0.10375173691523853, "frac_reward_zero_std": 1.0, "grad_norm": 0.007209908217191696, "kl": 0.003326426784042269, "learning_rate": 9.792589161648912e-07, "loss": 0.0002, "num_tokens": 61645742.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2240, "step_time": 17.75124305486679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 167.625, "completions/mean_terminated_length": 167.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.19000709801912308, "epoch": 0.10379805465493284, "frac_reward_zero_std": 0.0, "grad_norm": 0.09699032455682755, "kl": 0.005310878332238644, "learning_rate": 9.792496526169523e-07, "loss": -0.0233, "num_tokens": 61672648.0, "reward": 0.5217881202697754, "reward_std": 0.38342076539993286, "rewards/reward_func/mean": 0.5217881202697754, "rewards/reward_func/std": 0.3834207057952881, "step": 2241, "step_time": 18.545949436724186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 119.5625, "completions/mean_terminated_length": 119.5625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2775387093424797, "epoch": 0.10384437239462714, "frac_reward_zero_std": 1.0, "grad_norm": 0.009978383779525757, "kl": 0.0031977643084246665, "learning_rate": 9.792403890690134e-07, "loss": 0.0002, "num_tokens": 61694353.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2242, "step_time": 13.95238033682108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 196.0, "completions/mean_terminated_length": 196.0, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.32570119202136993, "epoch": 0.10389069013432145, "frac_reward_zero_std": 0.0, "grad_norm": 0.13308730721473694, "kl": 0.006898187682963908, "learning_rate": 9.792311255210746e-07, "loss": -0.0075, "num_tokens": 61716193.0, "reward": 0.22180122137069702, "reward_std": 0.39677008986473083, "rewards/reward_func/mean": 0.22180122137069702, "rewards/reward_func/std": 0.3967701196670532, "step": 2243, "step_time": 20.08134526014328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.2500312253832817, "epoch": 0.10393700787401575, "frac_reward_zero_std": 0.0, "grad_norm": 0.10359306633472443, "kl": 0.004036617727251723, "learning_rate": 9.792218619731357e-07, "loss": -0.0516, "num_tokens": 61739701.0, "reward": 0.35930418968200684, "reward_std": 0.2904965281486511, "rewards/reward_func/mean": 0.35930418968200684, "rewards/reward_func/std": 0.2904965877532959, "step": 2244, "step_time": 22.156222824007273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 179.875, "completions/mean_terminated_length": 179.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.38613519072532654, "epoch": 0.10398332561371006, "frac_reward_zero_std": 1.0, "grad_norm": 0.006123799365013838, "kl": 0.003989014774560928, "learning_rate": 9.792125984251968e-07, "loss": 0.0002, "num_tokens": 61766723.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2245, "step_time": 19.492330126464367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 141.0, "completions/mean_terminated_length": 141.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3111994042992592, "epoch": 0.10402964335340435, "frac_reward_zero_std": 1.0, "grad_norm": 0.001901285257190466, "kl": 0.0015827443567104638, "learning_rate": 9.79203334877258e-07, "loss": 0.0001, "num_tokens": 61802835.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2246, "step_time": 18.98337061330676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 131.125, "completions/mean_terminated_length": 131.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.27539508789777756, "epoch": 0.10407596109309866, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011728698154911399, "kl": 0.001241259480593726, "learning_rate": 9.79194071329319e-07, "loss": 0.0001, "num_tokens": 61829397.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2247, "step_time": 15.216758225113153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 212.9375, "completions/mean_terminated_length": 212.9375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.22671232372522354, "epoch": 0.10412227883279296, "frac_reward_zero_std": 0.0, "grad_norm": 0.07702209055423737, "kl": 0.0026329063693992794, "learning_rate": 9.791848077813802e-07, "loss": -0.0975, "num_tokens": 61852516.0, "reward": 0.7919193506240845, "reward_std": 0.3722260296344757, "rewards/reward_func/mean": 0.7919193506240845, "rewards/reward_func/std": 0.3722260594367981, "step": 2248, "step_time": 22.158296890556812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 140.0, "completions/mean_terminated_length": 140.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.21035386621952057, "epoch": 0.10416859657248727, "frac_reward_zero_std": 0.0, "grad_norm": 0.11730952560901642, "kl": 0.004817256878595799, "learning_rate": 9.791755442334413e-07, "loss": -0.0703, "num_tokens": 61884212.0, "reward": 0.3567105531692505, "reward_std": 0.35956937074661255, "rewards/reward_func/mean": 0.3567105531692505, "rewards/reward_func/std": 0.35956940054893494, "step": 2249, "step_time": 18.170773435384035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 208.1875, "completions/mean_terminated_length": 208.1875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.21844256669282913, "epoch": 0.10421491431218156, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034418818540871143, "kl": 0.00255079276394099, "learning_rate": 9.791662806855027e-07, "loss": 0.0001, "num_tokens": 61912503.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2250, "step_time": 21.4816131927073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 166.3125, "completions/mean_terminated_length": 166.3125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.39760953187942505, "epoch": 0.10426123205187587, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021134852431714535, "kl": 0.0018708638963289559, "learning_rate": 9.791570171375636e-07, "loss": 0.0001, "num_tokens": 61945644.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2251, "step_time": 20.125511031597853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 112.4375, "completions/mean_terminated_length": 112.4375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3092518672347069, "epoch": 0.10430754979157017, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028821611776947975, "kl": 0.0024511757073923945, "learning_rate": 9.791477535896247e-07, "loss": 0.0001, "num_tokens": 61967811.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2252, "step_time": 13.239852078258991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 149.0625, "completions/mean_terminated_length": 149.0625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3802819848060608, "epoch": 0.10435386753126448, "frac_reward_zero_std": 1.0, "grad_norm": 0.005940168164670467, "kl": 0.0029723774059675634, "learning_rate": 9.791384900416858e-07, "loss": 0.0001, "num_tokens": 61988980.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2253, "step_time": 17.387725837528706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 156.1875, "completions/mean_terminated_length": 156.1875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.27446040511131287, "epoch": 0.10440018527095878, "frac_reward_zero_std": 1.0, "grad_norm": 0.002605691086500883, "kl": 0.00233684346312657, "learning_rate": 9.791292264937472e-07, "loss": 0.0001, "num_tokens": 62038871.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2254, "step_time": 22.929054759442806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 126.625, "completions/mean_terminated_length": 126.625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.1548544466495514, "epoch": 0.10444650301065309, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018125120550394058, "kl": 0.0014349717530421913, "learning_rate": 9.791199629458083e-07, "loss": 0.0001, "num_tokens": 62059761.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 2255, "step_time": 13.319679863750935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 164.6875, "completions/mean_terminated_length": 164.6875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.36026181280612946, "epoch": 0.10449282075034738, "frac_reward_zero_std": 1.0, "grad_norm": 0.0051782578229904175, "kl": 0.003570305823814124, "learning_rate": 9.791106993978694e-07, "loss": 0.0002, "num_tokens": 62080364.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2256, "step_time": 17.787369839847088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 191.125, "completions/mean_terminated_length": 191.125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.417837455868721, "epoch": 0.10453913849004169, "frac_reward_zero_std": 1.0, "grad_norm": 0.002698037540540099, "kl": 0.0022583184181712568, "learning_rate": 9.791014358499305e-07, "loss": 0.0001, "num_tokens": 62109182.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2257, "step_time": 22.217093612998724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.329865463078022, "epoch": 0.10458545622973599, "frac_reward_zero_std": 1.0, "grad_norm": 0.004828733392059803, "kl": 0.003312196582555771, "learning_rate": 9.790921723019917e-07, "loss": 0.0002, "num_tokens": 62130732.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2258, "step_time": 18.443362843245268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 172.3125, "completions/mean_terminated_length": 172.3125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.29036425799131393, "epoch": 0.1046317739694303, "frac_reward_zero_std": 1.0, "grad_norm": 0.002593018813058734, "kl": 0.0017635401454754174, "learning_rate": 9.790829087540528e-07, "loss": 0.0001, "num_tokens": 62151313.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2259, "step_time": 18.20293040201068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 472.125, "completions/mean_terminated_length": 472.125, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "entropy": 0.09846077300608158, "epoch": 0.1046780917091246, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004796440480276942, "kl": 0.0004201960182399489, "learning_rate": 9.79073645206114e-07, "loss": 0.0, "num_tokens": 62195827.0, "reward": 0.49658530950546265, "reward_std": 0.0, "rewards/reward_func/mean": 0.49658530950546265, "rewards/reward_func/std": 0.0, "step": 2260, "step_time": 43.809696685522795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 175.3125, "completions/mean_terminated_length": 175.3125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.39920640736818314, "epoch": 0.1047244094488189, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035484484396874905, "kl": 0.0028538938495330513, "learning_rate": 9.79064381658175e-07, "loss": 0.0001, "num_tokens": 62218264.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2261, "step_time": 19.64612052962184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 133.75, "completions/mean_terminated_length": 133.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.26653461903333664, "epoch": 0.1047707271885132, "frac_reward_zero_std": 1.0, "grad_norm": 0.003153836587443948, "kl": 0.0017807056137826294, "learning_rate": 9.790551181102362e-07, "loss": 0.0001, "num_tokens": 62237796.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2262, "step_time": 13.603754296898842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 150.6875, "completions/mean_terminated_length": 150.6875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.21984482184052467, "epoch": 0.10481704492820751, "frac_reward_zero_std": 1.0, "grad_norm": 0.004111500922590494, "kl": 0.002681127982214093, "learning_rate": 9.790458545622973e-07, "loss": 0.0001, "num_tokens": 62259887.0, "reward": 0.8781879544258118, "reward_std": 0.0, "rewards/reward_func/mean": 0.8781879544258118, "rewards/reward_func/std": 0.0, "step": 2263, "step_time": 18.170254323631525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 145.125, "completions/mean_terminated_length": 145.125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3419824466109276, "epoch": 0.1048633626679018, "frac_reward_zero_std": 1.0, "grad_norm": 0.005954916123300791, "kl": 0.005005842307582498, "learning_rate": 9.790365910143584e-07, "loss": 0.0003, "num_tokens": 62279793.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2264, "step_time": 15.43479885160923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 119.6875, "completions/mean_terminated_length": 119.6875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2846032828092575, "epoch": 0.10490968040759611, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015523831825703382, "kl": 0.0015785988653078675, "learning_rate": 9.790273274664195e-07, "loss": 0.0001, "num_tokens": 62300156.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2265, "step_time": 13.51823365315795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 216.375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.4687364399433136, "epoch": 0.10495599814729041, "frac_reward_zero_std": 0.0, "grad_norm": 0.10033821314573288, "kl": 0.005565599538385868, "learning_rate": 9.790180639184807e-07, "loss": -0.0393, "num_tokens": 62325186.0, "reward": 0.07428629696369171, "reward_std": 0.24917595088481903, "rewards/reward_func/mean": 0.07428629696369171, "rewards/reward_func/std": 0.24917596578598022, "step": 2266, "step_time": 23.35210544988513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 141.1875, "completions/mean_terminated_length": 141.1875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.34766803681850433, "epoch": 0.10500231588698472, "frac_reward_zero_std": 1.0, "grad_norm": 0.001703002955764532, "kl": 0.001662793365539983, "learning_rate": 9.79008800370542e-07, "loss": 0.0001, "num_tokens": 62353349.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2267, "step_time": 16.5883132442832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.41440922021865845, "epoch": 0.10504863362667902, "frac_reward_zero_std": 1.0, "grad_norm": 0.0042607043869793415, "kl": 0.0029541688272729516, "learning_rate": 9.789995368226031e-07, "loss": 0.0001, "num_tokens": 62377757.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2268, "step_time": 18.798588771373034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 147.3125, "completions/mean_terminated_length": 147.3125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.17437032982707024, "epoch": 0.10509495136637333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024604839272797108, "kl": 0.0013842256157658994, "learning_rate": 9.78990273274664e-07, "loss": 0.0001, "num_tokens": 62407154.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2269, "step_time": 17.567027255892754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 170.125, "completions/mean_terminated_length": 170.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3062300756573677, "epoch": 0.10514126910606762, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024556834250688553, "kl": 0.0015224769886117429, "learning_rate": 9.789810097267252e-07, "loss": 0.0001, "num_tokens": 62438820.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2270, "step_time": 19.446268923580647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 173.4375, "completions/mean_terminated_length": 173.4375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.18224821239709854, "epoch": 0.10518758684576193, "frac_reward_zero_std": 0.0, "grad_norm": 0.11770366132259369, "kl": 0.0016775375988800079, "learning_rate": 9.789717461787865e-07, "loss": 0.0153, "num_tokens": 62471883.0, "reward": 0.9904050827026367, "reward_std": 0.03837955743074417, "rewards/reward_func/mean": 0.9904050827026367, "rewards/reward_func/std": 0.038379568606615067, "step": 2271, "step_time": 20.685834363102913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 140.625, "completions/mean_terminated_length": 140.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3279189094901085, "epoch": 0.10523390458545623, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022518103942275047, "kl": 0.0020676348358392715, "learning_rate": 9.789624826308476e-07, "loss": 0.0001, "num_tokens": 62498741.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2272, "step_time": 17.294695295393467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 157.6875, "completions/mean_terminated_length": 157.6875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.2894827052950859, "epoch": 0.10528022232515054, "frac_reward_zero_std": 1.0, "grad_norm": 0.004258665721863508, "kl": 0.002303234301507473, "learning_rate": 9.789532190829087e-07, "loss": 0.0001, "num_tokens": 62519328.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2273, "step_time": 16.723687145859003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 159.25, "completions/mean_terminated_length": 159.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.22289356961846352, "epoch": 0.10532654006484483, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034214637707918882, "kl": 0.0023974603973329067, "learning_rate": 9.789439555349699e-07, "loss": 0.0001, "num_tokens": 62540564.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 2274, "step_time": 16.21651890128851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 112.6875, "completions/mean_terminated_length": 112.6875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.24744964390993118, "epoch": 0.10537285780453914, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016079018823802471, "kl": 0.0013612003094749525, "learning_rate": 9.78934691987031e-07, "loss": 0.0001, "num_tokens": 62560495.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2275, "step_time": 12.973549351096153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 156.4375, "completions/mean_terminated_length": 156.4375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.4443458691239357, "epoch": 0.10541917554423344, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036383571568876505, "kl": 0.0024798159720376134, "learning_rate": 9.789254284390921e-07, "loss": 0.0001, "num_tokens": 62607318.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2276, "step_time": 23.97321081161499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 219.8125, "completions/mean_terminated_length": 219.8125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.39555609226226807, "epoch": 0.10546549328392775, "frac_reward_zero_std": 0.0, "grad_norm": 0.07289256900548935, "kl": 0.002941161103080958, "learning_rate": 9.789161648911532e-07, "loss": 0.0065, "num_tokens": 62629731.0, "reward": 0.8440167903900146, "reward_std": 0.06088960915803909, "rewards/reward_func/mean": 0.8440167903900146, "rewards/reward_func/std": 0.06088960915803909, "step": 2277, "step_time": 22.16989605501294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 218.0625, "completions/mean_terminated_length": 218.0625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.32909025996923447, "epoch": 0.10551181102362205, "frac_reward_zero_std": 1.0, "grad_norm": 0.009037869051098824, "kl": 0.005931343417614698, "learning_rate": 9.789069013432144e-07, "loss": 0.0003, "num_tokens": 62668852.0, "reward": 0.14145326614379883, "reward_std": 0.0, "rewards/reward_func/mean": 0.14145326614379883, "rewards/reward_func/std": 0.0, "step": 2278, "step_time": 25.76391412690282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 147.0, "completions/mean_terminated_length": 147.0, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3768761530518532, "epoch": 0.10555812876331636, "frac_reward_zero_std": 1.0, "grad_norm": 0.002742957789450884, "kl": 0.0020997224492020905, "learning_rate": 9.788976377952755e-07, "loss": 0.0001, "num_tokens": 62689572.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2279, "step_time": 15.67442176118493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 197.75, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.3708306774497032, "epoch": 0.10560444650301065, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017952244961634278, "kl": 0.0017793258593883365, "learning_rate": 9.788883742473368e-07, "loss": 0.0001, "num_tokens": 62721984.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2280, "step_time": 22.377992317080498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 170.5, "completions/mean_terminated_length": 170.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.40220367163419724, "epoch": 0.10565076424270496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036408279556781054, "kl": 0.00243093614699319, "learning_rate": 9.788791106993977e-07, "loss": 0.0001, "num_tokens": 62745208.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2281, "step_time": 19.542316388338804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 198.0625, "completions/mean_terminated_length": 198.0625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.37135354429483414, "epoch": 0.10569708198239926, "frac_reward_zero_std": 0.0, "grad_norm": 0.20602072775363922, "kl": 0.013688758946955204, "learning_rate": 9.788698471514589e-07, "loss": -0.076, "num_tokens": 62768873.0, "reward": 0.46970653533935547, "reward_std": 0.4851108193397522, "rewards/reward_func/mean": 0.46970653533935547, "rewards/reward_func/std": 0.4851108193397522, "step": 2282, "step_time": 24.103362884372473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 201.875, "completions/mean_terminated_length": 201.875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3340918496251106, "epoch": 0.10574339972209357, "frac_reward_zero_std": 0.0, "grad_norm": 0.10164111852645874, "kl": 0.005126326461322606, "learning_rate": 9.7886058360352e-07, "loss": -0.1064, "num_tokens": 62792647.0, "reward": 0.23367546498775482, "reward_std": 0.42207178473472595, "rewards/reward_func/mean": 0.23367546498775482, "rewards/reward_func/std": 0.42207178473472595, "step": 2283, "step_time": 23.066465586423874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 176.3125, "completions/mean_terminated_length": 176.3125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.35164161026477814, "epoch": 0.10578971746178786, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016053604194894433, "kl": 0.0018234541930723935, "learning_rate": 9.788513200555813e-07, "loss": 0.0001, "num_tokens": 62814044.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2284, "step_time": 19.28101134300232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 153.8125, "completions/mean_terminated_length": 153.8125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3576444014906883, "epoch": 0.10583603520148217, "frac_reward_zero_std": 1.0, "grad_norm": 0.002952948911115527, "kl": 0.0024166183138731867, "learning_rate": 9.788420565076425e-07, "loss": 0.0001, "num_tokens": 62835513.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2285, "step_time": 16.41217329725623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 155.4375, "completions/mean_terminated_length": 155.4375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.18707425519824028, "epoch": 0.10588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.003922322764992714, "kl": 0.002202034753281623, "learning_rate": 9.788327929597036e-07, "loss": 0.0001, "num_tokens": 62862640.0, "reward": 0.5682365894317627, "reward_std": 0.0, "rewards/reward_func/mean": 0.5682365894317627, "rewards/reward_func/std": 0.0, "step": 2286, "step_time": 17.08743765950203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 175.6875, "completions/mean_terminated_length": 175.6875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.2407989427447319, "epoch": 0.10592867068087078, "frac_reward_zero_std": 1.0, "grad_norm": 0.002489193808287382, "kl": 0.0017187929479405284, "learning_rate": 9.788235294117647e-07, "loss": 0.0001, "num_tokens": 62887691.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2287, "step_time": 18.360634196549654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 210.1875, "completions/mean_terminated_length": 210.1875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3300726041197777, "epoch": 0.10597498842056507, "frac_reward_zero_std": 0.0, "grad_norm": 0.13574030995368958, "kl": 0.00281023868592456, "learning_rate": 9.788142658638258e-07, "loss": 0.3027, "num_tokens": 62913230.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 2288, "step_time": 33.76765315979719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 237.625, "completions/mean_terminated_length": 237.625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.19006231427192688, "epoch": 0.10602130616025938, "frac_reward_zero_std": 1.0, "grad_norm": 0.003184650093317032, "kl": 0.002425000420771539, "learning_rate": 9.78805002315887e-07, "loss": 0.0001, "num_tokens": 62951848.0, "reward": 0.7742860317230225, "reward_std": 0.0, "rewards/reward_func/mean": 0.7742860317230225, "rewards/reward_func/std": 0.0, "step": 2289, "step_time": 26.346284467726946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 156.4375, "completions/mean_terminated_length": 156.4375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.4017608165740967, "epoch": 0.10606762389995368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015749339945614338, "kl": 0.0016024188371375203, "learning_rate": 9.78795738767948e-07, "loss": 0.0001, "num_tokens": 62988831.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2290, "step_time": 20.73379624262452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 134.125, "completions/mean_terminated_length": 134.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3084176033735275, "epoch": 0.10611394163964799, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016018265159800649, "kl": 0.0014150924980640411, "learning_rate": 9.787864752200092e-07, "loss": 0.0001, "num_tokens": 63019921.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2291, "step_time": 16.553121391683817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 136.9375, "completions/mean_terminated_length": 136.9375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2522764801979065, "epoch": 0.10616025937934229, "frac_reward_zero_std": 1.0, "grad_norm": 0.0061822873540222645, "kl": 0.0022942226787563413, "learning_rate": 9.787772116720703e-07, "loss": 0.0001, "num_tokens": 63039648.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2292, "step_time": 14.715745452791452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 146.875, "completions/mean_terminated_length": 146.875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.37816808372735977, "epoch": 0.1062065771190366, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015219327760860324, "kl": 0.0018334352935198694, "learning_rate": 9.787679481241315e-07, "loss": 0.0001, "num_tokens": 63078798.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2293, "step_time": 20.435246918350458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 135.1875, "completions/mean_terminated_length": 135.1875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3749251514673233, "epoch": 0.10625289485873089, "frac_reward_zero_std": 1.0, "grad_norm": 0.003979141358286142, "kl": 0.0027734158793464303, "learning_rate": 9.787586845761926e-07, "loss": 0.0001, "num_tokens": 63108817.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2294, "step_time": 18.0811504162848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 140.375, "completions/mean_terminated_length": 140.375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.36283373087644577, "epoch": 0.1062992125984252, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013740368885919452, "kl": 0.001440608175471425, "learning_rate": 9.787494210282537e-07, "loss": 0.0001, "num_tokens": 63132711.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2295, "step_time": 16.465324983000755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 155.3125, "completions/mean_terminated_length": 155.3125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.20616553351283073, "epoch": 0.1063455303381195, "frac_reward_zero_std": 0.0, "grad_norm": 0.081479012966156, "kl": 0.0019770217477343976, "learning_rate": 9.787401574803148e-07, "loss": 0.0286, "num_tokens": 63157404.0, "reward": 0.856032133102417, "reward_std": 0.22827444970607758, "rewards/reward_func/mean": 0.856032133102417, "rewards/reward_func/std": 0.22827443480491638, "step": 2296, "step_time": 17.017949648201466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 102.0, "completions/mean_terminated_length": 102.0, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.29436952620744705, "epoch": 0.10639184807781381, "frac_reward_zero_std": 1.0, "grad_norm": 0.002989846048876643, "kl": 0.0019666525186039507, "learning_rate": 9.787308939323762e-07, "loss": 0.0001, "num_tokens": 63180956.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2297, "step_time": 15.713763888925314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 191.375, "completions/mean_terminated_length": 191.375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.19966155290603638, "epoch": 0.1064381658175081, "frac_reward_zero_std": 0.0, "grad_norm": 0.08586151897907257, "kl": 0.0008249464735854417, "learning_rate": 9.787216303844373e-07, "loss": 0.0048, "num_tokens": 63214898.0, "reward": 0.9880691766738892, "reward_std": 0.04772331565618515, "rewards/reward_func/mean": 0.9880691766738892, "rewards/reward_func/std": 0.04772332310676575, "step": 2298, "step_time": 22.55654987320304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 222.0625, "completions/mean_terminated_length": 222.0625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.46618911623954773, "epoch": 0.10648448355720241, "frac_reward_zero_std": 1.0, "grad_norm": 0.006537908222526312, "kl": 0.0052906685741618276, "learning_rate": 9.787123668364984e-07, "loss": 0.0003, "num_tokens": 63239875.0, "reward": 3.869818243629197e-09, "reward_std": 8.242940552349864e-09, "rewards/reward_func/mean": 3.869818243629197e-09, "rewards/reward_func/std": 8.242940552349864e-09, "step": 2299, "step_time": 29.141530752182007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 208.8125, "completions/mean_terminated_length": 208.8125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.2937890291213989, "epoch": 0.10653080129689671, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016014168504625559, "kl": 0.0014777045871596783, "learning_rate": 9.787031032885593e-07, "loss": 0.0001, "num_tokens": 63263088.0, "reward": 0.6897482872009277, "reward_std": 0.0, "rewards/reward_func/mean": 0.6897482872009277, "rewards/reward_func/std": 0.0, "step": 2300, "step_time": 20.025199435651302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 151.625, "completions/mean_terminated_length": 151.625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.37442679703235626, "epoch": 0.10657711903659102, "frac_reward_zero_std": 1.0, "grad_norm": 0.004565059673041105, "kl": 0.003417026367969811, "learning_rate": 9.786938397406207e-07, "loss": 0.0002, "num_tokens": 63284090.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2301, "step_time": 16.525586277246475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 201.0625, "completions/mean_terminated_length": 201.0625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.1893659010529518, "epoch": 0.10662343677628532, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015991524560377002, "kl": 0.0013263034634292126, "learning_rate": 9.786845761926818e-07, "loss": 0.0001, "num_tokens": 63308315.0, "reward": 0.9459594488143921, "reward_std": 0.0, "rewards/reward_func/mean": 0.9459594488143921, "rewards/reward_func/std": 0.0, "step": 2302, "step_time": 19.861064448952675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 116.0625, "completions/mean_terminated_length": 116.0625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2300916463136673, "epoch": 0.10666975451597963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018000829732045531, "kl": 0.0013958195049781352, "learning_rate": 9.78675312644743e-07, "loss": 0.0001, "num_tokens": 63328508.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2303, "step_time": 13.659082971513271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 149.0625, "completions/mean_terminated_length": 149.0625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.2581302411854267, "epoch": 0.10671607225567392, "frac_reward_zero_std": 1.0, "grad_norm": 0.002336992183700204, "kl": 0.0014520465047098696, "learning_rate": 9.78666049096804e-07, "loss": 0.0001, "num_tokens": 63348797.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2304, "step_time": 15.82089039310813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 167.1875, "completions/mean_terminated_length": 167.1875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4135226383805275, "epoch": 0.10676238999536823, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015398422256112099, "kl": 0.0017486756551079452, "learning_rate": 9.786567855488652e-07, "loss": 0.0001, "num_tokens": 63384304.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2305, "step_time": 21.010801576077938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 123.4375, "completions/mean_terminated_length": 123.4375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.27658435702323914, "epoch": 0.10680870773506253, "frac_reward_zero_std": 1.0, "grad_norm": 0.003001737641170621, "kl": 0.0021387546439655125, "learning_rate": 9.786475220009263e-07, "loss": 0.0001, "num_tokens": 63404183.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2306, "step_time": 14.322453249245882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 153.5, "completions/mean_terminated_length": 153.5, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3153928518295288, "epoch": 0.10685502547475684, "frac_reward_zero_std": 1.0, "grad_norm": 0.008436080999672413, "kl": 0.004455897840671241, "learning_rate": 9.786382584529874e-07, "loss": 0.0002, "num_tokens": 63431343.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2307, "step_time": 18.271749652922153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 161.4375, "completions/mean_terminated_length": 161.4375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3811292424798012, "epoch": 0.10690134321445113, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030828353483229876, "kl": 0.002185209741583094, "learning_rate": 9.786289949050485e-07, "loss": 0.0001, "num_tokens": 63455734.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2308, "step_time": 18.74537069350481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 121.5625, "completions/mean_terminated_length": 121.5625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2895944267511368, "epoch": 0.10694766095414544, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013630129396915436, "kl": 0.001331931067397818, "learning_rate": 9.786197313571097e-07, "loss": 0.0001, "num_tokens": 63477439.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2309, "step_time": 14.01130260899663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 179.25, "completions/mean_terminated_length": 179.25, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.36147908866405487, "epoch": 0.10699397869383974, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014920932007953525, "kl": 0.0014613425882998854, "learning_rate": 9.78610467809171e-07, "loss": 0.0001, "num_tokens": 63511107.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2310, "step_time": 22.073243718594313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 189.4375, "completions/mean_terminated_length": 189.4375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.26418977975845337, "epoch": 0.10704029643353405, "frac_reward_zero_std": 0.0, "grad_norm": 0.1080576553940773, "kl": 0.0015041398874018341, "learning_rate": 9.786012042612321e-07, "loss": -0.0444, "num_tokens": 63542954.0, "reward": 0.8950977325439453, "reward_std": 0.04094961658120155, "rewards/reward_func/mean": 0.8950977325439453, "rewards/reward_func/std": 0.040949635207653046, "step": 2311, "step_time": 22.31054286286235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 164.5, "completions/mean_terminated_length": 164.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.23768489435315132, "epoch": 0.10708661417322834, "frac_reward_zero_std": 0.0, "grad_norm": 0.09017661213874817, "kl": 0.0025725949089974165, "learning_rate": 9.78591940713293e-07, "loss": -0.0531, "num_tokens": 63572898.0, "reward": 0.9160261154174805, "reward_std": 0.027147367596626282, "rewards/reward_func/mean": 0.9160261154174805, "rewards/reward_func/std": 0.027147362008690834, "step": 2312, "step_time": 20.856951646506786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 181.8125, "completions/mean_terminated_length": 181.8125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.165035892277956, "epoch": 0.10713293191292265, "frac_reward_zero_std": 0.0, "grad_norm": 0.10406757891178131, "kl": 0.0022495368611998856, "learning_rate": 9.785826771653542e-07, "loss": -0.1167, "num_tokens": 63598703.0, "reward": 0.5950411558151245, "reward_std": 0.4749177396297455, "rewards/reward_func/mean": 0.5950411558151245, "rewards/reward_func/std": 0.47491776943206787, "step": 2313, "step_time": 19.663224138319492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 135.0625, "completions/mean_terminated_length": 135.0625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.2723177596926689, "epoch": 0.10717924965261695, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016098006162792444, "kl": 0.0013433267595246434, "learning_rate": 9.785734136174155e-07, "loss": 0.0001, "num_tokens": 63619888.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2314, "step_time": 14.279137838631868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 196.3125, "completions/mean_terminated_length": 196.3125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.39508721977472305, "epoch": 0.10722556739231126, "frac_reward_zero_std": 1.0, "grad_norm": 0.002097412943840027, "kl": 0.0024874747614376247, "learning_rate": 9.785641500694766e-07, "loss": 0.0001, "num_tokens": 63655765.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2315, "step_time": 23.132310081273317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 223.875, "completions/mean_terminated_length": 223.875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.19757945090532303, "epoch": 0.10727188513200556, "frac_reward_zero_std": 0.0, "grad_norm": 0.11243237555027008, "kl": 0.0047940564109012485, "learning_rate": 9.785548865215377e-07, "loss": -0.0059, "num_tokens": 63694163.0, "reward": 0.35405808687210083, "reward_std": 0.014812404289841652, "rewards/reward_func/mean": 0.35405808687210083, "rewards/reward_func/std": 0.014812405221164227, "step": 2316, "step_time": 25.012777283787727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 151.5625, "completions/mean_terminated_length": 151.5625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3269631415605545, "epoch": 0.10731820287169987, "frac_reward_zero_std": 1.0, "grad_norm": 0.002156304894015193, "kl": 0.0016005643119569868, "learning_rate": 9.785456229735989e-07, "loss": 0.0001, "num_tokens": 63716812.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2317, "step_time": 17.062161348760128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 192.6875, "completions/mean_terminated_length": 192.6875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.2712355703115463, "epoch": 0.10736452061139416, "frac_reward_zero_std": 1.0, "grad_norm": 0.011392252519726753, "kl": 0.00468895654194057, "learning_rate": 9.7853635942566e-07, "loss": 0.0002, "num_tokens": 63740199.0, "reward": 0.3381999135017395, "reward_std": 0.0, "rewards/reward_func/mean": 0.3381999135017395, "rewards/reward_func/std": 0.0, "step": 2318, "step_time": 20.61536430567503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.1931329183280468, "epoch": 0.10741083835108847, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016985037364065647, "kl": 0.0012550139799714088, "learning_rate": 9.785270958777211e-07, "loss": 0.0001, "num_tokens": 63762211.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 2319, "step_time": 18.47048844769597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 129.375, "completions/mean_terminated_length": 129.375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3037542253732681, "epoch": 0.10745715609078277, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018543946789577603, "kl": 0.001508870889665559, "learning_rate": 9.785178323297822e-07, "loss": 0.0001, "num_tokens": 63788681.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2320, "step_time": 16.705736588686705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 155.5, "completions/mean_terminated_length": 155.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.40432457625865936, "epoch": 0.10750347383047708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019889730028808117, "kl": 0.002315671998076141, "learning_rate": 9.785085687818434e-07, "loss": 0.0001, "num_tokens": 63845745.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2321, "step_time": 25.2012220621109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 202.3125, "completions/mean_terminated_length": 202.3125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.3099028021097183, "epoch": 0.10754979157017137, "frac_reward_zero_std": 0.0, "grad_norm": 0.11800023168325424, "kl": 0.006881332607008517, "learning_rate": 9.784993052339045e-07, "loss": -0.1077, "num_tokens": 63869734.0, "reward": 0.5659927129745483, "reward_std": 0.4527941644191742, "rewards/reward_func/mean": 0.5659927129745483, "rewards/reward_func/std": 0.4527941942214966, "step": 2322, "step_time": 21.946988113224506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 167.375, "completions/mean_terminated_length": 167.375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.26356247067451477, "epoch": 0.10759610930986568, "frac_reward_zero_std": 0.0, "grad_norm": 0.12838982045650482, "kl": 0.0028781691507901996, "learning_rate": 9.784900416859656e-07, "loss": 0.021, "num_tokens": 63894332.0, "reward": 0.33142462372779846, "reward_std": 0.01225903071463108, "rewards/reward_func/mean": 0.33142462372779846, "rewards/reward_func/std": 0.012259027920663357, "step": 2323, "step_time": 17.727836210280657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 152.625, "completions/mean_terminated_length": 152.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.20181596651673317, "epoch": 0.10764242704955998, "frac_reward_zero_std": 1.0, "grad_norm": 0.008901465684175491, "kl": 0.0038753908302169293, "learning_rate": 9.78480778138027e-07, "loss": 0.0002, "num_tokens": 63916886.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2324, "step_time": 15.363601807504892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 142.6875, "completions/mean_terminated_length": 142.6875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.33088518679142, "epoch": 0.10768874478925429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030767915304750204, "kl": 0.002382044738624245, "learning_rate": 9.784715145900879e-07, "loss": 0.0001, "num_tokens": 63938065.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2325, "step_time": 15.904967341572046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 140.8125, "completions/mean_terminated_length": 140.8125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.15863429754972458, "epoch": 0.10773506252894859, "frac_reward_zero_std": 0.0, "grad_norm": 0.15959049761295319, "kl": 0.0014423929387703538, "learning_rate": 9.78462251042149e-07, "loss": 0.0282, "num_tokens": 63961582.0, "reward": 0.8296091556549072, "reward_std": 0.11370529979467392, "rewards/reward_func/mean": 0.8296091556549072, "rewards/reward_func/std": 0.11370529979467392, "step": 2326, "step_time": 15.291508000344038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 144.4375, "completions/mean_terminated_length": 144.4375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.32636523991823196, "epoch": 0.1077813802686429, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028073210269212723, "kl": 0.0021235549356788397, "learning_rate": 9.784529874942103e-07, "loss": 0.0001, "num_tokens": 63981909.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2327, "step_time": 17.106442864984274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 199.0625, "completions/mean_terminated_length": 199.0625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.31194355338811874, "epoch": 0.10782769800833719, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028667158912867308, "kl": 0.0023261773167178035, "learning_rate": 9.784437239462715e-07, "loss": 0.0001, "num_tokens": 64008758.0, "reward": 0.27390056848526, "reward_std": 0.0, "rewards/reward_func/mean": 0.27390056848526, "rewards/reward_func/std": 0.0, "step": 2328, "step_time": 21.130776807665825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 202.0625, "completions/mean_terminated_length": 202.0625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.3795570209622383, "epoch": 0.1078740157480315, "frac_reward_zero_std": 1.0, "grad_norm": 0.0055798115208745, "kl": 0.0038282829336822033, "learning_rate": 9.784344603983326e-07, "loss": 0.0002, "num_tokens": 64035367.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2329, "step_time": 27.652083162218332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.22390004619956017, "epoch": 0.1079203334877258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024152437690645456, "kl": 0.0018542756733950227, "learning_rate": 9.784251968503937e-07, "loss": 0.0001, "num_tokens": 64062421.0, "reward": 0.786984384059906, "reward_std": 0.0, "rewards/reward_func/mean": 0.786984384059906, "rewards/reward_func/std": 0.0, "step": 2330, "step_time": 22.75635538250208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 144.5625, "completions/mean_terminated_length": 144.5625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.2623891681432724, "epoch": 0.10796665122742011, "frac_reward_zero_std": 1.0, "grad_norm": 0.001554641523398459, "kl": 0.0014957803941797465, "learning_rate": 9.784159333024548e-07, "loss": 0.0001, "num_tokens": 64082382.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2331, "step_time": 14.557039085775614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 137.6875, "completions/mean_terminated_length": 137.6875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3514051139354706, "epoch": 0.1080129689671144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029933264013379812, "kl": 0.0021869066986255348, "learning_rate": 9.78406669754516e-07, "loss": 0.0001, "num_tokens": 64108569.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2332, "step_time": 15.876485411077738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4565359354019165, "epoch": 0.10805928670680871, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034759847912937403, "kl": 0.0026159347617067397, "learning_rate": 9.78397406206577e-07, "loss": 0.0001, "num_tokens": 64130037.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2333, "step_time": 17.716689959168434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 202.1875, "completions/mean_terminated_length": 202.1875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.23511254787445068, "epoch": 0.10810560444650301, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034925348591059446, "kl": 0.002422249934170395, "learning_rate": 9.783881426586382e-07, "loss": 0.0001, "num_tokens": 64155784.0, "reward": 0.06856315582990646, "reward_std": 0.0, "rewards/reward_func/mean": 0.06856315582990646, "rewards/reward_func/std": 0.0, "step": 2334, "step_time": 20.25643503293395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 116.0, "completions/mean_terminated_length": 116.0, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2809803709387779, "epoch": 0.10815192218619732, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016170106828212738, "kl": 0.00152270492981188, "learning_rate": 9.783788791106993e-07, "loss": 0.0001, "num_tokens": 64176968.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2335, "step_time": 13.492836754769087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 135.6875, "completions/mean_terminated_length": 135.6875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.28990789502859116, "epoch": 0.10819823992589161, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017747610108926892, "kl": 0.0013901473430451006, "learning_rate": 9.783696155627605e-07, "loss": 0.0001, "num_tokens": 64197459.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2336, "step_time": 15.477379083633423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 129.9375, "completions/mean_terminated_length": 129.9375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2344549000263214, "epoch": 0.10824455766558592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0044554611667990685, "kl": 0.0020847307168878615, "learning_rate": 9.783603520148216e-07, "loss": 0.0001, "num_tokens": 64217042.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2337, "step_time": 15.261674121022224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 126.9375, "completions/mean_terminated_length": 126.9375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.26227881759405136, "epoch": 0.10829087540528022, "frac_reward_zero_std": 1.0, "grad_norm": 0.007790118455886841, "kl": 0.0023844182142056525, "learning_rate": 9.783510884668827e-07, "loss": 0.0001, "num_tokens": 64238065.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2338, "step_time": 14.55690012872219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 143.25, "completions/mean_terminated_length": 143.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.23508936539292336, "epoch": 0.10833719314497453, "frac_reward_zero_std": 0.0, "grad_norm": 0.15546344220638275, "kl": 0.007544756750576198, "learning_rate": 9.783418249189438e-07, "loss": 0.0474, "num_tokens": 64260229.0, "reward": 0.4911891222000122, "reward_std": 0.14480088651180267, "rewards/reward_func/mean": 0.4911891222000122, "rewards/reward_func/std": 0.14480090141296387, "step": 2339, "step_time": 16.440475221723318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 143.6875, "completions/mean_terminated_length": 143.6875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.2491108849644661, "epoch": 0.10838351088466883, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017591416835784912, "kl": 0.0014814950700383633, "learning_rate": 9.78332561371005e-07, "loss": 0.0001, "num_tokens": 64280192.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2340, "step_time": 14.446493964642286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 252.25, "completions/mean_terminated_length": 252.25, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.18401716277003288, "epoch": 0.10842982862436314, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024554389528930187, "kl": 0.001874446781584993, "learning_rate": 9.783232978230663e-07, "loss": 0.0001, "num_tokens": 64309972.0, "reward": 0.9813089370727539, "reward_std": 0.0, "rewards/reward_func/mean": 0.9813089370727539, "rewards/reward_func/std": 0.0, "step": 2341, "step_time": 24.967931482940912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 157.5, "completions/mean_terminated_length": 157.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.41685056686401367, "epoch": 0.10847614636405743, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021794848144054413, "kl": 0.002308505936525762, "learning_rate": 9.783140342751274e-07, "loss": 0.0001, "num_tokens": 64362252.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2342, "step_time": 24.696660231798887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 126.875, "completions/mean_terminated_length": 126.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2645728290081024, "epoch": 0.10852246410375174, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020788901019841433, "kl": 0.0015851175121497363, "learning_rate": 9.783047707271883e-07, "loss": 0.0001, "num_tokens": 64382746.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2343, "step_time": 14.665306013077497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 167.125, "completions/mean_terminated_length": 167.125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3263947293162346, "epoch": 0.10856878184344604, "frac_reward_zero_std": 0.0, "grad_norm": 0.10520664602518082, "kl": 0.004593534977175295, "learning_rate": 9.782955071792497e-07, "loss": -0.002, "num_tokens": 64403372.0, "reward": 0.7351804375648499, "reward_std": 0.3647516369819641, "rewards/reward_func/mean": 0.7351804375648499, "rewards/reward_func/std": 0.3647516369819641, "step": 2344, "step_time": 19.894864667207003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 252.0625, "completions/mean_terminated_length": 252.0625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.34028272330760956, "epoch": 0.10861509958314035, "frac_reward_zero_std": 0.0, "grad_norm": 0.10743195563554764, "kl": 0.005560745019465685, "learning_rate": 9.782862436313108e-07, "loss": -0.1802, "num_tokens": 64430701.0, "reward": 0.6157898306846619, "reward_std": 0.49269482493400574, "rewards/reward_func/mean": 0.6157898306846619, "rewards/reward_func/std": 0.4926948547363281, "step": 2345, "step_time": 28.747718140482903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 160.9375, "completions/mean_terminated_length": 160.9375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.1433638110756874, "epoch": 0.10866141732283464, "frac_reward_zero_std": 1.0, "grad_norm": 0.004954099655151367, "kl": 0.003796221222728491, "learning_rate": 9.78276980083372e-07, "loss": 0.0002, "num_tokens": 64455820.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2346, "step_time": 16.379356395453215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 183.25, "completions/mean_terminated_length": 183.25, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.38788267970085144, "epoch": 0.10870773506252895, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036784771364182234, "kl": 0.0028724872681777924, "learning_rate": 9.78267716535433e-07, "loss": 0.0001, "num_tokens": 64478704.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2347, "step_time": 19.262580774724483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 233.125, "completions/mean_terminated_length": 233.125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.29155682027339935, "epoch": 0.10875405280222325, "frac_reward_zero_std": 0.0, "grad_norm": 0.09992647916078568, "kl": 0.004710135166533291, "learning_rate": 9.782584529874942e-07, "loss": -0.1581, "num_tokens": 64503282.0, "reward": 0.568049430847168, "reward_std": 0.2834235429763794, "rewards/reward_func/mean": 0.568049430847168, "rewards/reward_func/std": 0.2834235429763794, "step": 2348, "step_time": 27.2289629727602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 192.25, "completions/mean_terminated_length": 192.25, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.27687399834394455, "epoch": 0.10880037054191756, "frac_reward_zero_std": 0.0, "grad_norm": 0.10451581329107285, "kl": 0.004878342500887811, "learning_rate": 9.782491894395553e-07, "loss": -0.0744, "num_tokens": 64540934.0, "reward": 0.5664272904396057, "reward_std": 0.16924944519996643, "rewards/reward_func/mean": 0.5664272904396057, "rewards/reward_func/std": 0.16924946010112762, "step": 2349, "step_time": 24.839217126369476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 172.4375, "completions/mean_terminated_length": 172.4375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.20761826634407043, "epoch": 0.10884668828161186, "frac_reward_zero_std": 0.0, "grad_norm": 0.11507759988307953, "kl": 0.0012529796367743984, "learning_rate": 9.782399258916164e-07, "loss": -0.0554, "num_tokens": 64566013.0, "reward": 0.38113081455230713, "reward_std": 0.409390389919281, "rewards/reward_func/mean": 0.38113081455230713, "rewards/reward_func/std": 0.4093904197216034, "step": 2350, "step_time": 19.725036844611168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 333.375, "completions/mean_terminated_length": 333.375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "entropy": 0.23016024380922318, "epoch": 0.10889300602130617, "frac_reward_zero_std": 0.0, "grad_norm": 0.06802016496658325, "kl": 0.0021586233924608678, "learning_rate": 9.782306623436775e-07, "loss": -0.0762, "num_tokens": 64607235.0, "reward": 0.5437136888504028, "reward_std": 0.24644720554351807, "rewards/reward_func/mean": 0.5437136888504028, "rewards/reward_func/std": 0.24644720554351807, "step": 2351, "step_time": 37.613715037703514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 174.4375, "completions/mean_terminated_length": 174.4375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.23183806240558624, "epoch": 0.10893932376100046, "frac_reward_zero_std": 1.0, "grad_norm": 0.020201504230499268, "kl": 0.004952602321282029, "learning_rate": 9.782213987957387e-07, "loss": 0.0003, "num_tokens": 64655786.0, "reward": 0.904837429523468, "reward_std": 0.0, "rewards/reward_func/mean": 0.904837429523468, "rewards/reward_func/std": 0.0, "step": 2352, "step_time": 29.950926713645458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 135.1875, "completions/mean_terminated_length": 135.1875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.27789459377527237, "epoch": 0.10898564150069477, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016213261988013983, "kl": 0.0014960351691115648, "learning_rate": 9.782121352477998e-07, "loss": 0.0001, "num_tokens": 64683885.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2353, "step_time": 15.662431389093399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 140.6875, "completions/mean_terminated_length": 140.6875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.2667175978422165, "epoch": 0.10903195924038907, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033754301257431507, "kl": 0.0020495178760029376, "learning_rate": 9.782028716998611e-07, "loss": 0.0001, "num_tokens": 64704200.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2354, "step_time": 15.436271790415049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 225.4375, "completions/mean_terminated_length": 225.4375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.4056299179792404, "epoch": 0.10907827698008338, "frac_reward_zero_std": 0.0, "grad_norm": 0.11347091197967529, "kl": 0.004436219925992191, "learning_rate": 9.78193608151922e-07, "loss": -0.1563, "num_tokens": 64730495.0, "reward": 0.11520224809646606, "reward_std": 0.2597229778766632, "rewards/reward_func/mean": 0.11520224809646606, "rewards/reward_func/std": 0.2597229778766632, "step": 2355, "step_time": 27.59665833041072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 126.9375, "completions/mean_terminated_length": 126.9375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.21263223141431808, "epoch": 0.10912459471977767, "frac_reward_zero_std": 1.0, "grad_norm": 0.013671000488102436, "kl": 0.0021115583658684045, "learning_rate": 9.781843446039832e-07, "loss": 0.0001, "num_tokens": 64751278.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2356, "step_time": 14.197697196155787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 182.125, "completions/mean_terminated_length": 182.125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3813173770904541, "epoch": 0.10917091245947198, "frac_reward_zero_std": 1.0, "grad_norm": 0.005697739310562611, "kl": 0.0038346676155924797, "learning_rate": 9.781750810560445e-07, "loss": 0.0002, "num_tokens": 64774592.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2357, "step_time": 19.704799972474575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 231.9375, "completions/mean_terminated_length": 231.9375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.3623291403055191, "epoch": 0.10921723019916628, "frac_reward_zero_std": 0.0, "grad_norm": 0.07882294058799744, "kl": 0.004046881454996765, "learning_rate": 9.781658175081056e-07, "loss": -0.0466, "num_tokens": 64797807.0, "reward": 0.6959699988365173, "reward_std": 0.4154508113861084, "rewards/reward_func/mean": 0.6959699988365173, "rewards/reward_func/std": 0.4154508411884308, "step": 2358, "step_time": 23.98015521466732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 148.1875, "completions/mean_terminated_length": 148.1875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.27642325311899185, "epoch": 0.10926354793886059, "frac_reward_zero_std": 1.0, "grad_norm": 0.002546600066125393, "kl": 0.0018306243291590363, "learning_rate": 9.781565539601668e-07, "loss": 0.0001, "num_tokens": 64833090.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2359, "step_time": 19.557082820683718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 145.625, "completions/mean_terminated_length": 145.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.24291365966200829, "epoch": 0.10930986567855489, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027510575018823147, "kl": 0.0019136813352815807, "learning_rate": 9.781472904122279e-07, "loss": 0.0001, "num_tokens": 64852732.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2360, "step_time": 15.28828652203083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 222.75, "completions/mean_terminated_length": 222.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.2334187850356102, "epoch": 0.1093561834182492, "frac_reward_zero_std": 0.0, "grad_norm": 0.07622101902961731, "kl": 0.002250398480100557, "learning_rate": 9.78138026864289e-07, "loss": -0.1944, "num_tokens": 64878696.0, "reward": 0.13393382728099823, "reward_std": 0.27229103446006775, "rewards/reward_func/mean": 0.13393382728099823, "rewards/reward_func/std": 0.27229103446006775, "step": 2361, "step_time": 27.91884146258235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 210.4375, "completions/mean_terminated_length": 210.4375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.3545868992805481, "epoch": 0.10940250115794349, "frac_reward_zero_std": 0.0, "grad_norm": 0.12284550070762634, "kl": 0.0036865408183075488, "learning_rate": 9.781287633163501e-07, "loss": -0.1176, "num_tokens": 64909599.0, "reward": 0.46391788125038147, "reward_std": 0.39168205857276917, "rewards/reward_func/mean": 0.46391788125038147, "rewards/reward_func/std": 0.39168205857276917, "step": 2362, "step_time": 25.77095314115286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 211.75, "completions/mean_terminated_length": 211.75, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.26528290659189224, "epoch": 0.1094488188976378, "frac_reward_zero_std": 1.0, "grad_norm": 0.00298228464089334, "kl": 0.0026924379053525627, "learning_rate": 9.781194997684113e-07, "loss": 0.0001, "num_tokens": 64931131.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2363, "step_time": 21.232171565294266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 193.8125, "completions/mean_terminated_length": 193.8125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.3926021009683609, "epoch": 0.1094951366373321, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027155806310474873, "kl": 0.001936954795382917, "learning_rate": 9.781102362204724e-07, "loss": 0.0001, "num_tokens": 64957560.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2364, "step_time": 20.97134505584836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 189.0625, "completions/mean_terminated_length": 189.0625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.2027004174888134, "epoch": 0.1095414543770264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018587826052680612, "kl": 0.0013235746591817588, "learning_rate": 9.781009726725335e-07, "loss": 0.0001, "num_tokens": 64988777.0, "reward": 0.3487522304058075, "reward_std": 0.0, "rewards/reward_func/mean": 0.3487522304058075, "rewards/reward_func/std": 0.0, "step": 2365, "step_time": 22.576804656535387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 191.6875, "completions/mean_terminated_length": 191.6875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3604848012328148, "epoch": 0.1095877721167207, "frac_reward_zero_std": 1.0, "grad_norm": 0.009871640242636204, "kl": 0.004019507789053023, "learning_rate": 9.780917091245946e-07, "loss": 0.0002, "num_tokens": 65017908.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2366, "step_time": 21.157035641372204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 196.5, "completions/mean_terminated_length": 196.5, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3952259421348572, "epoch": 0.10963408985641501, "frac_reward_zero_std": 0.0, "grad_norm": 0.09334277361631393, "kl": 0.004445242928341031, "learning_rate": 9.78082445576656e-07, "loss": 0.0072, "num_tokens": 65065388.0, "reward": 0.3393140435218811, "reward_std": 0.4524186849594116, "rewards/reward_func/mean": 0.3393140435218811, "rewards/reward_func/std": 0.4524187445640564, "step": 2367, "step_time": 26.198620542883873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 182.5625, "completions/mean_terminated_length": 182.5625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.29002469778060913, "epoch": 0.10968040759610931, "frac_reward_zero_std": 0.0, "grad_norm": 0.10416804254055023, "kl": 0.006405917811207473, "learning_rate": 9.780731820287169e-07, "loss": 0.0914, "num_tokens": 65095109.0, "reward": 0.7475360631942749, "reward_std": 0.3708817660808563, "rewards/reward_func/mean": 0.7475360631942749, "rewards/reward_func/std": 0.3708817660808563, "step": 2368, "step_time": 24.547886081039906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 169.3125, "completions/mean_terminated_length": 169.3125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.2909696698188782, "epoch": 0.10972672533580362, "frac_reward_zero_std": 1.0, "grad_norm": 0.005206348840147257, "kl": 0.0033329512807540596, "learning_rate": 9.78063918480778e-07, "loss": 0.0002, "num_tokens": 65117210.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 2369, "step_time": 18.68805754557252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 182.1875, "completions/mean_terminated_length": 182.1875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.33839114010334015, "epoch": 0.10977304307549791, "frac_reward_zero_std": 0.0, "grad_norm": 0.10179704427719116, "kl": 0.004062555148266256, "learning_rate": 9.780546549328391e-07, "loss": -0.033, "num_tokens": 65140477.0, "reward": 0.06933650374412537, "reward_std": 0.08119859546422958, "rewards/reward_func/mean": 0.06933650374412537, "rewards/reward_func/std": 0.08119859546422958, "step": 2370, "step_time": 23.381348200142384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 112.1875, "completions/mean_terminated_length": 112.1875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2650522142648697, "epoch": 0.10981936081519222, "frac_reward_zero_std": 1.0, "grad_norm": 0.00396697735413909, "kl": 0.0021386328153312206, "learning_rate": 9.780453913849005e-07, "loss": 0.0001, "num_tokens": 65160944.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2371, "step_time": 12.611635141074657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 180.75, "completions/mean_terminated_length": 180.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.33697984367609024, "epoch": 0.10986567855488652, "frac_reward_zero_std": 1.0, "grad_norm": 0.00687724445015192, "kl": 0.004857703344896436, "learning_rate": 9.780361278369616e-07, "loss": 0.0002, "num_tokens": 65189788.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2372, "step_time": 20.00566278770566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 177.9375, "completions/mean_terminated_length": 177.9375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3338945657014847, "epoch": 0.10991199629458083, "frac_reward_zero_std": 0.0, "grad_norm": 0.11698485165834427, "kl": 0.0042671922710724175, "learning_rate": 9.780268642890227e-07, "loss": -0.1617, "num_tokens": 65212843.0, "reward": 0.22983068227767944, "reward_std": 0.41113361716270447, "rewards/reward_func/mean": 0.22983068227767944, "rewards/reward_func/std": 0.41113361716270447, "step": 2373, "step_time": 22.974309355020523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 174.1875, "completions/mean_terminated_length": 174.1875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.35960356146097183, "epoch": 0.10995831403427513, "frac_reward_zero_std": 1.0, "grad_norm": 0.00192203838378191, "kl": 0.001703375281067565, "learning_rate": 9.780176007410838e-07, "loss": 0.0001, "num_tokens": 65235086.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2374, "step_time": 19.51780268922448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 158.4375, "completions/mean_terminated_length": 158.4375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.330436110496521, "epoch": 0.11000463177396944, "frac_reward_zero_std": 1.0, "grad_norm": 0.004185125697404146, "kl": 0.0025320067070424557, "learning_rate": 9.78008337193145e-07, "loss": 0.0001, "num_tokens": 65259797.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2375, "step_time": 16.79103474318981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.41182051599025726, "epoch": 0.11005094951366373, "frac_reward_zero_std": 1.0, "grad_norm": 0.00350063294172287, "kl": 0.002961619582492858, "learning_rate": 9.77999073645206e-07, "loss": 0.0001, "num_tokens": 65289289.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2376, "step_time": 19.341465838253498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 168.125, "completions/mean_terminated_length": 168.125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.20241046324372292, "epoch": 0.11009726725335804, "frac_reward_zero_std": 1.0, "grad_norm": 0.003771762130782008, "kl": 0.002782963332720101, "learning_rate": 9.779898100972672e-07, "loss": 0.0001, "num_tokens": 65314155.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2377, "step_time": 18.157934233546257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 129.875, "completions/mean_terminated_length": 129.875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.24889836087822914, "epoch": 0.11014358499305234, "frac_reward_zero_std": 1.0, "grad_norm": 0.004847789648920298, "kl": 0.0024113141116686165, "learning_rate": 9.779805465493283e-07, "loss": 0.0001, "num_tokens": 65333673.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2378, "step_time": 15.077660147100687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 137.75, "completions/mean_terminated_length": 137.75, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.330291286110878, "epoch": 0.11018990273274665, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019224765710532665, "kl": 0.00165868503972888, "learning_rate": 9.779712830013895e-07, "loss": 0.0001, "num_tokens": 65362805.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2379, "step_time": 16.411409467458725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 150.9375, "completions/mean_terminated_length": 150.9375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.24974070861935616, "epoch": 0.11023622047244094, "frac_reward_zero_std": 1.0, "grad_norm": 0.004468156490474939, "kl": 0.0033513674279674888, "learning_rate": 9.779620194534506e-07, "loss": 0.0002, "num_tokens": 65399492.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2380, "step_time": 20.17722300067544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 254.875, "completions/mean_terminated_length": 254.875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.24785011261701584, "epoch": 0.11028253821213525, "frac_reward_zero_std": 0.0, "grad_norm": 0.0666511207818985, "kl": 0.005058192473370582, "learning_rate": 9.779527559055117e-07, "loss": -0.0315, "num_tokens": 65422578.0, "reward": 0.9131531715393066, "reward_std": 0.22993049025535583, "rewards/reward_func/mean": 0.9131531715393066, "rewards/reward_func/std": 0.22993049025535583, "step": 2381, "step_time": 24.465677250176668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 127.375, "completions/mean_terminated_length": 127.375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3378248065710068, "epoch": 0.11032885595182955, "frac_reward_zero_std": 1.0, "grad_norm": 0.004500414710491896, "kl": 0.003168799390550703, "learning_rate": 9.779434923575728e-07, "loss": 0.0002, "num_tokens": 65442472.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2382, "step_time": 14.456896986812353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 166.875, "completions/mean_terminated_length": 166.875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.16185901314020157, "epoch": 0.11037517369152386, "frac_reward_zero_std": 1.0, "grad_norm": 0.003626069752499461, "kl": 0.0024742622044868767, "learning_rate": 9.77934228809634e-07, "loss": 0.0001, "num_tokens": 65465558.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2383, "step_time": 17.839644107967615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 168.375, "completions/mean_terminated_length": 168.375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.41052452474832535, "epoch": 0.11042149143121816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013905029045417905, "kl": 0.0019480255723465234, "learning_rate": 9.779249652616953e-07, "loss": 0.0001, "num_tokens": 65529276.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2384, "step_time": 30.762097127735615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 112.0, "completions/mean_terminated_length": 112.0, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2557000517845154, "epoch": 0.11046780917091246, "frac_reward_zero_std": 1.0, "grad_norm": 0.001526663196273148, "kl": 0.001381642243359238, "learning_rate": 9.779157017137564e-07, "loss": 0.0001, "num_tokens": 65548572.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2385, "step_time": 12.958014130592346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 156.875, "completions/mean_terminated_length": 156.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.37063341587781906, "epoch": 0.11051412691060676, "frac_reward_zero_std": 1.0, "grad_norm": 0.003017621347680688, "kl": 0.0026910093147307634, "learning_rate": 9.779064381658173e-07, "loss": 0.0001, "num_tokens": 65573210.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2386, "step_time": 17.186774775385857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 142.6875, "completions/mean_terminated_length": 142.6875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.21390212327241898, "epoch": 0.11056044465030107, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032537176739424467, "kl": 0.0016970251745078713, "learning_rate": 9.778971746178787e-07, "loss": 0.0001, "num_tokens": 65592981.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2387, "step_time": 15.822840303182602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 144.125, "completions/mean_terminated_length": 144.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.43736421316862106, "epoch": 0.11060676238999537, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025537472683936357, "kl": 0.0021149092353880405, "learning_rate": 9.778879110699398e-07, "loss": 0.0001, "num_tokens": 65623735.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2388, "step_time": 18.052705015987158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 153.625, "completions/mean_terminated_length": 153.625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.43385789543390274, "epoch": 0.11065308012968968, "frac_reward_zero_std": 1.0, "grad_norm": 0.003613361855968833, "kl": 0.002479005604982376, "learning_rate": 9.77878647522001e-07, "loss": 0.0001, "num_tokens": 65669201.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2389, "step_time": 21.727573167532682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 256.9375, "completions/mean_terminated_length": 256.9375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.25939323380589485, "epoch": 0.11069939786938397, "frac_reward_zero_std": 0.0, "grad_norm": 0.08898063749074936, "kl": 0.003215736534912139, "learning_rate": 9.77869383974062e-07, "loss": -0.0335, "num_tokens": 65707808.0, "reward": 0.9522287249565125, "reward_std": 0.06066947430372238, "rewards/reward_func/mean": 0.9522287249565125, "rewards/reward_func/std": 0.06066947802901268, "step": 2390, "step_time": 32.65638582408428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 123.875, "completions/mean_terminated_length": 123.875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.29747195541858673, "epoch": 0.11074571560907828, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026461619418114424, "kl": 0.0016994259494822472, "learning_rate": 9.778601204261232e-07, "loss": 0.0001, "num_tokens": 65728062.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2391, "step_time": 14.331314660608768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 173.1875, "completions/mean_terminated_length": 173.1875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.18173721432685852, "epoch": 0.11079203334877258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010064563248306513, "kl": 0.0008620497246738523, "learning_rate": 9.778508568781843e-07, "loss": 0.0, "num_tokens": 65762993.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2392, "step_time": 20.52535917982459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 150.875, "completions/mean_terminated_length": 150.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.20904475450515747, "epoch": 0.11083835108846689, "frac_reward_zero_std": 1.0, "grad_norm": 0.004252149257808924, "kl": 0.003123885951936245, "learning_rate": 9.778415933302454e-07, "loss": 0.0002, "num_tokens": 65785327.0, "reward": 0.3219582736492157, "reward_std": 0.0, "rewards/reward_func/mean": 0.3219582736492157, "rewards/reward_func/std": 0.0, "step": 2393, "step_time": 16.25042412057519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 141.3125, "completions/mean_terminated_length": 141.3125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2977365180850029, "epoch": 0.11088466882816118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024261344224214554, "kl": 0.0018466077744960785, "learning_rate": 9.778323297823065e-07, "loss": 0.0001, "num_tokens": 65807556.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2394, "step_time": 15.130787659436464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 137.625, "completions/mean_terminated_length": 137.625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3205549642443657, "epoch": 0.1109309865678555, "frac_reward_zero_std": 1.0, "grad_norm": 0.005907420068979263, "kl": 0.0032924521365202963, "learning_rate": 9.778230662343677e-07, "loss": 0.0002, "num_tokens": 65828078.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2395, "step_time": 15.151044774800539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 174.8125, "completions/mean_terminated_length": 174.8125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.39984411001205444, "epoch": 0.11097730430754979, "frac_reward_zero_std": 1.0, "grad_norm": 0.008461474440991879, "kl": 0.005715583451092243, "learning_rate": 9.778138026864288e-07, "loss": 0.0003, "num_tokens": 65850715.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2396, "step_time": 17.606170926243067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 242.3125, "completions/mean_terminated_length": 242.3125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "entropy": 0.24380479007959366, "epoch": 0.1110236220472441, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033474103547632694, "kl": 0.002408111817203462, "learning_rate": 9.778045391384901e-07, "loss": 0.0001, "num_tokens": 65876736.0, "reward": 0.7326324582099915, "reward_std": 0.0, "rewards/reward_func/mean": 0.7326324582099915, "rewards/reward_func/std": 0.0, "step": 2397, "step_time": 24.573301322758198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 110.3125, "completions/mean_terminated_length": 110.3125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.26128362491726875, "epoch": 0.1110699397869384, "frac_reward_zero_std": 1.0, "grad_norm": 0.005125187337398529, "kl": 0.0023191372747533023, "learning_rate": 9.77795275590551e-07, "loss": 0.0001, "num_tokens": 65897621.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2398, "step_time": 12.474109884351492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 160.8125, "completions/mean_terminated_length": 160.8125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.3005419969558716, "epoch": 0.1111162575266327, "frac_reward_zero_std": 0.0, "grad_norm": 0.007298790384083986, "kl": 0.002545641007600352, "learning_rate": 9.777860120426122e-07, "loss": 0.0029, "num_tokens": 65918370.0, "reward": 1.9202496332582086e-05, "reward_std": 7.4958870754926465e-06, "rewards/reward_func/mean": 1.9202496332582086e-05, "rewards/reward_func/std": 7.4958875302399974e-06, "step": 2399, "step_time": 17.20006264746189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 115.75, "completions/mean_terminated_length": 115.75, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2119411677122116, "epoch": 0.111162575266327, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017727614613249898, "kl": 0.0012794640206266195, "learning_rate": 9.777767484946733e-07, "loss": 0.0001, "num_tokens": 65937806.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2400, "step_time": 13.171920608729124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 183.875, "completions/mean_terminated_length": 183.875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.24039685726165771, "epoch": 0.11120889300602131, "frac_reward_zero_std": 1.0, "grad_norm": 0.002546001924201846, "kl": 0.0018284392426721752, "learning_rate": 9.777674849467346e-07, "loss": 0.0001, "num_tokens": 65965852.0, "reward": 0.7788007855415344, "reward_std": 0.0, "rewards/reward_func/mean": 0.7788007855415344, "rewards/reward_func/std": 0.0, "step": 2401, "step_time": 20.73470561951399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 164.9375, "completions/mean_terminated_length": 164.9375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.2099083736538887, "epoch": 0.11125521074571561, "frac_reward_zero_std": 1.0, "grad_norm": 0.013738662004470825, "kl": 0.008505105972290039, "learning_rate": 9.777582213987958e-07, "loss": 0.0004, "num_tokens": 65988043.0, "reward": 0.14145326614379883, "reward_std": 0.0, "rewards/reward_func/mean": 0.14145326614379883, "rewards/reward_func/std": 0.0, "step": 2402, "step_time": 17.317905079573393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 130.0, "completions/mean_terminated_length": 130.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3355402946472168, "epoch": 0.11130152848540992, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021074803080409765, "kl": 0.001680780842434615, "learning_rate": 9.777489578508569e-07, "loss": 0.0001, "num_tokens": 66024155.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2403, "step_time": 18.197333835065365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 117.8125, "completions/mean_terminated_length": 117.8125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2485005334019661, "epoch": 0.11134784622510421, "frac_reward_zero_std": 1.0, "grad_norm": 0.003974970430135727, "kl": 0.0022931023268029094, "learning_rate": 9.77739694302918e-07, "loss": 0.0001, "num_tokens": 66043560.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2404, "step_time": 12.82517571374774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.38329140841960907, "epoch": 0.11139416396479852, "frac_reward_zero_std": 0.0, "grad_norm": 0.10050483047962189, "kl": 0.004255613719578832, "learning_rate": 9.777304307549791e-07, "loss": 0.0685, "num_tokens": 66075606.0, "reward": 0.3422679305076599, "reward_std": 0.456741601228714, "rewards/reward_func/mean": 0.3422679305076599, "rewards/reward_func/std": 0.4567416310310364, "step": 2405, "step_time": 25.608222983777523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 195.25, "completions/mean_terminated_length": 195.25, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.1917690858244896, "epoch": 0.11144048170449282, "frac_reward_zero_std": 1.0, "grad_norm": 0.002935251221060753, "kl": 0.0018773071642499417, "learning_rate": 9.777211672070403e-07, "loss": 0.0001, "num_tokens": 66113274.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2406, "step_time": 23.862790696322918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.23762915655970573, "epoch": 0.11148679944418713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015553135890513659, "kl": 0.0013095731555949897, "learning_rate": 9.777119036591014e-07, "loss": 0.0001, "num_tokens": 66134202.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2407, "step_time": 17.738466504961252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 207.375, "completions/mean_terminated_length": 207.375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.3728785365819931, "epoch": 0.11153311718388143, "frac_reward_zero_std": 0.0, "grad_norm": 0.06992531567811966, "kl": 0.0023069052840583026, "learning_rate": 9.777026401111625e-07, "loss": -0.0483, "num_tokens": 66162672.0, "reward": 0.9311447143554688, "reward_std": 0.017449283972382545, "rewards/reward_func/mean": 0.9311447143554688, "rewards/reward_func/std": 0.0174492746591568, "step": 2408, "step_time": 21.930220041424036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 164.625, "completions/mean_terminated_length": 164.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4227030873298645, "epoch": 0.11157943492357573, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027652329299598932, "kl": 0.0028306868043728173, "learning_rate": 9.776933765632236e-07, "loss": 0.0001, "num_tokens": 66193946.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2409, "step_time": 19.52329556643963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 185.9375, "completions/mean_terminated_length": 185.9375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.39986591041088104, "epoch": 0.11162575266327003, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017531183548271656, "kl": 0.0017838965286500752, "learning_rate": 9.776841130152848e-07, "loss": 0.0001, "num_tokens": 66224825.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2410, "step_time": 22.347840026021004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 166.0, "completions/mean_terminated_length": 166.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.38478899747133255, "epoch": 0.11167207040296434, "frac_reward_zero_std": 1.0, "grad_norm": 0.011761828325688839, "kl": 0.006792910513468087, "learning_rate": 9.776748494673459e-07, "loss": 0.0003, "num_tokens": 66247289.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2411, "step_time": 18.65980200096965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 131.6875, "completions/mean_terminated_length": 131.6875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.27228139340877533, "epoch": 0.11171838814265864, "frac_reward_zero_std": 1.0, "grad_norm": 0.008121551014482975, "kl": 0.0026825943496078253, "learning_rate": 9.77665585919407e-07, "loss": 0.0001, "num_tokens": 66266740.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2412, "step_time": 14.016145922243595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 125.375, "completions/mean_terminated_length": 125.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.26529572159051895, "epoch": 0.11176470588235295, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031022189650684595, "kl": 0.002264054666738957, "learning_rate": 9.776563223714681e-07, "loss": 0.0001, "num_tokens": 66289402.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2413, "step_time": 14.946750197559595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 117.125, "completions/mean_terminated_length": 117.125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.23947672545909882, "epoch": 0.11181102362204724, "frac_reward_zero_std": 1.0, "grad_norm": 0.002311090938746929, "kl": 0.0017960050900001079, "learning_rate": 9.776470588235295e-07, "loss": 0.0001, "num_tokens": 66309196.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2414, "step_time": 15.363229483366013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 175.3125, "completions/mean_terminated_length": 175.3125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4139101505279541, "epoch": 0.11185734136174155, "frac_reward_zero_std": 1.0, "grad_norm": 0.007246461696922779, "kl": 0.004397790529765189, "learning_rate": 9.776377952755906e-07, "loss": 0.0002, "num_tokens": 66345873.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2415, "step_time": 23.650139447301626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 165.0625, "completions/mean_terminated_length": 165.0625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.4154718965291977, "epoch": 0.11190365910143585, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018806760199368, "kl": 0.002184557670261711, "learning_rate": 9.776285317276517e-07, "loss": 0.0001, "num_tokens": 66398242.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2416, "step_time": 24.525605008006096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 149.4375, "completions/mean_terminated_length": 149.4375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.22699890658259392, "epoch": 0.11194997684113016, "frac_reward_zero_std": 1.0, "grad_norm": 0.001381315989419818, "kl": 0.001268629333935678, "learning_rate": 9.776192681797128e-07, "loss": 0.0001, "num_tokens": 66420873.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2417, "step_time": 17.389300521463156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 151.5, "completions/mean_terminated_length": 151.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.33007028698921204, "epoch": 0.11199629458082445, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019194837659597397, "kl": 0.0016540546203032136, "learning_rate": 9.77610004631774e-07, "loss": 0.0001, "num_tokens": 66441745.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2418, "step_time": 17.41394756361842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 194.0625, "completions/mean_terminated_length": 194.0625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.21471988409757614, "epoch": 0.11204261232051876, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034169431310147047, "kl": 0.0028793515521101654, "learning_rate": 9.77600741083835e-07, "loss": 0.0001, "num_tokens": 66470034.0, "reward": 0.5765653252601624, "reward_std": 0.0, "rewards/reward_func/mean": 0.5765653252601624, "rewards/reward_func/std": 0.0, "step": 2419, "step_time": 20.30102963745594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 137.0, "completions/mean_terminated_length": 137.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2592247426509857, "epoch": 0.11208893006021306, "frac_reward_zero_std": 1.0, "grad_norm": 0.009942702949047089, "kl": 0.0028831594972871244, "learning_rate": 9.775914775358962e-07, "loss": 0.0001, "num_tokens": 66490578.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2420, "step_time": 15.012401573359966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 236.1875, "completions/mean_terminated_length": 236.1875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.2608606889843941, "epoch": 0.11213524779990737, "frac_reward_zero_std": 0.0, "grad_norm": 0.07392553240060806, "kl": 0.003077751083765179, "learning_rate": 9.775822139879573e-07, "loss": -0.0107, "num_tokens": 66522869.0, "reward": 0.5973880887031555, "reward_std": 0.41606178879737854, "rewards/reward_func/mean": 0.5973880887031555, "rewards/reward_func/std": 0.41606178879737854, "step": 2421, "step_time": 27.28056574985385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 188.625, "completions/mean_terminated_length": 188.625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.22484597191214561, "epoch": 0.11218156553960167, "frac_reward_zero_std": 1.0, "grad_norm": 0.002945329761132598, "kl": 0.002411195106105879, "learning_rate": 9.775729504400185e-07, "loss": 0.0001, "num_tokens": 66544639.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2422, "step_time": 19.510927099734545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 235.5625, "completions/mean_terminated_length": 235.5625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.41190581023693085, "epoch": 0.11222788327929598, "frac_reward_zero_std": 0.0, "grad_norm": 0.08581096678972244, "kl": 0.006259789690375328, "learning_rate": 9.775636868920796e-07, "loss": -0.1313, "num_tokens": 66578328.0, "reward": 0.15155190229415894, "reward_std": 0.2728007137775421, "rewards/reward_func/mean": 0.15155190229415894, "rewards/reward_func/std": 0.2728007137775421, "step": 2423, "step_time": 32.226893462240696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 147.625, "completions/mean_terminated_length": 147.625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.29873067885637283, "epoch": 0.11227420101899027, "frac_reward_zero_std": 1.0, "grad_norm": 0.002238903194665909, "kl": 0.0017924793064594269, "learning_rate": 9.775544233441407e-07, "loss": 0.0001, "num_tokens": 66599282.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2424, "step_time": 16.00865687429905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 179.4375, "completions/mean_terminated_length": 179.4375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.419899620115757, "epoch": 0.11232051875868458, "frac_reward_zero_std": 1.0, "grad_norm": 0.004521338734775782, "kl": 0.0033975655678659678, "learning_rate": 9.775451597962018e-07, "loss": 0.0002, "num_tokens": 66621449.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2425, "step_time": 18.19551219791174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 178.3125, "completions/mean_terminated_length": 178.3125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3378634974360466, "epoch": 0.11236683649837888, "frac_reward_zero_std": 1.0, "grad_norm": 0.003737666178494692, "kl": 0.0030273490119725466, "learning_rate": 9.77535896248263e-07, "loss": 0.0002, "num_tokens": 66647182.0, "reward": 0.5647181272506714, "reward_std": 0.0, "rewards/reward_func/mean": 0.5647181272506714, "rewards/reward_func/std": 0.0, "step": 2426, "step_time": 21.057276505976915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 131.125, "completions/mean_terminated_length": 131.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2890597730875015, "epoch": 0.11241315423807319, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018485448090359569, "kl": 0.0013900745252612978, "learning_rate": 9.775266327003243e-07, "loss": 0.0001, "num_tokens": 66683168.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2427, "step_time": 18.160382740199566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 154.75, "completions/mean_terminated_length": 154.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.4234418570995331, "epoch": 0.11245947197776748, "frac_reward_zero_std": 1.0, "grad_norm": 0.002827100455760956, "kl": 0.0024213457363657653, "learning_rate": 9.775173691523854e-07, "loss": 0.0001, "num_tokens": 66745916.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2428, "step_time": 28.01572649553418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 125.9375, "completions/mean_terminated_length": 125.9375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3015022277832031, "epoch": 0.1125057897174618, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028600769583135843, "kl": 0.0019417548028286546, "learning_rate": 9.775081056044463e-07, "loss": 0.0001, "num_tokens": 66766667.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2429, "step_time": 13.572780143469572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 198.0625, "completions/mean_terminated_length": 198.0625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.3871123269200325, "epoch": 0.11255210745715609, "frac_reward_zero_std": 0.0, "grad_norm": 0.09472401440143585, "kl": 0.007698494824580848, "learning_rate": 9.774988420565075e-07, "loss": -0.1976, "num_tokens": 66805388.0, "reward": 0.05747421085834503, "reward_std": 0.22989685833454132, "rewards/reward_func/mean": 0.05747421085834503, "rewards/reward_func/std": 0.22989685833454132, "step": 2430, "step_time": 32.72367901727557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 140.5, "completions/mean_terminated_length": 140.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.30869461596012115, "epoch": 0.1125984251968504, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023353789001703262, "kl": 0.0014139002305455506, "learning_rate": 9.774895785085688e-07, "loss": 0.0001, "num_tokens": 66825540.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2431, "step_time": 14.9496890604496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 158.25, "completions/mean_terminated_length": 158.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3768110051751137, "epoch": 0.1126447429365447, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021692905575037003, "kl": 0.0018430263153277338, "learning_rate": 9.7748031496063e-07, "loss": 0.0001, "num_tokens": 66852248.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2432, "step_time": 18.83238858729601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 181.8125, "completions/mean_terminated_length": 181.8125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.2170022577047348, "epoch": 0.112691060676239, "frac_reward_zero_std": 1.0, "grad_norm": 0.002259301021695137, "kl": 0.001681604073382914, "learning_rate": 9.77471051412691e-07, "loss": 0.0001, "num_tokens": 66874341.0, "reward": 0.5934875011444092, "reward_std": 0.0, "rewards/reward_func/mean": 0.5934875011444092, "rewards/reward_func/std": 0.0, "step": 2433, "step_time": 19.728549901396036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 175.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.42328501492738724, "epoch": 0.1127373784159333, "frac_reward_zero_std": 1.0, "grad_norm": 0.011259117163717747, "kl": 0.0062677806708961725, "learning_rate": 9.774617878647522e-07, "loss": 0.0003, "num_tokens": 66903013.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2434, "step_time": 20.135791525244713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 240.6875, "completions/mean_terminated_length": 240.6875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.2761561721563339, "epoch": 0.11278369615562761, "frac_reward_zero_std": 0.0, "grad_norm": 0.07958658784627914, "kl": 0.008481328841298819, "learning_rate": 9.774525243168133e-07, "loss": 0.0165, "num_tokens": 66932784.0, "reward": 0.8905339241027832, "reward_std": 0.04503028094768524, "rewards/reward_func/mean": 0.8905339241027832, "rewards/reward_func/std": 0.04503028839826584, "step": 2435, "step_time": 24.84669253230095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 198.25, "completions/mean_terminated_length": 198.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.40073180198669434, "epoch": 0.1128300138953219, "frac_reward_zero_std": 0.0, "grad_norm": 0.11505474150180817, "kl": 0.006373452895786613, "learning_rate": 9.774432607688744e-07, "loss": 0.0427, "num_tokens": 66969812.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 2436, "step_time": 24.155964501202106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 186.125, "completions/mean_terminated_length": 186.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.26683806255459785, "epoch": 0.11287633163501622, "frac_reward_zero_std": 1.0, "grad_norm": 0.002324956003576517, "kl": 0.0019464888609945774, "learning_rate": 9.774339972209356e-07, "loss": 0.0001, "num_tokens": 67008854.0, "reward": 0.022873464971780777, "reward_std": 0.0, "rewards/reward_func/mean": 0.022873464971780777, "rewards/reward_func/std": 0.0, "step": 2437, "step_time": 23.67304378002882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 215.9375, "completions/mean_terminated_length": 215.9375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.40574753284454346, "epoch": 0.11292264937471051, "frac_reward_zero_std": 0.0, "grad_norm": 0.08100494742393494, "kl": 0.0038533667102456093, "learning_rate": 9.774247336729967e-07, "loss": -0.1, "num_tokens": 67035349.0, "reward": 0.029568437486886978, "reward_std": 0.09081331640481949, "rewards/reward_func/mean": 0.029568437486886978, "rewards/reward_func/std": 0.09081331640481949, "step": 2438, "step_time": 25.900131553411484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 132.875, "completions/mean_terminated_length": 132.875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.21693570539355278, "epoch": 0.11296896711440482, "frac_reward_zero_std": 0.0, "grad_norm": 0.15751352906227112, "kl": 0.0033534825779497623, "learning_rate": 9.774154701250578e-07, "loss": -0.0064, "num_tokens": 67058275.0, "reward": 0.9160261154174805, "reward_std": 0.027147367596626282, "rewards/reward_func/mean": 0.9160261154174805, "rewards/reward_func/std": 0.027147362008690834, "step": 2439, "step_time": 14.807053968310356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 215.9375, "completions/mean_terminated_length": 215.9375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.2589073069393635, "epoch": 0.11301528485409912, "frac_reward_zero_std": 0.0, "grad_norm": 0.1171020120382309, "kl": 0.0051482305862009525, "learning_rate": 9.77406206577119e-07, "loss": -0.1343, "num_tokens": 67096722.0, "reward": 0.23852992057800293, "reward_std": 0.39258697628974915, "rewards/reward_func/mean": 0.23852992057800293, "rewards/reward_func/std": 0.39258700609207153, "step": 2440, "step_time": 27.961682315915823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 127.6875, "completions/mean_terminated_length": 127.6875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.30557843297719955, "epoch": 0.11306160259379343, "frac_reward_zero_std": 1.0, "grad_norm": 0.004527962300926447, "kl": 0.0023293305130209774, "learning_rate": 9.7739694302918e-07, "loss": 0.0001, "num_tokens": 67116317.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2441, "step_time": 13.561669934540987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 134.6875, "completions/mean_terminated_length": 134.6875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3187221437692642, "epoch": 0.11310792033348772, "frac_reward_zero_std": 1.0, "grad_norm": 0.003435620805248618, "kl": 0.001834153721574694, "learning_rate": 9.773876794812412e-07, "loss": 0.0001, "num_tokens": 67145624.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2442, "step_time": 17.71932018175721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 156.9375, "completions/mean_terminated_length": 156.9375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.42840543389320374, "epoch": 0.11315423807318203, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034899311140179634, "kl": 0.002661260892637074, "learning_rate": 9.773784159333023e-07, "loss": 0.0001, "num_tokens": 67190375.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2443, "step_time": 23.76982979103923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 160.1875, "completions/mean_terminated_length": 160.1875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.2066311314702034, "epoch": 0.11320055581287633, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016302632866427302, "kl": 0.0012858854897785932, "learning_rate": 9.773691523853636e-07, "loss": 0.0001, "num_tokens": 67217018.0, "reward": 0.6227038502693176, "reward_std": 0.0, "rewards/reward_func/mean": 0.6227038502693176, "rewards/reward_func/std": 0.0, "step": 2444, "step_time": 17.93243756890297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 181.3125, "completions/mean_terminated_length": 181.3125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3939415439963341, "epoch": 0.11324687355257064, "frac_reward_zero_std": 1.0, "grad_norm": 0.01033829990774393, "kl": 0.004616707505192608, "learning_rate": 9.773598888374248e-07, "loss": 0.0002, "num_tokens": 67252991.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2445, "step_time": 22.318165626376867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 220.1875, "completions/mean_terminated_length": 220.1875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.4108148366212845, "epoch": 0.11329319129226494, "frac_reward_zero_std": 0.0, "grad_norm": 0.09378018230199814, "kl": 0.003654955537058413, "learning_rate": 9.773506252894859e-07, "loss": -0.0269, "num_tokens": 67285250.0, "reward": 0.11413758993148804, "reward_std": 0.31188327074050903, "rewards/reward_func/mean": 0.11413758993148804, "rewards/reward_func/std": 0.31188327074050903, "step": 2446, "step_time": 23.56105723977089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 120.3125, "completions/mean_terminated_length": 120.3125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.29247353225946426, "epoch": 0.11333950903195925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023144776932895184, "kl": 0.0017470692691858858, "learning_rate": 9.77341361741547e-07, "loss": 0.0001, "num_tokens": 67309751.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2447, "step_time": 15.461145281791687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 214.5, "completions/mean_terminated_length": 214.5, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.3118080571293831, "epoch": 0.11338582677165354, "frac_reward_zero_std": 0.0, "grad_norm": 0.09761524945497513, "kl": 0.0035818603937514126, "learning_rate": 9.773320981936081e-07, "loss": -0.0051, "num_tokens": 67333135.0, "reward": 0.0625, "reward_std": 0.25, "rewards/reward_func/mean": 0.0625, "rewards/reward_func/std": 0.25, "step": 2448, "step_time": 22.700984682887793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 158.3125, "completions/mean_terminated_length": 158.3125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.17893319949507713, "epoch": 0.11343214451134785, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028546981047838926, "kl": 0.00166060306946747, "learning_rate": 9.773228346456693e-07, "loss": 0.0001, "num_tokens": 67362628.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2449, "step_time": 17.541016452014446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 161.75, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.39219270646572113, "epoch": 0.11347846225104215, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016609176527708769, "kl": 0.0018543552432674915, "learning_rate": 9.773135710977304e-07, "loss": 0.0001, "num_tokens": 67396688.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2450, "step_time": 20.098556522279978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 116.0, "completions/max_terminated_length": 116.0, "completions/mean_length": 100.8125, "completions/mean_terminated_length": 100.8125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.253155704587698, "epoch": 0.11352477999073646, "frac_reward_zero_std": 1.0, "grad_norm": 0.006407290231436491, "kl": 0.001897846581414342, "learning_rate": 9.773043075497915e-07, "loss": 0.0001, "num_tokens": 67415869.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2451, "step_time": 11.394967649132013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 203.5625, "completions/mean_terminated_length": 203.5625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.24599649012088776, "epoch": 0.11357109773043075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018369624158367515, "kl": 0.0014612114755436778, "learning_rate": 9.772950440018526e-07, "loss": 0.0001, "num_tokens": 67450502.0, "reward": 0.11362193524837494, "reward_std": 0.0, "rewards/reward_func/mean": 0.11362193524837494, "rewards/reward_func/std": 0.0, "step": 2452, "step_time": 25.18342625722289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 185.9375, "completions/mean_terminated_length": 185.9375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.18431689217686653, "epoch": 0.11361741547012506, "frac_reward_zero_std": 0.0, "grad_norm": 0.08234444260597229, "kl": 0.001167101989267394, "learning_rate": 9.772857804539138e-07, "loss": 0.0121, "num_tokens": 67472933.0, "reward": 0.9698399305343628, "reward_std": 0.03531983122229576, "rewards/reward_func/mean": 0.9698399305343628, "rewards/reward_func/std": 0.03531982749700546, "step": 2453, "step_time": 20.30852472409606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 163.5625, "completions/mean_terminated_length": 163.5625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3077933043241501, "epoch": 0.11366373320981936, "frac_reward_zero_std": 1.0, "grad_norm": 0.002004193374887109, "kl": 0.0016860596660990268, "learning_rate": 9.772765169059749e-07, "loss": 0.0001, "num_tokens": 67494798.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2454, "step_time": 19.051043465733528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 127.0625, "completions/mean_terminated_length": 127.0625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3176931366324425, "epoch": 0.11371005094951367, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023375828750431538, "kl": 0.0020944648422300816, "learning_rate": 9.77267253358036e-07, "loss": 0.0001, "num_tokens": 67523487.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2455, "step_time": 16.009984277188778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 194.5, "completions/mean_terminated_length": 194.5, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.37010613828897476, "epoch": 0.11375636868920797, "frac_reward_zero_std": 1.0, "grad_norm": 0.0046645901165902615, "kl": 0.0030380228417925537, "learning_rate": 9.772579898100971e-07, "loss": 0.0002, "num_tokens": 67545591.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2456, "step_time": 20.34482028707862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 149.875, "completions/mean_terminated_length": 149.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.2914397120475769, "epoch": 0.11380268642890227, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021178536117076874, "kl": 0.0018803998245857656, "learning_rate": 9.772487262621585e-07, "loss": 0.0001, "num_tokens": 67581829.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2457, "step_time": 20.24321200698614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 129.1875, "completions/mean_terminated_length": 129.1875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.29282809793949127, "epoch": 0.11384900416859657, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017254592385143042, "kl": 0.0013090102875139564, "learning_rate": 9.772394627142196e-07, "loss": 0.0001, "num_tokens": 67604856.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2458, "step_time": 14.605748381465673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 120.25, "completions/mean_terminated_length": 120.25, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2606007978320122, "epoch": 0.11389532190829088, "frac_reward_zero_std": 1.0, "grad_norm": 0.004506888799369335, "kl": 0.0016115727194119245, "learning_rate": 9.772301991662807e-07, "loss": 0.0001, "num_tokens": 67627132.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2459, "step_time": 14.128421925008297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 135.6875, "completions/mean_terminated_length": 135.6875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2472737394273281, "epoch": 0.11394163964798518, "frac_reward_zero_std": 1.0, "grad_norm": 0.004102014470845461, "kl": 0.0022828959627076983, "learning_rate": 9.772209356183416e-07, "loss": 0.0001, "num_tokens": 67646807.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2460, "step_time": 14.82594895362854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 184.125, "completions/mean_terminated_length": 184.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.41472016274929047, "epoch": 0.11398795738767949, "frac_reward_zero_std": 1.0, "grad_norm": 0.002807251876220107, "kl": 0.0025067999376915395, "learning_rate": 9.77211672070403e-07, "loss": 0.0001, "num_tokens": 67689193.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2461, "step_time": 22.917190868407488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 156.6875, "completions/mean_terminated_length": 156.6875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.24410531669855118, "epoch": 0.11403427512737378, "frac_reward_zero_std": 1.0, "grad_norm": 0.002435762668028474, "kl": 0.0017704214551486075, "learning_rate": 9.77202408522464e-07, "loss": 0.0001, "num_tokens": 67709908.0, "reward": 0.8242367506027222, "reward_std": 0.0, "rewards/reward_func/mean": 0.8242367506027222, "rewards/reward_func/std": 0.0, "step": 2462, "step_time": 17.05655563622713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 188.875, "completions/mean_terminated_length": 188.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.39517487585544586, "epoch": 0.11408059286706809, "frac_reward_zero_std": 0.0, "grad_norm": 0.10505260527133942, "kl": 0.002370894537307322, "learning_rate": 9.771931449745252e-07, "loss": 0.0833, "num_tokens": 67730962.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 2463, "step_time": 22.13871493563056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 159.5625, "completions/mean_terminated_length": 159.5625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3184591829776764, "epoch": 0.11412691060676239, "frac_reward_zero_std": 1.0, "grad_norm": 0.003013376612216234, "kl": 0.0021464183810167015, "learning_rate": 9.771838814265863e-07, "loss": 0.0001, "num_tokens": 67754427.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2464, "step_time": 16.785066470503807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 236.5625, "completions/mean_terminated_length": 236.5625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.4302835613489151, "epoch": 0.1141732283464567, "frac_reward_zero_std": 0.0, "grad_norm": 0.09732300043106079, "kl": 0.0053596136276610196, "learning_rate": 9.771746178786475e-07, "loss": -0.0337, "num_tokens": 67778484.0, "reward": 0.11073075234889984, "reward_std": 0.3025740087032318, "rewards/reward_func/mean": 0.11073075234889984, "rewards/reward_func/std": 0.3025740087032318, "step": 2465, "step_time": 28.65038428083062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 163.0625, "completions/mean_terminated_length": 163.0625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.19304820522665977, "epoch": 0.114219546086151, "frac_reward_zero_std": 1.0, "grad_norm": 0.001923121279105544, "kl": 0.0011828722199425101, "learning_rate": 9.771653543307086e-07, "loss": 0.0001, "num_tokens": 67800149.0, "reward": 0.8464817404747009, "reward_std": 0.0, "rewards/reward_func/mean": 0.8464817404747009, "rewards/reward_func/std": 0.0, "step": 2466, "step_time": 17.765714410692453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 156.1875, "completions/mean_terminated_length": 156.1875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.27853118628263474, "epoch": 0.1142658638258453, "frac_reward_zero_std": 1.0, "grad_norm": 0.007093959022313356, "kl": 0.003363996569532901, "learning_rate": 9.771560907827697e-07, "loss": 0.0002, "num_tokens": 67823848.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2467, "step_time": 18.3814713396132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.44477756321430206, "epoch": 0.1143121815655396, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014332630671560764, "kl": 0.0017497410881333053, "learning_rate": 9.771468272348308e-07, "loss": 0.0001, "num_tokens": 67871458.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2468, "step_time": 23.765099693089724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 175.6875, "completions/mean_terminated_length": 175.6875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.191434808075428, "epoch": 0.11435849930523391, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014370506396517158, "kl": 0.001366449665511027, "learning_rate": 9.77137563686892e-07, "loss": 0.0001, "num_tokens": 67909229.0, "reward": 0.9214109182357788, "reward_std": 0.0, "rewards/reward_func/mean": 0.9214109182357788, "rewards/reward_func/std": 0.0, "step": 2469, "step_time": 22.32357655465603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 212.1875, "completions/mean_terminated_length": 212.1875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.3637712076306343, "epoch": 0.1144048170449282, "frac_reward_zero_std": 0.0, "grad_norm": 0.07481560856103897, "kl": 0.005705999210476875, "learning_rate": 9.77128300138953e-07, "loss": -0.003, "num_tokens": 67933520.0, "reward": 0.17361770570278168, "reward_std": 0.3732668459415436, "rewards/reward_func/mean": 0.17361770570278168, "rewards/reward_func/std": 0.3732668459415436, "step": 2470, "step_time": 24.43019162490964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 201.625, "completions/mean_terminated_length": 201.625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.40962257981300354, "epoch": 0.11445113478462252, "frac_reward_zero_std": 1.0, "grad_norm": 0.007986698299646378, "kl": 0.0065677674720063806, "learning_rate": 9.771190365910144e-07, "loss": 0.0003, "num_tokens": 67959370.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2471, "step_time": 21.364749550819397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 204.9375, "completions/mean_terminated_length": 204.9375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.33088134974241257, "epoch": 0.11449745252431681, "frac_reward_zero_std": 0.0, "grad_norm": 0.10680411756038666, "kl": 0.0036308920243754983, "learning_rate": 9.771097730430753e-07, "loss": 0.0064, "num_tokens": 67987625.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 2472, "step_time": 22.0128981359303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 155.0625, "completions/mean_terminated_length": 155.0625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.1786942407488823, "epoch": 0.11454377026401112, "frac_reward_zero_std": 1.0, "grad_norm": 0.005208641290664673, "kl": 0.0022907425882294774, "learning_rate": 9.771005094951365e-07, "loss": 0.0001, "num_tokens": 68009706.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 2473, "step_time": 15.87685788795352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 125.5625, "completions/mean_terminated_length": 125.5625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.1408357098698616, "epoch": 0.11459008800370542, "frac_reward_zero_std": 1.0, "grad_norm": 0.003429030068218708, "kl": 0.0022986862750258297, "learning_rate": 9.770912459471978e-07, "loss": 0.0001, "num_tokens": 68029555.0, "reward": 0.51341712474823, "reward_std": 0.0, "rewards/reward_func/mean": 0.51341712474823, "rewards/reward_func/std": 0.0, "step": 2474, "step_time": 12.78066784888506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 193.625, "completions/mean_terminated_length": 193.625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.42769351601600647, "epoch": 0.11463640574339973, "frac_reward_zero_std": 1.0, "grad_norm": 0.001790591748431325, "kl": 0.0019036740413866937, "learning_rate": 9.77081982399259e-07, "loss": 0.0001, "num_tokens": 68057197.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2475, "step_time": 21.227740541100502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 122.375, "completions/mean_terminated_length": 122.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.32360443472862244, "epoch": 0.11468272348309402, "frac_reward_zero_std": 1.0, "grad_norm": 0.008208557032048702, "kl": 0.003330700274091214, "learning_rate": 9.7707271885132e-07, "loss": 0.0002, "num_tokens": 68077907.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2476, "step_time": 14.623869501054287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 191.9375, "completions/mean_terminated_length": 191.9375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4546266943216324, "epoch": 0.11472904122278833, "frac_reward_zero_std": 1.0, "grad_norm": 0.006665271706879139, "kl": 0.004630137351341546, "learning_rate": 9.770634553033812e-07, "loss": 0.0002, "num_tokens": 68107554.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2477, "step_time": 21.555527418851852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 162.125, "completions/mean_terminated_length": 162.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.27639149501919746, "epoch": 0.11477535896248263, "frac_reward_zero_std": 1.0, "grad_norm": 0.004952006973326206, "kl": 0.0035169259645044804, "learning_rate": 9.770541917554423e-07, "loss": 0.0002, "num_tokens": 68131668.0, "reward": 0.8751733303070068, "reward_std": 0.0, "rewards/reward_func/mean": 0.8751733303070068, "rewards/reward_func/std": 0.0, "step": 2478, "step_time": 18.94213031604886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 177.125, "completions/mean_terminated_length": 177.125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3737163841724396, "epoch": 0.11482167670217694, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038529098965227604, "kl": 0.0026498447987250984, "learning_rate": 9.770449282075034e-07, "loss": 0.0001, "num_tokens": 68157302.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2479, "step_time": 19.80051399767399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 194.4375, "completions/mean_terminated_length": 194.4375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.30508412420749664, "epoch": 0.11486799444187124, "frac_reward_zero_std": 1.0, "grad_norm": 0.007849538698792458, "kl": 0.0031641992973163724, "learning_rate": 9.770356646595646e-07, "loss": 0.0002, "num_tokens": 68192989.0, "reward": 0.5081327557563782, "reward_std": 0.0, "rewards/reward_func/mean": 0.5081327557563782, "rewards/reward_func/std": 0.0, "step": 2480, "step_time": 22.772242203354836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 155.4375, "completions/mean_terminated_length": 155.4375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.3316722735762596, "epoch": 0.11491431218156554, "frac_reward_zero_std": 1.0, "grad_norm": 0.002754985122010112, "kl": 0.0022676190128549933, "learning_rate": 9.770264011116257e-07, "loss": 0.0001, "num_tokens": 68217060.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2481, "step_time": 16.53058822080493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 246.0, "completions/mean_terminated_length": 246.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.3607417270541191, "epoch": 0.11496062992125984, "frac_reward_zero_std": 0.0, "grad_norm": 0.10553093254566193, "kl": 0.01073522213846445, "learning_rate": 9.770171375636868e-07, "loss": -0.1059, "num_tokens": 68251236.0, "reward": 0.3325149118900299, "reward_std": 0.4026811718940735, "rewards/reward_func/mean": 0.3325149118900299, "rewards/reward_func/std": 0.4026811718940735, "step": 2482, "step_time": 30.127475015819073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 147.625, "completions/mean_terminated_length": 147.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3244210407137871, "epoch": 0.11500694766095415, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023280037567019463, "kl": 0.0018257657357025892, "learning_rate": 9.77007874015748e-07, "loss": 0.0001, "num_tokens": 68276110.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2483, "step_time": 18.436642192304134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 135.8125, "completions/mean_terminated_length": 135.8125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.20642798021435738, "epoch": 0.11505326540064845, "frac_reward_zero_std": 1.0, "grad_norm": 0.002512483624741435, "kl": 0.0014322706556413323, "learning_rate": 9.769986104678093e-07, "loss": 0.0001, "num_tokens": 68295835.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2484, "step_time": 14.553882360458374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 145.625, "completions/mean_terminated_length": 145.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.32794320583343506, "epoch": 0.11509958314034276, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017755960579961538, "kl": 0.0019293739460408688, "learning_rate": 9.769893469198702e-07, "loss": 0.0001, "num_tokens": 68327093.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2485, "step_time": 18.01482929289341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 157.0625, "completions/mean_terminated_length": 157.0625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4321519210934639, "epoch": 0.11514590088003705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014660722808912396, "kl": 0.002090283203870058, "learning_rate": 9.769800833719313e-07, "loss": 0.0001, "num_tokens": 68370150.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2486, "step_time": 21.97736431285739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 158.125, "completions/mean_terminated_length": 158.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.26701557636260986, "epoch": 0.11519221861973136, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029601918067783117, "kl": 0.0023622983135282993, "learning_rate": 9.769708198239926e-07, "loss": 0.0001, "num_tokens": 68393512.0, "reward": 0.7958667874336243, "reward_std": 0.0, "rewards/reward_func/mean": 0.7958667874336243, "rewards/reward_func/std": 0.0, "step": 2487, "step_time": 16.45381862297654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 173.4375, "completions/mean_terminated_length": 173.4375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.29334940016269684, "epoch": 0.11523853635942566, "frac_reward_zero_std": 0.0, "grad_norm": 0.121180459856987, "kl": 0.0024091697414405644, "learning_rate": 9.769615562760538e-07, "loss": -0.0464, "num_tokens": 68417407.0, "reward": 0.18374274671077728, "reward_std": 0.010824820958077908, "rewards/reward_func/mean": 0.18374274671077728, "rewards/reward_func/std": 0.010824819095432758, "step": 2488, "step_time": 18.614742059260607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 112.25, "completions/mean_terminated_length": 112.25, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.26405394822359085, "epoch": 0.11528485409911997, "frac_reward_zero_std": 1.0, "grad_norm": 0.002262190915644169, "kl": 0.0015944482001941651, "learning_rate": 9.769522927281149e-07, "loss": 0.0001, "num_tokens": 68440739.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2489, "step_time": 13.781056050211191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 137.6875, "completions/mean_terminated_length": 137.6875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3129817843437195, "epoch": 0.11533117183881426, "frac_reward_zero_std": 1.0, "grad_norm": 0.002422789577394724, "kl": 0.001710813317913562, "learning_rate": 9.76943029180176e-07, "loss": 0.0001, "num_tokens": 68462494.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2490, "step_time": 14.91152261197567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 160.875, "completions/mean_terminated_length": 160.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.4279804676771164, "epoch": 0.11537748957850857, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019432164262980223, "kl": 0.0018543907208368182, "learning_rate": 9.769337656322371e-07, "loss": 0.0001, "num_tokens": 68495004.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2491, "step_time": 21.28641689941287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 183.0, "completions/mean_terminated_length": 183.0, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.35045985877513885, "epoch": 0.11542380731820287, "frac_reward_zero_std": 1.0, "grad_norm": 0.009118014015257359, "kl": 0.005484711611643434, "learning_rate": 9.769245020842983e-07, "loss": 0.0003, "num_tokens": 68532764.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2492, "step_time": 22.926713228225708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 199.5625, "completions/mean_terminated_length": 199.5625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.32855086028575897, "epoch": 0.11547012505789718, "frac_reward_zero_std": 0.0, "grad_norm": 0.10711812973022461, "kl": 0.008030668599531054, "learning_rate": 9.769152385363594e-07, "loss": -0.0518, "num_tokens": 68570245.0, "reward": 0.625, "reward_std": 0.5, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5, "step": 2493, "step_time": 24.291283402591944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 179.5625, "completions/mean_terminated_length": 179.5625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.384690523147583, "epoch": 0.11551644279759148, "frac_reward_zero_std": 1.0, "grad_norm": 0.0048539177514612675, "kl": 0.0027220979100093246, "learning_rate": 9.769059749884205e-07, "loss": 0.0001, "num_tokens": 68601886.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2494, "step_time": 21.380510710179806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 135.25, "completions/mean_terminated_length": 135.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.2969150096178055, "epoch": 0.11556276053728579, "frac_reward_zero_std": 1.0, "grad_norm": 0.0058605242520570755, "kl": 0.002608942857477814, "learning_rate": 9.768967114404816e-07, "loss": 0.0001, "num_tokens": 68624962.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2495, "step_time": 14.64000740274787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 213.0, "completions/mean_terminated_length": 213.0, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.28155994415283203, "epoch": 0.11560907827698008, "frac_reward_zero_std": 0.0, "grad_norm": 0.12155213952064514, "kl": 0.0028275814838707447, "learning_rate": 9.768874478925428e-07, "loss": 0.0209, "num_tokens": 68653154.0, "reward": 0.9307626485824585, "reward_std": 0.05584240332245827, "rewards/reward_func/mean": 0.9307626485824585, "rewards/reward_func/std": 0.05584241822361946, "step": 2496, "step_time": 21.322447326034307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 178.6875, "completions/mean_terminated_length": 178.6875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.3451504111289978, "epoch": 0.11565539601667439, "frac_reward_zero_std": 1.0, "grad_norm": 0.00205622217617929, "kl": 0.0020811408176086843, "learning_rate": 9.768781843446039e-07, "loss": 0.0001, "num_tokens": 68683149.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2497, "step_time": 19.86602247133851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 195.6875, "completions/mean_terminated_length": 195.6875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.22228306159377098, "epoch": 0.11570171375636869, "frac_reward_zero_std": 0.0, "grad_norm": 0.10164478421211243, "kl": 0.0013924910745117813, "learning_rate": 9.76868920796665e-07, "loss": 0.0231, "num_tokens": 68705768.0, "reward": 0.8223152160644531, "reward_std": 0.05801927670836449, "rewards/reward_func/mean": 0.8223152160644531, "rewards/reward_func/std": 0.058019280433654785, "step": 2498, "step_time": 19.609724581241608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 158.3125, "completions/mean_terminated_length": 158.3125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.10717789456248283, "epoch": 0.115748031496063, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024656946770846844, "kl": 0.0008034743805183098, "learning_rate": 9.768596572487261e-07, "loss": 0.0, "num_tokens": 68744029.0, "reward": 0.7598356604576111, "reward_std": 0.0, "rewards/reward_func/mean": 0.7598356604576111, "rewards/reward_func/std": 0.0, "step": 2499, "step_time": 19.850645527243614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 152.9375, "completions/mean_terminated_length": 152.9375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.24117330461740494, "epoch": 0.1157943492357573, "frac_reward_zero_std": 1.0, "grad_norm": 0.00201165908947587, "kl": 0.001557450246764347, "learning_rate": 9.768503937007873e-07, "loss": 0.0001, "num_tokens": 68765308.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2500, "step_time": 16.42570138722658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 166.125, "completions/mean_terminated_length": 166.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.380124993622303, "epoch": 0.1158406669754516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0053825657814741135, "kl": 0.003818499739281833, "learning_rate": 9.768411301528486e-07, "loss": 0.0002, "num_tokens": 68789294.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2501, "step_time": 20.48052605614066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 251.5625, "completions/mean_terminated_length": 251.5625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.20027782395482063, "epoch": 0.1158869847151459, "frac_reward_zero_std": 0.0, "grad_norm": 0.12447930872440338, "kl": 0.007670757360756397, "learning_rate": 9.768318666049097e-07, "loss": -0.1226, "num_tokens": 68815015.0, "reward": 0.904274046421051, "reward_std": 0.2615731358528137, "rewards/reward_func/mean": 0.904274046421051, "rewards/reward_func/std": 0.2615731358528137, "step": 2502, "step_time": 25.985407132655382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 128.1875, "completions/mean_terminated_length": 128.1875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3250097781419754, "epoch": 0.11593330245484021, "frac_reward_zero_std": 1.0, "grad_norm": 0.003276855917647481, "kl": 0.002020838321186602, "learning_rate": 9.768226030569706e-07, "loss": 0.0001, "num_tokens": 68835066.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2503, "step_time": 14.984255533665419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 112.6875, "completions/mean_terminated_length": 112.6875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.28627268224954605, "epoch": 0.1159796201945345, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021926662884652615, "kl": 0.0016013141721487045, "learning_rate": 9.76813339509032e-07, "loss": 0.0001, "num_tokens": 68856005.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2504, "step_time": 13.714712552726269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 135.125, "completions/mean_terminated_length": 135.125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.26569344848394394, "epoch": 0.11602593793422881, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037235531490296125, "kl": 0.0022294174996204674, "learning_rate": 9.76804075961093e-07, "loss": 0.0001, "num_tokens": 68878823.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2505, "step_time": 15.604820631444454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 185.625, "completions/mean_terminated_length": 185.625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.24384716153144836, "epoch": 0.11607225567392311, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025913529098033905, "kl": 0.002238882676465437, "learning_rate": 9.767948124131542e-07, "loss": 0.0001, "num_tokens": 68903745.0, "reward": 0.5384570360183716, "reward_std": 0.0, "rewards/reward_func/mean": 0.5384570360183716, "rewards/reward_func/std": 0.0, "step": 2506, "step_time": 19.494688913226128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 123.9375, "completions/mean_terminated_length": 123.9375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2504768893122673, "epoch": 0.11611857341361742, "frac_reward_zero_std": 1.0, "grad_norm": 0.003581949509680271, "kl": 0.0018356178188696504, "learning_rate": 9.767855488652154e-07, "loss": 0.0001, "num_tokens": 68924256.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2507, "step_time": 13.948278229683638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 194.6875, "completions/mean_terminated_length": 194.6875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4146835431456566, "epoch": 0.11616489115331172, "frac_reward_zero_std": 1.0, "grad_norm": 0.004226749762892723, "kl": 0.0034904314088635147, "learning_rate": 9.767762853172765e-07, "loss": 0.0002, "num_tokens": 68953003.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2508, "step_time": 22.90442780032754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 171.5, "completions/mean_terminated_length": 171.5, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.17129074409604073, "epoch": 0.11621120889300603, "frac_reward_zero_std": 1.0, "grad_norm": 0.003777128178626299, "kl": 0.002778336522169411, "learning_rate": 9.767670217693376e-07, "loss": 0.0001, "num_tokens": 68975363.0, "reward": 0.7091062068939209, "reward_std": 0.0, "rewards/reward_func/mean": 0.7091062068939209, "rewards/reward_func/std": 0.0, "step": 2509, "step_time": 17.512671183794737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 151.5, "completions/mean_terminated_length": 151.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.1635386347770691, "epoch": 0.11625752663270032, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027541702147573233, "kl": 0.001805927458917722, "learning_rate": 9.767577582213987e-07, "loss": 0.0001, "num_tokens": 68996411.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2510, "step_time": 15.2217398583889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 141.3125, "completions/mean_terminated_length": 141.3125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.22284924611449242, "epoch": 0.11630384437239463, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014736582525074482, "kl": 0.0010546468110987917, "learning_rate": 9.767484946734599e-07, "loss": 0.0001, "num_tokens": 69016128.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2511, "step_time": 15.050313018262386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 132.5, "completions/mean_terminated_length": 132.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3137256056070328, "epoch": 0.11635016211208893, "frac_reward_zero_std": 1.0, "grad_norm": 0.004238894674926996, "kl": 0.0024793065967969596, "learning_rate": 9.76739231125521e-07, "loss": 0.0001, "num_tokens": 69040616.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2512, "step_time": 15.094173938035965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 121.3125, "completions/mean_terminated_length": 121.3125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2705349326133728, "epoch": 0.11639647985178324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013780392473563552, "kl": 0.0012340772082097828, "learning_rate": 9.76729967577582e-07, "loss": 0.0001, "num_tokens": 69063181.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2513, "step_time": 14.087721854448318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 190.8125, "completions/mean_terminated_length": 190.8125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.15664460882544518, "epoch": 0.11644279759147753, "frac_reward_zero_std": 1.0, "grad_norm": 0.003150130156427622, "kl": 0.0015929357905406505, "learning_rate": 9.767207040296434e-07, "loss": 0.0001, "num_tokens": 69100618.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2514, "step_time": 21.97874530404806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 139.75, "completions/mean_terminated_length": 139.75, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3434393182396889, "epoch": 0.11648911533117184, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027103142347186804, "kl": 0.0019133105815853924, "learning_rate": 9.767114404817044e-07, "loss": 0.0001, "num_tokens": 69125222.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2515, "step_time": 16.80498769879341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 156.75, "completions/mean_terminated_length": 156.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3457399159669876, "epoch": 0.11653543307086614, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031414953991770744, "kl": 0.0023865659604780376, "learning_rate": 9.767021769337655e-07, "loss": 0.0001, "num_tokens": 69150946.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2516, "step_time": 17.40716779232025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 150.0, "completions/mean_terminated_length": 150.0, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.22851867228746414, "epoch": 0.11658175081056045, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013847298687323928, "kl": 0.001284622703678906, "learning_rate": 9.766929133858268e-07, "loss": 0.0001, "num_tokens": 69171474.0, "reward": 0.3384654223918915, "reward_std": 0.0, "rewards/reward_func/mean": 0.3384654223918915, "rewards/reward_func/std": 0.0, "step": 2517, "step_time": 16.074111629277468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 153.25, "completions/mean_terminated_length": 153.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3559677377343178, "epoch": 0.11662806855025475, "frac_reward_zero_std": 1.0, "grad_norm": 0.003983703907579184, "kl": 0.00274868356063962, "learning_rate": 9.76683649837888e-07, "loss": 0.0001, "num_tokens": 69195398.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2518, "step_time": 16.23988012596965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 201.25, "completions/mean_terminated_length": 201.25, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.4453985244035721, "epoch": 0.11667438628994906, "frac_reward_zero_std": 0.0, "grad_norm": 0.12077568471431732, "kl": 0.00353471894050017, "learning_rate": 9.76674386289949e-07, "loss": 0.0761, "num_tokens": 69216186.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 2519, "step_time": 24.598925530910492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 144.3125, "completions/mean_terminated_length": 144.3125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.26991620659828186, "epoch": 0.11672070402964335, "frac_reward_zero_std": 1.0, "grad_norm": 0.01442884560674429, "kl": 0.0031042208429425955, "learning_rate": 9.766651227420102e-07, "loss": 0.0002, "num_tokens": 69236959.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2520, "step_time": 16.014219731092453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 132.0625, "completions/mean_terminated_length": 132.0625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2830207869410515, "epoch": 0.11676702176933766, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031016753055155277, "kl": 0.002233900304418057, "learning_rate": 9.766558591940713e-07, "loss": 0.0001, "num_tokens": 69260544.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2521, "step_time": 16.300537552684546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 178.8125, "completions/mean_terminated_length": 178.8125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.27052196860313416, "epoch": 0.11681333950903196, "frac_reward_zero_std": 0.0, "grad_norm": 0.13894326984882355, "kl": 0.004763779055792838, "learning_rate": 9.766465956461324e-07, "loss": 0.0117, "num_tokens": 69291805.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 2522, "step_time": 20.154803916811943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 159.9375, "completions/mean_terminated_length": 159.9375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.40736597776412964, "epoch": 0.11685965724872627, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026998703833669424, "kl": 0.0018361476249992847, "learning_rate": 9.766373320981936e-07, "loss": 0.0001, "num_tokens": 69330492.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2523, "step_time": 21.67042900249362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 176.3125, "completions/mean_terminated_length": 176.3125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.3571884483098984, "epoch": 0.11690597498842056, "frac_reward_zero_std": 0.0, "grad_norm": 0.11671131104230881, "kl": 0.010688660433515906, "learning_rate": 9.766280685502547e-07, "loss": 0.0162, "num_tokens": 69351553.0, "reward": 0.7045598030090332, "reward_std": 0.4201183021068573, "rewards/reward_func/mean": 0.7045598030090332, "rewards/reward_func/std": 0.4201183021068573, "step": 2524, "step_time": 19.222447354346514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 132.8125, "completions/mean_terminated_length": 132.8125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.34327084571123123, "epoch": 0.11695229272811487, "frac_reward_zero_std": 1.0, "grad_norm": 0.003010922111570835, "kl": 0.0019749950733967125, "learning_rate": 9.766188050023158e-07, "loss": 0.0001, "num_tokens": 69377950.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2525, "step_time": 16.51058854907751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 207.0, "completions/mean_terminated_length": 207.0, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.3339350149035454, "epoch": 0.11699861046780917, "frac_reward_zero_std": 0.0, "grad_norm": 0.12125305831432343, "kl": 0.001954075414687395, "learning_rate": 9.76609541454377e-07, "loss": 0.0955, "num_tokens": 69399086.0, "reward": 0.6449456214904785, "reward_std": 0.17198549211025238, "rewards/reward_func/mean": 0.6449456214904785, "rewards/reward_func/std": 0.17198549211025238, "step": 2526, "step_time": 23.677948355674744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 161.9375, "completions/mean_terminated_length": 161.9375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.26828376948833466, "epoch": 0.11704492820750348, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028099219780415297, "kl": 0.0018537837022449821, "learning_rate": 9.766002779064383e-07, "loss": 0.0001, "num_tokens": 69419805.0, "reward": 0.7632111310958862, "reward_std": 0.0, "rewards/reward_func/mean": 0.7632111310958862, "rewards/reward_func/std": 0.0, "step": 2527, "step_time": 16.765187423676252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 168.0625, "completions/mean_terminated_length": 168.0625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.4304979592561722, "epoch": 0.11709124594719778, "frac_reward_zero_std": 1.0, "grad_norm": 0.002772212726995349, "kl": 0.0025947302929125726, "learning_rate": 9.765910143584992e-07, "loss": 0.0001, "num_tokens": 69452286.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2528, "step_time": 20.656904868781567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 165.3125, "completions/mean_terminated_length": 165.3125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3324563130736351, "epoch": 0.11713756368689208, "frac_reward_zero_std": 0.0, "grad_norm": 0.13781264424324036, "kl": 0.005630438216030598, "learning_rate": 9.765817508105603e-07, "loss": 0.0223, "num_tokens": 69473795.0, "reward": 0.625, "reward_std": 0.5, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5, "step": 2529, "step_time": 20.35880806297064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 258.0, "completions/mean_terminated_length": 258.0, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.1971878558397293, "epoch": 0.11718388142658638, "frac_reward_zero_std": 1.0, "grad_norm": 0.0050765895284712315, "kl": 0.003672609105706215, "learning_rate": 9.765724872626214e-07, "loss": 0.0002, "num_tokens": 69498147.0, "reward": 0.8343905210494995, "reward_std": 0.0, "rewards/reward_func/mean": 0.8343905210494995, "rewards/reward_func/std": 0.0, "step": 2530, "step_time": 29.195719808340073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 264.5625, "completions/mean_terminated_length": 264.5625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.19106394052505493, "epoch": 0.11723019916628069, "frac_reward_zero_std": 1.0, "grad_norm": 0.002047973684966564, "kl": 0.0019100763311143965, "learning_rate": 9.765632237146828e-07, "loss": 0.0001, "num_tokens": 69524460.0, "reward": 0.795195996761322, "reward_std": 0.0, "rewards/reward_func/mean": 0.795195996761322, "rewards/reward_func/std": 0.0, "step": 2531, "step_time": 25.892326433211565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 154.625, "completions/mean_terminated_length": 154.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.23522640764713287, "epoch": 0.11727651690597499, "frac_reward_zero_std": 0.0, "grad_norm": 0.13442489504814148, "kl": 0.004400189907755703, "learning_rate": 9.76553960166744e-07, "loss": -0.0669, "num_tokens": 69546870.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.3415650427341461, "step": 2532, "step_time": 18.030234690755606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 132.75, "completions/mean_terminated_length": 132.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.30949801206588745, "epoch": 0.1173228346456693, "frac_reward_zero_std": 1.0, "grad_norm": 0.004218805581331253, "kl": 0.002690243854885921, "learning_rate": 9.76544696618805e-07, "loss": 0.0001, "num_tokens": 69582722.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2533, "step_time": 18.262815680354834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.4334608465433121, "epoch": 0.11736915238536359, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019446579972282052, "kl": 0.0019211559265386313, "learning_rate": 9.765354330708661e-07, "loss": 0.0001, "num_tokens": 69603804.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2534, "step_time": 17.4964744374156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 163.8125, "completions/mean_terminated_length": 163.8125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.3907167837023735, "epoch": 0.1174154701250579, "frac_reward_zero_std": 1.0, "grad_norm": 0.008374964818358421, "kl": 0.0043603170197457075, "learning_rate": 9.765261695229273e-07, "loss": 0.0002, "num_tokens": 69633849.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2535, "step_time": 18.502758789807558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 102.5, "completions/mean_terminated_length": 102.5, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.2020980902016163, "epoch": 0.1174617878647522, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015470476355403662, "kl": 0.001157489256002009, "learning_rate": 9.765169059749884e-07, "loss": 0.0001, "num_tokens": 69653137.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2536, "step_time": 12.262867517769337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 115.875, "completions/mean_terminated_length": 115.875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2808125540614128, "epoch": 0.11750810560444651, "frac_reward_zero_std": 1.0, "grad_norm": 0.0043172030709683895, "kl": 0.0021788671147078276, "learning_rate": 9.765076424270495e-07, "loss": 0.0001, "num_tokens": 69672479.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2537, "step_time": 13.167741309851408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 143.3125, "completions/mean_terminated_length": 143.3125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2784837633371353, "epoch": 0.1175544233441408, "frac_reward_zero_std": 1.0, "grad_norm": 0.005533813498914242, "kl": 0.003838031552731991, "learning_rate": 9.764983788791106e-07, "loss": 0.0002, "num_tokens": 69693860.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2538, "step_time": 14.989644382148981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 128.8125, "completions/mean_terminated_length": 128.8125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.342710942029953, "epoch": 0.11760074108383511, "frac_reward_zero_std": 1.0, "grad_norm": 0.002589267445728183, "kl": 0.0018046482873614877, "learning_rate": 9.764891153311718e-07, "loss": 0.0001, "num_tokens": 69717617.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2539, "step_time": 15.151292867958546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 314.25, "completions/mean_terminated_length": 314.25, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.2687339186668396, "epoch": 0.11764705882352941, "frac_reward_zero_std": 0.0, "grad_norm": 0.07287666201591492, "kl": 0.002109242486767471, "learning_rate": 9.76479851783233e-07, "loss": -0.0628, "num_tokens": 69749941.0, "reward": 0.1672072559595108, "reward_std": 0.08809083700180054, "rewards/reward_func/mean": 0.1672072559595108, "rewards/reward_func/std": 0.08809084445238113, "step": 2540, "step_time": 31.04066489636898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 132.9375, "completions/mean_terminated_length": 132.9375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.27311180531978607, "epoch": 0.11769337656322372, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028962416108697653, "kl": 0.0020032948814332485, "learning_rate": 9.76470588235294e-07, "loss": 0.0001, "num_tokens": 69773412.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2541, "step_time": 14.571888111531734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 166.4375, "completions/mean_terminated_length": 166.4375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3374975621700287, "epoch": 0.11773969430291802, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031090453267097473, "kl": 0.001794400392100215, "learning_rate": 9.764613246873551e-07, "loss": 0.0001, "num_tokens": 69793563.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2542, "step_time": 17.30877362936735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 152.125, "completions/mean_terminated_length": 152.125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3300483003258705, "epoch": 0.11778601204261233, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032420754432678223, "kl": 0.0022386827040463686, "learning_rate": 9.764520611394163e-07, "loss": 0.0001, "num_tokens": 69814685.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2543, "step_time": 15.987787026911974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 206.0625, "completions/mean_terminated_length": 206.0625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.1973423846065998, "epoch": 0.11783232978230662, "frac_reward_zero_std": 1.0, "grad_norm": 0.005508746951818466, "kl": 0.00416121503803879, "learning_rate": 9.764427975914776e-07, "loss": 0.0002, "num_tokens": 69848958.0, "reward": 0.6807124018669128, "reward_std": 0.0, "rewards/reward_func/mean": 0.6807124018669128, "rewards/reward_func/std": 0.0, "step": 2544, "step_time": 23.922963842749596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 145.5, "completions/mean_terminated_length": 145.5, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.29599516093730927, "epoch": 0.11787864752200093, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034114622976630926, "kl": 0.0027078649727627635, "learning_rate": 9.764335340435387e-07, "loss": 0.0001, "num_tokens": 69877846.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2545, "step_time": 17.332238290458918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 206.0, "completions/mean_terminated_length": 206.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.1840292140841484, "epoch": 0.11792496526169523, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019980361685156822, "kl": 0.0013126096746418625, "learning_rate": 9.764242704955996e-07, "loss": 0.0001, "num_tokens": 69928726.0, "reward": 0.9091564416885376, "reward_std": 0.0, "rewards/reward_func/mean": 0.9091564416885376, "rewards/reward_func/std": 0.0, "step": 2546, "step_time": 27.377622980624437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 177.3125, "completions/mean_terminated_length": 177.3125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.37352460622787476, "epoch": 0.11797128300138954, "frac_reward_zero_std": 1.0, "grad_norm": 0.01190211158245802, "kl": 0.006103089544922113, "learning_rate": 9.76415006947661e-07, "loss": 0.0003, "num_tokens": 69952907.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2547, "step_time": 19.16219438239932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 173.0625, "completions/mean_terminated_length": 173.0625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.29012996703386307, "epoch": 0.11801760074108383, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029143390711396933, "kl": 0.0019404477789066732, "learning_rate": 9.764057433997221e-07, "loss": 0.0001, "num_tokens": 69982636.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2548, "step_time": 19.47384374216199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 173.875, "completions/mean_terminated_length": 173.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3809279501438141, "epoch": 0.11806391848077814, "frac_reward_zero_std": 1.0, "grad_norm": 0.0041020638309419155, "kl": 0.0033375016937498003, "learning_rate": 9.763964798517832e-07, "loss": 0.0002, "num_tokens": 70019802.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2549, "step_time": 22.1692301556468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 237.75, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.3624110445380211, "epoch": 0.11811023622047244, "frac_reward_zero_std": 0.0, "grad_norm": 0.11295898258686066, "kl": 0.005656790337525308, "learning_rate": 9.763872163038444e-07, "loss": -0.1305, "num_tokens": 70046406.0, "reward": 0.3129962086677551, "reward_std": 0.46819740533828735, "rewards/reward_func/mean": 0.3129962086677551, "rewards/reward_func/std": 0.46819743514060974, "step": 2550, "step_time": 28.486052256077528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 208.25, "completions/mean_terminated_length": 208.25, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.37478768825531006, "epoch": 0.11815655396016675, "frac_reward_zero_std": 0.0, "grad_norm": 0.1396738439798355, "kl": 0.004766521626152098, "learning_rate": 9.763779527559055e-07, "loss": 0.0838, "num_tokens": 70071898.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 2551, "step_time": 25.988287832587957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 252.3125, "completions/mean_terminated_length": 252.3125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.3208429887890816, "epoch": 0.11820287169986105, "frac_reward_zero_std": 0.0, "grad_norm": 0.08668361604213715, "kl": 0.004409196495544165, "learning_rate": 9.763686892079666e-07, "loss": -0.1385, "num_tokens": 70111055.0, "reward": 0.34896644949913025, "reward_std": 0.3645111322402954, "rewards/reward_func/mean": 0.34896644949913025, "rewards/reward_func/std": 0.3645111620426178, "step": 2552, "step_time": 35.64171186834574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.4924083277583122, "epoch": 0.11824918943955535, "frac_reward_zero_std": 0.0, "grad_norm": 0.09804312139749527, "kl": 0.0031258187373168766, "learning_rate": 9.763594256600277e-07, "loss": 0.0219, "num_tokens": 70136737.0, "reward": 0.5625, "reward_std": 0.5123475193977356, "rewards/reward_func/mean": 0.5625, "rewards/reward_func/std": 0.5123475790023804, "step": 2553, "step_time": 29.227614890784025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.29411546885967255, "epoch": 0.11829550717924965, "frac_reward_zero_std": 1.0, "grad_norm": 0.001870762906037271, "kl": 0.0017390170833095908, "learning_rate": 9.763501621120889e-07, "loss": 0.0001, "num_tokens": 70158845.0, "reward": 0.5308194756507874, "reward_std": 0.0, "rewards/reward_func/mean": 0.5308194756507874, "rewards/reward_func/std": 0.0, "step": 2554, "step_time": 23.829374082386494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 109.3125, "completions/mean_terminated_length": 109.3125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3609594851732254, "epoch": 0.11834182491894396, "frac_reward_zero_std": 1.0, "grad_norm": 0.00425384845584631, "kl": 0.002779354923404753, "learning_rate": 9.7634089856415e-07, "loss": 0.0001, "num_tokens": 70184034.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2555, "step_time": 14.10405632853508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 159.9375, "completions/mean_terminated_length": 159.9375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3891973942518234, "epoch": 0.11838814265863826, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011609444627538323, "kl": 0.0014020515664014965, "learning_rate": 9.763316350162111e-07, "loss": 0.0001, "num_tokens": 70240625.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2556, "step_time": 25.749696049839258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 143.0625, "completions/mean_terminated_length": 143.0625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.35119353979825974, "epoch": 0.11843446039833257, "frac_reward_zero_std": 1.0, "grad_norm": 0.004023308400064707, "kl": 0.0023010563745629042, "learning_rate": 9.763223714682724e-07, "loss": 0.0001, "num_tokens": 70276882.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2557, "step_time": 19.22999330982566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 157.5625, "completions/mean_terminated_length": 157.5625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.3454890549182892, "epoch": 0.11848077813802686, "frac_reward_zero_std": 1.0, "grad_norm": 0.004776523914188147, "kl": 0.003007733845151961, "learning_rate": 9.763131079203334e-07, "loss": 0.0001, "num_tokens": 70299003.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2558, "step_time": 17.241932556033134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 170.0625, "completions/mean_terminated_length": 170.0625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.1966676041483879, "epoch": 0.11852709587772117, "frac_reward_zero_std": 1.0, "grad_norm": 0.001377631677314639, "kl": 0.0009226291585946456, "learning_rate": 9.763038443723945e-07, "loss": 0.0, "num_tokens": 70332252.0, "reward": 0.8751733303070068, "reward_std": 0.0, "rewards/reward_func/mean": 0.8751733303070068, "rewards/reward_func/std": 0.0, "step": 2559, "step_time": 20.28189316019416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 352.0625, "completions/mean_terminated_length": 352.0625, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "entropy": 0.1965409442782402, "epoch": 0.11857341361741547, "frac_reward_zero_std": 1.0, "grad_norm": 0.001986697083339095, "kl": 0.0017113542126026005, "learning_rate": 9.762945808244556e-07, "loss": 0.0001, "num_tokens": 70374653.0, "reward": 0.7498524785041809, "reward_std": 0.0, "rewards/reward_func/mean": 0.7498524785041809, "rewards/reward_func/std": 0.0, "step": 2560, "step_time": 37.20276174321771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 218.4375, "completions/mean_terminated_length": 218.4375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.41453102231025696, "epoch": 0.11861973135710978, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035396451130509377, "kl": 0.002845336392056197, "learning_rate": 9.76285317276517e-07, "loss": 0.0001, "num_tokens": 70399076.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2561, "step_time": 26.513055469840765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 174.125, "completions/mean_terminated_length": 174.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.36704379320144653, "epoch": 0.11866604909680407, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027061717119067907, "kl": 0.002521127520594746, "learning_rate": 9.76276053728578e-07, "loss": 0.0001, "num_tokens": 70423558.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2562, "step_time": 19.267679549753666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 130.8125, "completions/mean_terminated_length": 130.8125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.32686904817819595, "epoch": 0.11871236683649838, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027874025981873274, "kl": 0.002084024017676711, "learning_rate": 9.762667901806392e-07, "loss": 0.0001, "num_tokens": 70444867.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2563, "step_time": 14.60996313393116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.26866399496793747, "epoch": 0.11875868457619268, "frac_reward_zero_std": 1.0, "grad_norm": 0.003486102446913719, "kl": 0.002615977020468563, "learning_rate": 9.762575266327003e-07, "loss": 0.0001, "num_tokens": 70468373.0, "reward": 0.780767560005188, "reward_std": 0.0, "rewards/reward_func/mean": 0.780767560005188, "rewards/reward_func/std": 0.0, "step": 2564, "step_time": 17.069831989705563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 132.875, "completions/mean_terminated_length": 132.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.17322808876633644, "epoch": 0.11880500231588699, "frac_reward_zero_std": 0.0, "grad_norm": 0.13548368215560913, "kl": 0.003650828613899648, "learning_rate": 9.762482630847614e-07, "loss": -0.0752, "num_tokens": 70491299.0, "reward": 0.3941449522972107, "reward_std": 0.10506203025579453, "rewards/reward_func/mean": 0.3941449522972107, "rewards/reward_func/std": 0.10506203770637512, "step": 2565, "step_time": 16.4831387065351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 105.0, "completions/mean_terminated_length": 105.0, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2161022536456585, "epoch": 0.11885132005558129, "frac_reward_zero_std": 1.0, "grad_norm": 0.002563272602856159, "kl": 0.001570798660395667, "learning_rate": 9.762389995368226e-07, "loss": 0.0001, "num_tokens": 70511267.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2566, "step_time": 12.12009947001934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 186.25, "completions/mean_terminated_length": 186.25, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.3114311099052429, "epoch": 0.1188976377952756, "frac_reward_zero_std": 0.0, "grad_norm": 0.12095338851213455, "kl": 0.00406394392484799, "learning_rate": 9.762297359888837e-07, "loss": 0.0562, "num_tokens": 70534231.0, "reward": 0.7375170588493347, "reward_std": 0.19667121767997742, "rewards/reward_func/mean": 0.7375170588493347, "rewards/reward_func/std": 0.19667121767997742, "step": 2567, "step_time": 20.278707768768072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 132.75, "completions/mean_terminated_length": 132.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2700130119919777, "epoch": 0.11894395553496989, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033549421932548285, "kl": 0.002145721489796415, "learning_rate": 9.762204724409448e-07, "loss": 0.0001, "num_tokens": 70554931.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2568, "step_time": 15.19302299246192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 203.8125, "completions/mean_terminated_length": 203.8125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.40337076783180237, "epoch": 0.1189902732746642, "frac_reward_zero_std": 0.0, "grad_norm": 0.08717337250709534, "kl": 0.005992951802909374, "learning_rate": 9.76211208893006e-07, "loss": -0.0625, "num_tokens": 70579136.0, "reward": 0.05871332064270973, "reward_std": 0.23485326766967773, "rewards/reward_func/mean": 0.05871332064270973, "rewards/reward_func/std": 0.23485326766967773, "step": 2569, "step_time": 25.097491294145584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 160.625, "completions/mean_terminated_length": 160.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.34616900235414505, "epoch": 0.1190365910143585, "frac_reward_zero_std": 1.0, "grad_norm": 0.0039023745339363813, "kl": 0.0027815004577860236, "learning_rate": 9.76201945345067e-07, "loss": 0.0001, "num_tokens": 70600954.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2570, "step_time": 16.051095638424158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 157.4375, "completions/mean_terminated_length": 157.4375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.23404672741889954, "epoch": 0.11908290875405281, "frac_reward_zero_std": 1.0, "grad_norm": 0.005644877906888723, "kl": 0.0047567912843078375, "learning_rate": 9.761926817971282e-07, "loss": 0.0002, "num_tokens": 70630833.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2571, "step_time": 18.684407092630863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 145.8125, "completions/mean_terminated_length": 145.8125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.324734590947628, "epoch": 0.1191292264937471, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038916415069252253, "kl": 0.0022496864548884332, "learning_rate": 9.761834182491893e-07, "loss": 0.0001, "num_tokens": 70651374.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2572, "step_time": 15.262799922376871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 199.75, "completions/mean_terminated_length": 199.75, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3342023342847824, "epoch": 0.11917554423344141, "frac_reward_zero_std": 0.0, "grad_norm": 0.1105809360742569, "kl": 0.00803664920385927, "learning_rate": 9.761741547012504e-07, "loss": -0.1154, "num_tokens": 70680602.0, "reward": 0.11411845684051514, "reward_std": 0.311830997467041, "rewards/reward_func/mean": 0.11411845684051514, "rewards/reward_func/std": 0.311830997467041, "step": 2573, "step_time": 24.5421023927629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 187.375, "completions/mean_terminated_length": 187.375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3873288184404373, "epoch": 0.11922186197313571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023885746486485004, "kl": 0.0021763765544164926, "learning_rate": 9.761648911533118e-07, "loss": 0.0001, "num_tokens": 70718064.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2574, "step_time": 23.26786745339632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 161.1875, "completions/mean_terminated_length": 161.1875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.2526915520429611, "epoch": 0.11926817971283002, "frac_reward_zero_std": 1.0, "grad_norm": 0.004863819573074579, "kl": 0.003397050779312849, "learning_rate": 9.76155627605373e-07, "loss": 0.0002, "num_tokens": 70754659.0, "reward": 0.45559218525886536, "reward_std": 0.0, "rewards/reward_func/mean": 0.45559218525886536, "rewards/reward_func/std": 0.0, "step": 2575, "step_time": 20.47814941033721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.2699509561061859, "epoch": 0.11931449745252432, "frac_reward_zero_std": 1.0, "grad_norm": 0.004671419970691204, "kl": 0.002445378544507548, "learning_rate": 9.76146364057434e-07, "loss": 0.0001, "num_tokens": 70774231.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2576, "step_time": 14.873398952186108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 189.25, "completions/mean_terminated_length": 189.25, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.36087697744369507, "epoch": 0.11936081519221863, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034232509788125753, "kl": 0.0024271892034448683, "learning_rate": 9.76137100509495e-07, "loss": 0.0001, "num_tokens": 70799515.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2577, "step_time": 19.80249995365739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.3625039979815483, "epoch": 0.11940713293191292, "frac_reward_zero_std": 0.0, "grad_norm": 0.09588645398616791, "kl": 0.0036418578820303082, "learning_rate": 9.761278369615563e-07, "loss": 0.0728, "num_tokens": 70823959.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 2578, "step_time": 21.55460439249873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 196.3125, "completions/mean_terminated_length": 196.3125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.29934902489185333, "epoch": 0.11945345067160723, "frac_reward_zero_std": 0.0, "grad_norm": 0.09274392575025558, "kl": 0.005649623810313642, "learning_rate": 9.761185734136174e-07, "loss": -0.0318, "num_tokens": 70847436.0, "reward": 0.5290153622627258, "reward_std": 0.42527130246162415, "rewards/reward_func/mean": 0.5290153622627258, "rewards/reward_func/std": 0.42527130246162415, "step": 2579, "step_time": 19.9176363684237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.29691868275403976, "epoch": 0.11949976841130153, "frac_reward_zero_std": 1.0, "grad_norm": 0.006261197850108147, "kl": 0.003267173538915813, "learning_rate": 9.761093098656785e-07, "loss": 0.0002, "num_tokens": 70868916.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2580, "step_time": 13.36207140609622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 141.4375, "completions/mean_terminated_length": 141.4375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3163677826523781, "epoch": 0.11954608615099584, "frac_reward_zero_std": 1.0, "grad_norm": 0.002074207179248333, "kl": 0.0016918099427130073, "learning_rate": 9.761000463177397e-07, "loss": 0.0001, "num_tokens": 70895595.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2581, "step_time": 16.02123297378421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 197.75, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.4043612703680992, "epoch": 0.11959240389069013, "frac_reward_zero_std": 0.0, "grad_norm": 0.08586780726909637, "kl": 0.005323318357113749, "learning_rate": 9.760907827698008e-07, "loss": -0.0612, "num_tokens": 70925303.0, "reward": 0.11031211167573929, "reward_std": 0.3014300763607025, "rewards/reward_func/mean": 0.11031211167573929, "rewards/reward_func/std": 0.3014300763607025, "step": 2582, "step_time": 23.118612952530384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 155.3125, "completions/mean_terminated_length": 155.3125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.19518357142806053, "epoch": 0.11963872163038444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030153447296470404, "kl": 0.0020839257049374282, "learning_rate": 9.76081519221862e-07, "loss": 0.0001, "num_tokens": 70947020.0, "reward": 0.951229453086853, "reward_std": 0.0, "rewards/reward_func/mean": 0.951229453086853, "rewards/reward_func/std": 0.0, "step": 2583, "step_time": 16.24068732187152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 188.625, "completions/mean_terminated_length": 188.625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.37758927047252655, "epoch": 0.11968503937007874, "frac_reward_zero_std": 0.0, "grad_norm": 0.12246187776327133, "kl": 0.002398409938905388, "learning_rate": 9.76072255673923e-07, "loss": -0.0719, "num_tokens": 70971526.0, "reward": 0.05173708125948906, "reward_std": 0.20694832503795624, "rewards/reward_func/mean": 0.05173708125948906, "rewards/reward_func/std": 0.20694833993911743, "step": 2584, "step_time": 21.39801001176238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 139.125, "completions/mean_terminated_length": 139.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.39037761837244034, "epoch": 0.11973135710977305, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014450823655351996, "kl": 0.0017771337588783354, "learning_rate": 9.760629921259842e-07, "loss": 0.0001, "num_tokens": 71013768.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2585, "step_time": 20.699903801083565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 176.375, "completions/mean_terminated_length": 176.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3848694711923599, "epoch": 0.11977767484946734, "frac_reward_zero_std": 1.0, "grad_norm": 0.005286119412630796, "kl": 0.0035452512674964964, "learning_rate": 9.760537285780453e-07, "loss": 0.0002, "num_tokens": 71035806.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2586, "step_time": 19.202445048838854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 118.8125, "completions/mean_terminated_length": 118.8125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3058849424123764, "epoch": 0.11982399258916165, "frac_reward_zero_std": 1.0, "grad_norm": 0.005261395126581192, "kl": 0.002903709071688354, "learning_rate": 9.760444650301066e-07, "loss": 0.0001, "num_tokens": 71055963.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2587, "step_time": 13.46128412336111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.1507360003888607, "epoch": 0.11987031032885595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009575008880347013, "kl": 0.0008947111200541258, "learning_rate": 9.760352014821677e-07, "loss": 0.0, "num_tokens": 71077789.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2588, "step_time": 16.469407685101032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 134.625, "completions/mean_terminated_length": 134.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.25572817400097847, "epoch": 0.11991662806855026, "frac_reward_zero_std": 1.0, "grad_norm": 0.002382407197728753, "kl": 0.0014073697093408555, "learning_rate": 9.760259379342287e-07, "loss": 0.0001, "num_tokens": 71099015.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2589, "step_time": 15.378913719207048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 168.375, "completions/mean_terminated_length": 168.375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.24062542989850044, "epoch": 0.11996294580824456, "frac_reward_zero_std": 1.0, "grad_norm": 0.005165655631572008, "kl": 0.005093358689919114, "learning_rate": 9.760166743862898e-07, "loss": 0.0003, "num_tokens": 71123901.0, "reward": 0.26359713077545166, "reward_std": 0.0, "rewards/reward_func/mean": 0.26359713077545166, "rewards/reward_func/std": 0.0, "step": 2590, "step_time": 18.162719149142504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 160.6875, "completions/mean_terminated_length": 160.6875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.24116867035627365, "epoch": 0.12000926354793887, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029185765888541937, "kl": 0.001830112305469811, "learning_rate": 9.760074108383511e-07, "loss": 0.0001, "num_tokens": 71148328.0, "reward": 0.904837429523468, "reward_std": 0.0, "rewards/reward_func/mean": 0.904837429523468, "rewards/reward_func/std": 0.0, "step": 2591, "step_time": 18.324216801673174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 182.9375, "completions/mean_terminated_length": 182.9375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.45394784212112427, "epoch": 0.12005558128763316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038516896311193705, "kl": 0.0026644898462109268, "learning_rate": 9.759981472904122e-07, "loss": 0.0001, "num_tokens": 71176535.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2592, "step_time": 20.339137833565474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 121.6875, "completions/mean_terminated_length": 121.6875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.22174376249313354, "epoch": 0.12010189902732747, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024807672016322613, "kl": 0.0015302609535865486, "learning_rate": 9.759888837424734e-07, "loss": 0.0001, "num_tokens": 71195826.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2593, "step_time": 13.547740194946527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 166.4375, "completions/mean_terminated_length": 166.4375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.1806536540389061, "epoch": 0.12014821676702177, "frac_reward_zero_std": 0.0, "grad_norm": 0.09671524912118912, "kl": 0.0009749295422807336, "learning_rate": 9.759796201945345e-07, "loss": -0.0581, "num_tokens": 71227993.0, "reward": 0.9300388693809509, "reward_std": 0.027310028672218323, "rewards/reward_func/mean": 0.9300388693809509, "rewards/reward_func/std": 0.027310030534863472, "step": 2594, "step_time": 20.047545570880175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 142.4375, "completions/mean_terminated_length": 142.4375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.29837728291749954, "epoch": 0.12019453450671608, "frac_reward_zero_std": 1.0, "grad_norm": 0.00585030484944582, "kl": 0.0030422385316342115, "learning_rate": 9.759703566465956e-07, "loss": 0.0002, "num_tokens": 71248192.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2595, "step_time": 15.379008781164885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 324.75, "completions/mean_terminated_length": 324.75, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.2556700184941292, "epoch": 0.12024085224641037, "frac_reward_zero_std": 0.0, "grad_norm": 0.07441122084856033, "kl": 0.0030584142077714205, "learning_rate": 9.759610930986567e-07, "loss": -0.2181, "num_tokens": 71275212.0, "reward": 0.30286088585853577, "reward_std": 0.3167165517807007, "rewards/reward_func/mean": 0.30286088585853577, "rewards/reward_func/std": 0.3167165517807007, "step": 2596, "step_time": 35.48474219441414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 167.6875, "completions/mean_terminated_length": 167.6875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.21612903103232384, "epoch": 0.12028716998610468, "frac_reward_zero_std": 1.0, "grad_norm": 0.008417708799242973, "kl": 0.004941831110045314, "learning_rate": 9.759518295507179e-07, "loss": 0.0002, "num_tokens": 71306199.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2597, "step_time": 19.600908558815718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 158.0, "completions/mean_terminated_length": 158.0, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.277831107378006, "epoch": 0.12033348772579898, "frac_reward_zero_std": 1.0, "grad_norm": 0.005522989667952061, "kl": 0.004178764531388879, "learning_rate": 9.75942566002779e-07, "loss": 0.0002, "num_tokens": 71331847.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 2598, "step_time": 18.852451380342245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.39470214396715164, "epoch": 0.12037980546549329, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023999400436878204, "kl": 0.0022791545488871634, "learning_rate": 9.759333024548401e-07, "loss": 0.0001, "num_tokens": 71363839.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2599, "step_time": 18.758001688867807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 238.5, "completions/mean_terminated_length": 238.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.1609300747513771, "epoch": 0.12042612320518759, "frac_reward_zero_std": 0.0, "grad_norm": 0.0843634232878685, "kl": 0.0023851240111980587, "learning_rate": 9.759240389069012e-07, "loss": -0.1551, "num_tokens": 71391031.0, "reward": 0.7583180069923401, "reward_std": 0.3343544900417328, "rewards/reward_func/mean": 0.7583180069923401, "rewards/reward_func/std": 0.3343545198440552, "step": 2600, "step_time": 29.403232384473085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 228.125, "completions/mean_terminated_length": 228.125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.44475042819976807, "epoch": 0.1204724409448819, "frac_reward_zero_std": 0.0, "grad_norm": 0.11624807119369507, "kl": 0.004848725977353752, "learning_rate": 9.759147753589624e-07, "loss": -0.0869, "num_tokens": 71415833.0, "reward": 0.0005881556426174939, "reward_std": 0.0023526225704699755, "rewards/reward_func/mean": 0.0005881556426174939, "rewards/reward_func/std": 0.002352622803300619, "step": 2601, "step_time": 26.487067949026823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 173.875, "completions/mean_terminated_length": 173.875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.37038053572177887, "epoch": 0.12051875868457619, "frac_reward_zero_std": 0.0, "grad_norm": 0.11069431155920029, "kl": 0.004275827028322965, "learning_rate": 9.759055118110235e-07, "loss": -0.0373, "num_tokens": 71452647.0, "reward": 0.0625, "reward_std": 0.25, "rewards/reward_func/mean": 0.0625, "rewards/reward_func/std": 0.25, "step": 2602, "step_time": 22.033695995807648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 128.375, "completions/mean_terminated_length": 128.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2706213742494583, "epoch": 0.1205650764242705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022401295136660337, "kl": 0.001878547394881025, "learning_rate": 9.758962482630846e-07, "loss": 0.0001, "num_tokens": 71472253.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2603, "step_time": 14.252593986690044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 178.9375, "completions/mean_terminated_length": 178.9375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.22292347624897957, "epoch": 0.1206113941639648, "frac_reward_zero_std": 0.0, "grad_norm": 0.1287321150302887, "kl": 0.009223586996085942, "learning_rate": 9.75886984715146e-07, "loss": 0.0937, "num_tokens": 71496172.0, "reward": 0.8979924917221069, "reward_std": 0.28671565651893616, "rewards/reward_func/mean": 0.8979924917221069, "rewards/reward_func/std": 0.28671562671661377, "step": 2604, "step_time": 23.865822471678257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 164.875, "completions/mean_terminated_length": 164.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.35358650237321854, "epoch": 0.1206577119036591, "frac_reward_zero_std": 1.0, "grad_norm": 0.005480792839080095, "kl": 0.004116107884328812, "learning_rate": 9.75877721167207e-07, "loss": 0.0002, "num_tokens": 71517802.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2605, "step_time": 17.80082791671157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 212.375, "completions/mean_terminated_length": 212.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.35293833166360855, "epoch": 0.1207040296433534, "frac_reward_zero_std": 0.0, "grad_norm": 0.12171871960163116, "kl": 0.006391683709807694, "learning_rate": 9.758684576192682e-07, "loss": -0.0574, "num_tokens": 71540912.0, "reward": 0.6539702415466309, "reward_std": 0.45536643266677856, "rewards/reward_func/mean": 0.6539702415466309, "rewards/reward_func/std": 0.45536643266677856, "step": 2606, "step_time": 22.94267251715064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 219.0625, "completions/mean_terminated_length": 219.0625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.34524622559547424, "epoch": 0.12075034738304771, "frac_reward_zero_std": 0.0, "grad_norm": 0.07989061623811722, "kl": 0.008095955941826105, "learning_rate": 9.758591940713293e-07, "loss": -0.1107, "num_tokens": 71562625.0, "reward": 0.5537140369415283, "reward_std": 0.5048525929450989, "rewards/reward_func/mean": 0.5537140369415283, "rewards/reward_func/std": 0.5048525929450989, "step": 2607, "step_time": 30.270117606967688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 118.125, "completions/mean_terminated_length": 118.125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.29393845051527023, "epoch": 0.12079666512274201, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019197019282728434, "kl": 0.0014777126198168844, "learning_rate": 9.758499305233905e-07, "loss": 0.0001, "num_tokens": 71584819.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2608, "step_time": 13.530429046601057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 193.6875, "completions/mean_terminated_length": 193.6875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.15781072154641151, "epoch": 0.12084298286243632, "frac_reward_zero_std": 1.0, "grad_norm": 0.004372835159301758, "kl": 0.002986446488648653, "learning_rate": 9.758406669754516e-07, "loss": 0.0001, "num_tokens": 71608574.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2609, "step_time": 20.004991702735424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 184.8125, "completions/mean_terminated_length": 184.8125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.38397885859012604, "epoch": 0.12088930060213061, "frac_reward_zero_std": 0.0, "grad_norm": 0.139508455991745, "kl": 0.00393298888229765, "learning_rate": 9.758314034275127e-07, "loss": -0.0203, "num_tokens": 71638475.0, "reward": 0.05819142237305641, "reward_std": 0.23276568949222565, "rewards/reward_func/mean": 0.05819142237305641, "rewards/reward_func/std": 0.23276568949222565, "step": 2610, "step_time": 22.404640428721905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 183.125, "completions/mean_terminated_length": 183.125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.45040803402662277, "epoch": 0.12093561834182492, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020225944463163614, "kl": 0.0021493701497092843, "learning_rate": 9.758221398795738e-07, "loss": 0.0001, "num_tokens": 71676509.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2611, "step_time": 23.311338245868683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 303.5, "completions/mean_terminated_length": 303.5, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 0.23867250233888626, "epoch": 0.12098193608151922, "frac_reward_zero_std": 0.0, "grad_norm": 0.05662308260798454, "kl": 0.003504421270918101, "learning_rate": 9.75812876331635e-07, "loss": 0.0014, "num_tokens": 71716773.0, "reward": 0.954357922077179, "reward_std": 0.1825682520866394, "rewards/reward_func/mean": 0.954357922077179, "rewards/reward_func/std": 0.1825682371854782, "step": 2612, "step_time": 36.65858679264784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 152.25, "completions/mean_terminated_length": 152.25, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.253683403134346, "epoch": 0.12102825382121353, "frac_reward_zero_std": 0.0, "grad_norm": 0.15901704132556915, "kl": 0.007373962085694075, "learning_rate": 9.75803612783696e-07, "loss": -0.0235, "num_tokens": 71737977.0, "reward": 0.7204751968383789, "reward_std": 0.3126756548881531, "rewards/reward_func/mean": 0.7204751968383789, "rewards/reward_func/std": 0.3126756548881531, "step": 2613, "step_time": 17.51973421871662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 199.5, "completions/mean_terminated_length": 199.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.14996739476919174, "epoch": 0.12107457156090783, "frac_reward_zero_std": 0.0, "grad_norm": 0.07721827179193497, "kl": 0.0010218483803328127, "learning_rate": 9.757943492357572e-07, "loss": -0.0523, "num_tokens": 71761713.0, "reward": 0.9444707632064819, "reward_std": 0.033111196011304855, "rewards/reward_func/mean": 0.9444707632064819, "rewards/reward_func/std": 0.03311121463775635, "step": 2614, "step_time": 20.895534090697765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 223.5, "completions/mean_terminated_length": 223.5, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.18209245428442955, "epoch": 0.12112088930060214, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027935488615185022, "kl": 0.0019301688007544726, "learning_rate": 9.757850856878183e-07, "loss": 0.0001, "num_tokens": 71787145.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2615, "step_time": 22.16355014592409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 145.5, "completions/mean_terminated_length": 145.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.1945566162467003, "epoch": 0.12116720704029643, "frac_reward_zero_std": 1.0, "grad_norm": 0.00685846246778965, "kl": 0.0025526779936626554, "learning_rate": 9.757758221398794e-07, "loss": 0.0001, "num_tokens": 71807905.0, "reward": 0.8574039340019226, "reward_std": 0.0, "rewards/reward_func/mean": 0.8574039340019226, "rewards/reward_func/std": 0.0, "step": 2616, "step_time": 15.647626988589764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 143.3125, "completions/mean_terminated_length": 143.3125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3358049765229225, "epoch": 0.12121352477999074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012972364202141762, "kl": 0.0014078713720664382, "learning_rate": 9.757665585919408e-07, "loss": 0.0001, "num_tokens": 71833318.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2617, "step_time": 16.322649911046028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 151.125, "completions/mean_terminated_length": 151.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.3704020455479622, "epoch": 0.12125984251968504, "frac_reward_zero_std": 1.0, "grad_norm": 0.004383946768939495, "kl": 0.002583772409707308, "learning_rate": 9.75757295044002e-07, "loss": 0.0001, "num_tokens": 71856072.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2618, "step_time": 16.43452950939536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 203.0, "completions/mean_terminated_length": 203.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.3492243140935898, "epoch": 0.12130616025937935, "frac_reward_zero_std": 0.0, "grad_norm": 0.1025136262178421, "kl": 0.004411309317220002, "learning_rate": 9.75748031496063e-07, "loss": -0.0108, "num_tokens": 71878600.0, "reward": 0.7462133169174194, "reward_std": 0.4452076554298401, "rewards/reward_func/mean": 0.7462133169174194, "rewards/reward_func/std": 0.4452076852321625, "step": 2619, "step_time": 23.544443495571613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 190.75, "completions/mean_terminated_length": 190.75, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.25944357365369797, "epoch": 0.12135247799907364, "frac_reward_zero_std": 0.0, "grad_norm": 0.1123427227139473, "kl": 0.005246045300737023, "learning_rate": 9.75738767948124e-07, "loss": -0.0224, "num_tokens": 71911508.0, "reward": 0.399912029504776, "reward_std": 0.3898077607154846, "rewards/reward_func/mean": 0.399912029504776, "rewards/reward_func/std": 0.3898077607154846, "step": 2620, "step_time": 22.157683491706848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 146.75, "completions/mean_terminated_length": 146.75, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3209523856639862, "epoch": 0.12139879573876795, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011952482163906097, "kl": 0.001210305985296145, "learning_rate": 9.757295044001853e-07, "loss": 0.0001, "num_tokens": 71937264.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2621, "step_time": 15.85201332718134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 172.8125, "completions/mean_terminated_length": 172.8125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.3508635461330414, "epoch": 0.12144511347846225, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022795661352574825, "kl": 0.001664304523728788, "learning_rate": 9.757202408522464e-07, "loss": 0.0001, "num_tokens": 71971261.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2622, "step_time": 22.573575280606747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 122.5625, "completions/mean_terminated_length": 122.5625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.26133843511343, "epoch": 0.12149143121815656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022759540006518364, "kl": 0.0015060979349073023, "learning_rate": 9.757109773043075e-07, "loss": 0.0001, "num_tokens": 71990838.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2623, "step_time": 14.58366310223937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 165.6875, "completions/mean_terminated_length": 165.6875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.24231448769569397, "epoch": 0.12153774895785086, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015472517115995288, "kl": 0.0014273403503466398, "learning_rate": 9.757017137563687e-07, "loss": 0.0001, "num_tokens": 72016017.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2624, "step_time": 17.579478468745947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 227.5, "completions/mean_terminated_length": 227.5, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.39033132046461105, "epoch": 0.12158406669754517, "frac_reward_zero_std": 0.0, "grad_norm": 0.07186461240053177, "kl": 0.008960372069850564, "learning_rate": 9.756924502084298e-07, "loss": -0.058, "num_tokens": 72049801.0, "reward": 0.019167421385645866, "reward_std": 0.02770277112722397, "rewards/reward_func/mean": 0.019167421385645866, "rewards/reward_func/std": 0.02770277112722397, "step": 2625, "step_time": 28.25566239282489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "entropy": 0.25303342938423157, "epoch": 0.12163038443723946, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027672620490193367, "kl": 0.0017814805614762008, "learning_rate": 9.75683186660491e-07, "loss": 0.0001, "num_tokens": 72088959.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2626, "step_time": 30.33176765963435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 168.6875, "completions/mean_terminated_length": 168.6875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.2397082783281803, "epoch": 0.12167670217693377, "frac_reward_zero_std": 0.0, "grad_norm": 0.10525412112474442, "kl": 0.002499947266187519, "learning_rate": 9.75673923112552e-07, "loss": -0.099, "num_tokens": 72120266.0, "reward": 0.6049246191978455, "reward_std": 0.31606027483940125, "rewards/reward_func/mean": 0.6049246191978455, "rewards/reward_func/std": 0.31606027483940125, "step": 2627, "step_time": 20.56935388967395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 188.8125, "completions/mean_terminated_length": 188.8125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.41700972616672516, "epoch": 0.12172301991662807, "frac_reward_zero_std": 0.0, "grad_norm": 0.1298043578863144, "kl": 0.007502533379010856, "learning_rate": 9.756646595646132e-07, "loss": -0.1065, "num_tokens": 72147207.0, "reward": 0.11742663383483887, "reward_std": 0.32087063789367676, "rewards/reward_func/mean": 0.11742663383483887, "rewards/reward_func/std": 0.32087066769599915, "step": 2628, "step_time": 22.719747003167868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 196.4375, "completions/mean_terminated_length": 196.4375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.2458793967962265, "epoch": 0.12176933765632238, "frac_reward_zero_std": 1.0, "grad_norm": 0.003123057307675481, "kl": 0.0017426459526177496, "learning_rate": 9.756553960166743e-07, "loss": 0.0001, "num_tokens": 72184718.0, "reward": 0.6761743426322937, "reward_std": 0.0, "rewards/reward_func/mean": 0.6761743426322937, "rewards/reward_func/std": 0.0, "step": 2629, "step_time": 23.518625486642122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 200.25, "completions/mean_terminated_length": 200.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3304623067378998, "epoch": 0.12181565539601667, "frac_reward_zero_std": 1.0, "grad_norm": 0.020174020901322365, "kl": 0.004885137430392206, "learning_rate": 9.756461324687354e-07, "loss": 0.0002, "num_tokens": 72224226.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2630, "step_time": 26.105417896062136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 194.8125, "completions/mean_terminated_length": 194.8125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.39290958642959595, "epoch": 0.12186197313571098, "frac_reward_zero_std": 1.0, "grad_norm": 0.001977944280952215, "kl": 0.002162375603802502, "learning_rate": 9.756368689207967e-07, "loss": 0.0001, "num_tokens": 72283327.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2631, "step_time": 30.047461956739426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 142.4375, "completions/mean_terminated_length": 142.4375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.1866447888314724, "epoch": 0.12190829087540528, "frac_reward_zero_std": 0.0, "grad_norm": 0.1311248540878296, "kl": 0.004332931013777852, "learning_rate": 9.756276053728577e-07, "loss": -0.0128, "num_tokens": 72304950.0, "reward": 0.9241602420806885, "reward_std": 0.10709454119205475, "rewards/reward_func/mean": 0.9241602420806885, "rewards/reward_func/std": 0.10709454119205475, "step": 2632, "step_time": 15.393566634505987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 227.3125, "completions/mean_terminated_length": 227.3125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.38326458632946014, "epoch": 0.12195460861509959, "frac_reward_zero_std": 1.0, "grad_norm": 0.005797903053462505, "kl": 0.004306967603042722, "learning_rate": 9.756183418249188e-07, "loss": 0.0002, "num_tokens": 72334171.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2633, "step_time": 24.80897504463792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 154.0, "completions/mean_terminated_length": 154.0, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3198411762714386, "epoch": 0.12200092635479388, "frac_reward_zero_std": 1.0, "grad_norm": 0.007689288817346096, "kl": 0.0034146554535254836, "learning_rate": 9.756090782769801e-07, "loss": 0.0002, "num_tokens": 72369195.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2634, "step_time": 19.71334460005164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 192.375, "completions/mean_terminated_length": 192.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.27632561698555946, "epoch": 0.1220472440944882, "frac_reward_zero_std": 0.0, "grad_norm": 0.10215043276548386, "kl": 0.005216853343881667, "learning_rate": 9.755998147290412e-07, "loss": -0.0255, "num_tokens": 72391185.0, "reward": 0.5789077281951904, "reward_std": 0.3379634916782379, "rewards/reward_func/mean": 0.5789077281951904, "rewards/reward_func/std": 0.3379634916782379, "step": 2635, "step_time": 21.428825974464417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 140.625, "completions/mean_terminated_length": 140.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3496779501438141, "epoch": 0.12209356183418249, "frac_reward_zero_std": 1.0, "grad_norm": 0.00287781935185194, "kl": 0.0020887544378638268, "learning_rate": 9.755905511811024e-07, "loss": 0.0001, "num_tokens": 72414331.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2636, "step_time": 15.327082812786102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 200.625, "completions/mean_terminated_length": 200.625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.24473798647522926, "epoch": 0.1221398795738768, "frac_reward_zero_std": 0.0, "grad_norm": 0.0794215276837349, "kl": 0.005211790441535413, "learning_rate": 9.755812876331635e-07, "loss": -0.0117, "num_tokens": 72435925.0, "reward": 0.9945688247680664, "reward_std": 0.021724820137023926, "rewards/reward_func/mean": 0.9945688247680664, "rewards/reward_func/std": 0.021724820137023926, "step": 2637, "step_time": 20.33620259165764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 139.3125, "completions/mean_terminated_length": 139.3125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.23275908082723618, "epoch": 0.1221861973135711, "frac_reward_zero_std": 1.0, "grad_norm": 0.002283212961629033, "kl": 0.0019031875126529485, "learning_rate": 9.755720240852246e-07, "loss": 0.0001, "num_tokens": 72455546.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2638, "step_time": 15.039844371378422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 189.3125, "completions/mean_terminated_length": 189.3125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.2202717587351799, "epoch": 0.1222325150532654, "frac_reward_zero_std": 1.0, "grad_norm": 0.003921009600162506, "kl": 0.0030107529601082206, "learning_rate": 9.755627605372857e-07, "loss": 0.0002, "num_tokens": 72480735.0, "reward": 0.9000876545906067, "reward_std": 0.0, "rewards/reward_func/mean": 0.9000876545906067, "rewards/reward_func/std": 0.0, "step": 2639, "step_time": 20.762874558568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 154.6875, "completions/mean_terminated_length": 154.6875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.1303444765508175, "epoch": 0.1222788327929597, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010071864817291498, "kl": 0.0007015015071374364, "learning_rate": 9.755534969893469e-07, "loss": 0.0, "num_tokens": 72513690.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 2640, "step_time": 19.09148909151554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 168.1875, "completions/mean_terminated_length": 168.1875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.162997767329216, "epoch": 0.12232515053265401, "frac_reward_zero_std": 1.0, "grad_norm": 0.00249607115983963, "kl": 0.0019170307787135243, "learning_rate": 9.75544233441408e-07, "loss": 0.0001, "num_tokens": 72537901.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2641, "step_time": 17.687778558582067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 110.25, "completions/mean_terminated_length": 110.25, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2281123660504818, "epoch": 0.12237146827234831, "frac_reward_zero_std": 1.0, "grad_norm": 0.002711585024371743, "kl": 0.0018292743479833007, "learning_rate": 9.755349698934691e-07, "loss": 0.0001, "num_tokens": 72557601.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2642, "step_time": 13.201411411166191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2339770272374153, "epoch": 0.12241778601204262, "frac_reward_zero_std": 1.0, "grad_norm": 0.008354433812201023, "kl": 0.004901603446342051, "learning_rate": 9.755257063455302e-07, "loss": 0.0002, "num_tokens": 72577185.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2643, "step_time": 14.315678246319294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 163.1875, "completions/mean_terminated_length": 163.1875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.37413283437490463, "epoch": 0.12246410375173691, "frac_reward_zero_std": 1.0, "grad_norm": 0.002193914959207177, "kl": 0.002328070200746879, "learning_rate": 9.755164427975916e-07, "loss": 0.0001, "num_tokens": 72609316.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2644, "step_time": 18.856964860111475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 187.25, "completions/mean_terminated_length": 187.25, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.36122526973485947, "epoch": 0.12251042149143122, "frac_reward_zero_std": 0.0, "grad_norm": 0.1051226556301117, "kl": 0.007154420600272715, "learning_rate": 9.755071792496525e-07, "loss": -0.1066, "num_tokens": 72630632.0, "reward": 0.09804060310125351, "reward_std": 0.26789790391921997, "rewards/reward_func/mean": 0.09804060310125351, "rewards/reward_func/std": 0.26789796352386475, "step": 2645, "step_time": 24.785052228718996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 180.625, "completions/mean_terminated_length": 180.625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.18327955156564713, "epoch": 0.12255673923112552, "frac_reward_zero_std": 1.0, "grad_norm": 0.001266221865080297, "kl": 0.0010286837787134573, "learning_rate": 9.754979157017136e-07, "loss": 0.0001, "num_tokens": 72663506.0, "reward": 0.8668779134750366, "reward_std": 0.0, "rewards/reward_func/mean": 0.8668779134750366, "rewards/reward_func/std": 0.0, "step": 2646, "step_time": 21.732679691165686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 164.875, "completions/mean_terminated_length": 164.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.23567169904708862, "epoch": 0.12260305697081983, "frac_reward_zero_std": 0.0, "grad_norm": 0.09320887923240662, "kl": 0.002246777032269165, "learning_rate": 9.754886521537747e-07, "loss": -0.0223, "num_tokens": 72688000.0, "reward": 0.24785666167736053, "reward_std": 0.013929652981460094, "rewards/reward_func/mean": 0.24785666167736053, "rewards/reward_func/std": 0.013929653912782669, "step": 2647, "step_time": 19.900317683815956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 194.3125, "completions/mean_terminated_length": 194.3125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.21999873220920563, "epoch": 0.12264937471051413, "frac_reward_zero_std": 0.0, "grad_norm": 0.13728246092796326, "kl": 0.004719353746622801, "learning_rate": 9.75479388605836e-07, "loss": -0.0452, "num_tokens": 72709285.0, "reward": 0.9377368092536926, "reward_std": 0.2490527629852295, "rewards/reward_func/mean": 0.9377368092536926, "rewards/reward_func/std": 0.2490527629852295, "step": 2648, "step_time": 20.027044255286455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 179.0625, "completions/mean_terminated_length": 179.0625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.20912526175379753, "epoch": 0.12269569245020844, "frac_reward_zero_std": 1.0, "grad_norm": 0.003093697363510728, "kl": 0.0020216996781527996, "learning_rate": 9.754701250578972e-07, "loss": 0.0001, "num_tokens": 72742998.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2649, "step_time": 22.07695698738098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 189.375, "completions/mean_terminated_length": 189.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.33261389285326004, "epoch": 0.12274201018990273, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024126721546053886, "kl": 0.0023820858041290194, "learning_rate": 9.754608615099583e-07, "loss": 0.0001, "num_tokens": 72783276.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2650, "step_time": 24.320692989975214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 125.0625, "completions/mean_terminated_length": 125.0625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3045237138867378, "epoch": 0.12278832792959704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012520633172243834, "kl": 0.0013071002904325724, "learning_rate": 9.754515979620195e-07, "loss": 0.0001, "num_tokens": 72807533.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2651, "step_time": 15.852440193295479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 218.4375, "completions/mean_terminated_length": 218.4375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.15901795402169228, "epoch": 0.12283464566929134, "frac_reward_zero_std": 1.0, "grad_norm": 0.003128107637166977, "kl": 0.0022057093447074294, "learning_rate": 9.754423344140806e-07, "loss": 0.0001, "num_tokens": 72832084.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2652, "step_time": 21.951898373663425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 134.6875, "completions/mean_terminated_length": 134.6875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.30614544451236725, "epoch": 0.12288096340898565, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032622588332742453, "kl": 0.0024048115592449903, "learning_rate": 9.754330708661417e-07, "loss": 0.0001, "num_tokens": 72853919.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2653, "step_time": 15.294451046735048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 185.1875, "completions/mean_terminated_length": 185.1875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.4151734560728073, "epoch": 0.12292728114867994, "frac_reward_zero_std": 1.0, "grad_norm": 0.01059812679886818, "kl": 0.004364458902273327, "learning_rate": 9.754238073182028e-07, "loss": 0.0002, "num_tokens": 72878610.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2654, "step_time": 20.326975125819445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 248.8125, "completions/mean_terminated_length": 248.8125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.21510307118296623, "epoch": 0.12297359888837425, "frac_reward_zero_std": 0.0, "grad_norm": 0.09212217479944229, "kl": 0.007700381334871054, "learning_rate": 9.75414543770264e-07, "loss": -0.0667, "num_tokens": 72906239.0, "reward": 0.8717219829559326, "reward_std": 0.137044757604599, "rewards/reward_func/mean": 0.8717219829559326, "rewards/reward_func/std": 0.137044757604599, "step": 2655, "step_time": 25.168050318956375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 139.8125, "completions/mean_terminated_length": 139.8125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.17870675027370453, "epoch": 0.12301991662806855, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011638562427833676, "kl": 0.0007753275713184848, "learning_rate": 9.75405280222325e-07, "loss": 0.0, "num_tokens": 72941644.0, "reward": 0.8007373809814453, "reward_std": 0.0, "rewards/reward_func/mean": 0.8007373809814453, "rewards/reward_func/std": 0.0, "step": 2656, "step_time": 18.49212707579136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 168.4375, "completions/mean_terminated_length": 168.4375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.27530380338430405, "epoch": 0.12306623436776286, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025134237948805094, "kl": 0.0016946768737398088, "learning_rate": 9.753960166743862e-07, "loss": 0.0001, "num_tokens": 72966051.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2657, "step_time": 17.530892979353666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 117.75, "completions/mean_terminated_length": 117.75, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.30782726407051086, "epoch": 0.12311255210745715, "frac_reward_zero_std": 1.0, "grad_norm": 0.004912033211439848, "kl": 0.002673172624781728, "learning_rate": 9.753867531264473e-07, "loss": 0.0001, "num_tokens": 72987167.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2658, "step_time": 13.192748345434666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 127.375, "completions/mean_terminated_length": 127.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.32113301008939743, "epoch": 0.12315886984715146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024710255675017834, "kl": 0.0023215124383568764, "learning_rate": 9.753774895785085e-07, "loss": 0.0001, "num_tokens": 73015877.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2659, "step_time": 15.505543787032366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 149.0, "completions/mean_terminated_length": 149.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.23406724631786346, "epoch": 0.12320518758684576, "frac_reward_zero_std": 1.0, "grad_norm": 0.004036585334688425, "kl": 0.002077404933515936, "learning_rate": 9.753682260305696e-07, "loss": 0.0001, "num_tokens": 73035989.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2660, "step_time": 15.954175382852554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 136.875, "completions/mean_terminated_length": 136.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3086076006293297, "epoch": 0.12325150532654007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027811750769615173, "kl": 0.002006874361541122, "learning_rate": 9.75358962482631e-07, "loss": 0.0001, "num_tokens": 73060579.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2661, "step_time": 16.9653382524848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 134.375, "completions/mean_terminated_length": 134.375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.26886285096406937, "epoch": 0.12329782306623437, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034270649775862694, "kl": 0.0019199447997380048, "learning_rate": 9.75349698934692e-07, "loss": 0.0001, "num_tokens": 73080089.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2662, "step_time": 13.628558360040188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 157.0, "completions/mean_terminated_length": 157.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.2314070351421833, "epoch": 0.12334414080592868, "frac_reward_zero_std": 1.0, "grad_norm": 0.002566870767623186, "kl": 0.001634041196666658, "learning_rate": 9.75340435386753e-07, "loss": 0.0001, "num_tokens": 73105721.0, "reward": 0.024372844025492668, "reward_std": 0.0, "rewards/reward_func/mean": 0.024372844025492668, "rewards/reward_func/std": 0.0, "step": 2663, "step_time": 16.70699230581522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 179.625, "completions/mean_terminated_length": 179.625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.33533792942762375, "epoch": 0.12339045854562297, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024146882351487875, "kl": 0.0016336151165887713, "learning_rate": 9.753311718388143e-07, "loss": 0.0001, "num_tokens": 73135939.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2664, "step_time": 20.23602031543851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 266.9375, "completions/mean_terminated_length": 266.9375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.4119087755680084, "epoch": 0.12343677628531728, "frac_reward_zero_std": 0.0, "grad_norm": 0.07697068154811859, "kl": 0.004283018934074789, "learning_rate": 9.753219082908754e-07, "loss": -0.044, "num_tokens": 73162658.0, "reward": 0.06978751718997955, "reward_std": 0.24942469596862793, "rewards/reward_func/mean": 0.06978751718997955, "rewards/reward_func/std": 0.24942469596862793, "step": 2665, "step_time": 37.28246930614114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 193.4375, "completions/mean_terminated_length": 193.4375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.23821564763784409, "epoch": 0.12348309402501158, "frac_reward_zero_std": 0.0, "grad_norm": 0.08502568304538727, "kl": 0.006176152848638594, "learning_rate": 9.753126447429365e-07, "loss": -0.0676, "num_tokens": 73201129.0, "reward": 0.08006329834461212, "reward_std": 0.1994636505842209, "rewards/reward_func/mean": 0.08006329834461212, "rewards/reward_func/std": 0.19946368038654327, "step": 2666, "step_time": 25.550567347556353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 151.4375, "completions/mean_terminated_length": 151.4375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3212057426571846, "epoch": 0.12352941176470589, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021431755740195513, "kl": 0.0019139970827382058, "learning_rate": 9.753033811949977e-07, "loss": 0.0001, "num_tokens": 73230832.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2667, "step_time": 17.31324952840805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 154.4375, "completions/mean_terminated_length": 154.4375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.46347784250974655, "epoch": 0.12357572950440018, "frac_reward_zero_std": 1.0, "grad_norm": 0.002197732450440526, "kl": 0.001981659181183204, "learning_rate": 9.752941176470588e-07, "loss": 0.0001, "num_tokens": 73273367.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2668, "step_time": 23.228794887661934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 156.4375, "completions/mean_terminated_length": 156.4375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.1714119277894497, "epoch": 0.1236220472440945, "frac_reward_zero_std": 0.0, "grad_norm": 0.12525725364685059, "kl": 0.002197854861151427, "learning_rate": 9.7528485409912e-07, "loss": 0.004, "num_tokens": 73300238.0, "reward": 0.44468438625335693, "reward_std": 0.02321789413690567, "rewards/reward_func/mean": 0.44468438625335693, "rewards/reward_func/std": 0.02321789413690567, "step": 2669, "step_time": 17.755487963557243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 124.6875, "completions/mean_terminated_length": 124.6875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3024565950036049, "epoch": 0.12366836498378879, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014723469503223896, "kl": 0.0016822517500258982, "learning_rate": 9.75275590551181e-07, "loss": 0.0001, "num_tokens": 73320505.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2670, "step_time": 13.288980275392532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 130.6875, "completions/mean_terminated_length": 130.6875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.33811046183109283, "epoch": 0.1237146827234831, "frac_reward_zero_std": 1.0, "grad_norm": 0.005108274519443512, "kl": 0.003483735374175012, "learning_rate": 9.752663270032422e-07, "loss": 0.0002, "num_tokens": 73341844.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2671, "step_time": 14.610954966396093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 255.5625, "completions/mean_terminated_length": 255.5625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.43313534557819366, "epoch": 0.1237610004631774, "frac_reward_zero_std": 0.0, "grad_norm": 0.10161624103784561, "kl": 0.0060406067641451955, "learning_rate": 9.752570634553033e-07, "loss": 0.0048, "num_tokens": 73367805.0, "reward": 0.0008797428454272449, "reward_std": 0.0013829093659296632, "rewards/reward_func/mean": 0.0008797428454272449, "rewards/reward_func/std": 0.001382909482344985, "step": 2672, "step_time": 35.404101356863976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.38385750353336334, "epoch": 0.1238073182028717, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034687796141952276, "kl": 0.0028997634071856737, "learning_rate": 9.752477999073644e-07, "loss": 0.0001, "num_tokens": 73400913.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2673, "step_time": 20.9024547226727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 166.875, "completions/mean_terminated_length": 166.875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.3986580818891525, "epoch": 0.123853635942566, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015908145578578115, "kl": 0.0015960049058776349, "learning_rate": 9.752385363594258e-07, "loss": 0.0001, "num_tokens": 73439727.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2674, "step_time": 22.429946400225163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 187.375, "completions/mean_terminated_length": 187.375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.19064150750637054, "epoch": 0.12389995368226031, "frac_reward_zero_std": 1.0, "grad_norm": 0.00739160506054759, "kl": 0.0064025112660601735, "learning_rate": 9.752292728114867e-07, "loss": 0.0003, "num_tokens": 73461829.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2675, "step_time": 19.385738972574472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 164.5, "completions/mean_terminated_length": 164.5, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3214581310749054, "epoch": 0.1239462714219546, "frac_reward_zero_std": 1.0, "grad_norm": 0.005774408113211393, "kl": 0.004261001828126609, "learning_rate": 9.752200092635478e-07, "loss": 0.0002, "num_tokens": 73482093.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2676, "step_time": 17.04783011227846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.2922755032777786, "epoch": 0.12399258916164892, "frac_reward_zero_std": 1.0, "grad_norm": 0.009557011537253857, "kl": 0.006010084645822644, "learning_rate": 9.75210745715609e-07, "loss": 0.0003, "num_tokens": 73502575.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2677, "step_time": 15.541265804320574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 219.875, "completions/mean_terminated_length": 219.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.3992610350251198, "epoch": 0.12403890690134321, "frac_reward_zero_std": 0.0, "grad_norm": 0.10632207989692688, "kl": 0.0053071328438818455, "learning_rate": 9.752014821676703e-07, "loss": -0.0821, "num_tokens": 73537949.0, "reward": 0.34610211849212646, "reward_std": 0.4094793498516083, "rewards/reward_func/mean": 0.34610211849212646, "rewards/reward_func/std": 0.4094793200492859, "step": 2678, "step_time": 25.628438390791416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 187.375, "completions/mean_terminated_length": 187.375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.23493099212646484, "epoch": 0.12408522464103752, "frac_reward_zero_std": 0.0, "grad_norm": 0.11918675154447556, "kl": 0.012475994299165905, "learning_rate": 9.751922186197314e-07, "loss": -0.0148, "num_tokens": 73559283.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 2679, "step_time": 20.169689398258924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 191.375, "completions/mean_terminated_length": 191.375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.24099867045879364, "epoch": 0.12413154238073182, "frac_reward_zero_std": 1.0, "grad_norm": 0.18033720552921295, "kl": 0.018772387644276023, "learning_rate": 9.751829550717925e-07, "loss": 0.0011, "num_tokens": 73593225.0, "reward": 0.3162277638912201, "reward_std": 0.0, "rewards/reward_func/mean": 0.3162277638912201, "rewards/reward_func/std": 0.0, "step": 2680, "step_time": 25.221865363419056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 184.375, "completions/mean_terminated_length": 184.375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.36500103771686554, "epoch": 0.12417786012042613, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018618119647726417, "kl": 0.0018729929579421878, "learning_rate": 9.751736915238536e-07, "loss": 0.0001, "num_tokens": 73626159.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2681, "step_time": 23.625973116606474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 113.3125, "completions/mean_terminated_length": 113.3125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3030073195695877, "epoch": 0.12422417786012042, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038442956283688545, "kl": 0.002038826758507639, "learning_rate": 9.751644279759148e-07, "loss": 0.0001, "num_tokens": 73647604.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2682, "step_time": 13.847161881625652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 180.0625, "completions/mean_terminated_length": 180.0625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.3768431022763252, "epoch": 0.12427049559981473, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017164384480565786, "kl": 0.0014914613857399672, "learning_rate": 9.751551644279759e-07, "loss": 0.0001, "num_tokens": 73679717.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2683, "step_time": 25.22353618964553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 174.3125, "completions/mean_terminated_length": 174.3125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.24410822242498398, "epoch": 0.12431681333950903, "frac_reward_zero_std": 1.0, "grad_norm": 0.01254366897046566, "kl": 0.008021327201277018, "learning_rate": 9.75145900880037e-07, "loss": 0.0004, "num_tokens": 73701018.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2684, "step_time": 18.110782250761986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 188.75, "completions/mean_terminated_length": 188.75, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.26619939506053925, "epoch": 0.12436313107920334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031394518446177244, "kl": 0.0026092990301549435, "learning_rate": 9.751366373320981e-07, "loss": 0.0001, "num_tokens": 73735654.0, "reward": 0.0949237272143364, "reward_std": 0.0, "rewards/reward_func/mean": 0.0949237272143364, "rewards/reward_func/std": 0.0, "step": 2685, "step_time": 21.56949655711651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 287.625, "completions/mean_terminated_length": 287.625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.31667467951774597, "epoch": 0.12440944881889764, "frac_reward_zero_std": 0.0, "grad_norm": 0.07866465300321579, "kl": 0.0056940873619169, "learning_rate": 9.751273737841593e-07, "loss": -0.0308, "num_tokens": 73769792.0, "reward": 0.786245584487915, "reward_std": 0.20967189967632294, "rewards/reward_func/mean": 0.786245584487915, "rewards/reward_func/std": 0.20967191457748413, "step": 2686, "step_time": 33.06748655810952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 157.9375, "completions/mean_terminated_length": 157.9375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.45203348994255066, "epoch": 0.12445576655859195, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024263125378638506, "kl": 0.002125900180544704, "learning_rate": 9.751181102362206e-07, "loss": 0.0001, "num_tokens": 73808991.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2687, "step_time": 20.177612725645304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.19452517479658127, "epoch": 0.12450208429828624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014004954136908054, "kl": 0.0011580679565668106, "learning_rate": 9.751088466882815e-07, "loss": 0.0001, "num_tokens": 73832031.0, "reward": 0.9459594488143921, "reward_std": 0.0, "rewards/reward_func/mean": 0.9459594488143921, "rewards/reward_func/std": 0.0, "step": 2688, "step_time": 19.230411875993013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 149.5625, "completions/mean_terminated_length": 149.5625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.310242123901844, "epoch": 0.12454840203798055, "frac_reward_zero_std": 1.0, "grad_norm": 0.003193320706486702, "kl": 0.002363665000302717, "learning_rate": 9.750995831403426e-07, "loss": 0.0001, "num_tokens": 73858328.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2689, "step_time": 17.874194260686636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 122.0625, "completions/mean_terminated_length": 122.0625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2829183302819729, "epoch": 0.12459471977767485, "frac_reward_zero_std": 1.0, "grad_norm": 0.00409874739125371, "kl": 0.002622196276206523, "learning_rate": 9.750903195924037e-07, "loss": 0.0001, "num_tokens": 73877753.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2690, "step_time": 13.463900413364172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 290.8125, "completions/mean_terminated_length": 290.8125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.211647417396307, "epoch": 0.12464103751736916, "frac_reward_zero_std": 0.0, "grad_norm": 0.09355708211660385, "kl": 0.008499447256326675, "learning_rate": 9.75081056044465e-07, "loss": -0.0077, "num_tokens": 73902870.0, "reward": 0.9208402633666992, "reward_std": 0.0006037076818756759, "rewards/reward_func/mean": 0.9208402633666992, "rewards/reward_func/std": 0.0006037129205651581, "step": 2691, "step_time": 27.374474808573723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 203.375, "completions/mean_terminated_length": 203.375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.3093295618891716, "epoch": 0.12468735525706345, "frac_reward_zero_std": 0.0, "grad_norm": 0.13858255743980408, "kl": 0.005437292158603668, "learning_rate": 9.750717924965262e-07, "loss": -0.0444, "num_tokens": 73927404.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 2692, "step_time": 21.022662118077278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.125, "completions/mean_terminated_length": 124.125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.20365708321332932, "epoch": 0.12473367299675776, "frac_reward_zero_std": 1.0, "grad_norm": 0.003993342164903879, "kl": 0.0024475062964484096, "learning_rate": 9.750625289485873e-07, "loss": 0.0001, "num_tokens": 73946686.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2693, "step_time": 12.865562237799168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 219.9375, "completions/mean_terminated_length": 219.9375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.23746725171804428, "epoch": 0.12477999073645206, "frac_reward_zero_std": 0.0, "grad_norm": 0.0872751995921135, "kl": 0.005896298564039171, "learning_rate": 9.750532654006485e-07, "loss": 0.0241, "num_tokens": 73982685.0, "reward": 0.7184863090515137, "reward_std": 0.30690306425094604, "rewards/reward_func/mean": 0.7184863090515137, "rewards/reward_func/std": 0.30690306425094604, "step": 2694, "step_time": 24.768761537969112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 174.4375, "completions/mean_terminated_length": 174.4375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3785416856408119, "epoch": 0.12482630847614637, "frac_reward_zero_std": 1.0, "grad_norm": 0.004713712725788355, "kl": 0.003453952958807349, "learning_rate": 9.750440018527096e-07, "loss": 0.0002, "num_tokens": 74004596.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2695, "step_time": 18.08088992908597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 134.5625, "completions/mean_terminated_length": 134.5625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.31142277270555496, "epoch": 0.12487262621584067, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019075494492426515, "kl": 0.0015639386547263712, "learning_rate": 9.750347383047707e-07, "loss": 0.0001, "num_tokens": 74025037.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2696, "step_time": 14.804278913885355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 137.625, "completions/mean_terminated_length": 137.625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.30009637773036957, "epoch": 0.12491894395553498, "frac_reward_zero_std": 1.0, "grad_norm": 0.003276183269917965, "kl": 0.0020019312505610287, "learning_rate": 9.750254747568318e-07, "loss": 0.0001, "num_tokens": 74045639.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2697, "step_time": 15.823785934597254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.0, "completions/mean_terminated_length": 121.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.23315386101603508, "epoch": 0.12496526169522927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009364706929773092, "kl": 0.001000961783574894, "learning_rate": 9.75016211208893e-07, "loss": 0.0001, "num_tokens": 74069127.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2698, "step_time": 13.778093438595533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 140.375, "completions/mean_terminated_length": 140.375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.33924783021211624, "epoch": 0.12501157943492358, "frac_reward_zero_std": 1.0, "grad_norm": 0.004711588844656944, "kl": 0.0026648000348359346, "learning_rate": 9.75006947660954e-07, "loss": 0.0001, "num_tokens": 74092413.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2699, "step_time": 16.340378548949957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 166.1875, "completions/mean_terminated_length": 166.1875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.44041479378938675, "epoch": 0.12505789717461788, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017850829754024744, "kl": 0.0020732416305691004, "learning_rate": 9.749976841130152e-07, "loss": 0.0001, "num_tokens": 74144048.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2700, "step_time": 24.832873705774546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 130.5625, "completions/mean_terminated_length": 130.5625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3139389604330063, "epoch": 0.12510421491431217, "frac_reward_zero_std": 1.0, "grad_norm": 0.002681613201275468, "kl": 0.00191219849511981, "learning_rate": 9.749884205650763e-07, "loss": 0.0001, "num_tokens": 74165929.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2701, "step_time": 14.726565402001143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 135.625, "completions/mean_terminated_length": 135.625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.25517112016677856, "epoch": 0.1251505326540065, "frac_reward_zero_std": 1.0, "grad_norm": 0.0047874790616333485, "kl": 0.0021214752341620624, "learning_rate": 9.749791570171375e-07, "loss": 0.0001, "num_tokens": 74189331.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2702, "step_time": 15.88565081730485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 173.125, "completions/mean_terminated_length": 173.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.2613317035138607, "epoch": 0.1251968503937008, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034698653034865856, "kl": 0.0033956827828660607, "learning_rate": 9.749698934691986e-07, "loss": 0.0002, "num_tokens": 74211765.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 2703, "step_time": 18.23936701565981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 176.0, "completions/mean_terminated_length": 176.0, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.4038449004292488, "epoch": 0.1252431681333951, "frac_reward_zero_std": 1.0, "grad_norm": 0.005769040901213884, "kl": 0.0036217946326360106, "learning_rate": 9.7496062992126e-07, "loss": 0.0002, "num_tokens": 74245845.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2704, "step_time": 20.69897275790572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 141.9375, "completions/mean_terminated_length": 141.9375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.2649060934782028, "epoch": 0.12528948587308938, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013550998410210013, "kl": 0.0011289288086118177, "learning_rate": 9.74951366373321e-07, "loss": 0.0001, "num_tokens": 74279236.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2705, "step_time": 18.662751965224743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 179.3125, "completions/mean_terminated_length": 179.3125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.35424934327602386, "epoch": 0.1253358036127837, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028632325120270252, "kl": 0.002004824666073546, "learning_rate": 9.74942102825382e-07, "loss": 0.0001, "num_tokens": 74320233.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2706, "step_time": 23.352882966399193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 158.5, "completions/mean_terminated_length": 158.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3750879764556885, "epoch": 0.125382121352478, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013110886793583632, "kl": 0.0014111300115473568, "learning_rate": 9.74932839277443e-07, "loss": 0.0001, "num_tokens": 74354417.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2707, "step_time": 19.81010114774108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 227.25, "completions/mean_terminated_length": 227.25, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.19847723096609116, "epoch": 0.1254284390921723, "frac_reward_zero_std": 1.0, "grad_norm": 0.008015139028429985, "kl": 0.00550473207840696, "learning_rate": 9.749235757295044e-07, "loss": 0.0003, "num_tokens": 74380533.0, "reward": 0.9775290489196777, "reward_std": 0.0, "rewards/reward_func/mean": 0.9775290489196777, "rewards/reward_func/std": 0.0, "step": 2708, "step_time": 21.945704523473978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 198.9375, "completions/mean_terminated_length": 198.9375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.4454718679189682, "epoch": 0.1254747568318666, "frac_reward_zero_std": 1.0, "grad_norm": 0.004729503765702248, "kl": 0.003273056587204337, "learning_rate": 9.749143121815655e-07, "loss": 0.0002, "num_tokens": 74406804.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2709, "step_time": 20.904861342161894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 125.5625, "completions/mean_terminated_length": 125.5625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2619287669658661, "epoch": 0.12552107457156092, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012338546803221107, "kl": 0.0011804668611148372, "learning_rate": 9.749050486336267e-07, "loss": 0.0001, "num_tokens": 74434381.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2710, "step_time": 15.598942276090384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 123.25, "completions/mean_terminated_length": 123.25, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.28496143221855164, "epoch": 0.12556739231125522, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021542469039559364, "kl": 0.001996227045310661, "learning_rate": 9.748957850856878e-07, "loss": 0.0001, "num_tokens": 74454577.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2711, "step_time": 15.838134922087193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 156.5625, "completions/mean_terminated_length": 156.5625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.2400508113205433, "epoch": 0.1256137100509495, "frac_reward_zero_std": 1.0, "grad_norm": 0.006706647574901581, "kl": 0.004638098063878715, "learning_rate": 9.74886521537749e-07, "loss": 0.0002, "num_tokens": 74475914.0, "reward": 0.8242367506027222, "reward_std": 0.0, "rewards/reward_func/mean": 0.8242367506027222, "rewards/reward_func/std": 0.0, "step": 2712, "step_time": 17.55512172728777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 191.3125, "completions/mean_terminated_length": 191.3125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.23226720094680786, "epoch": 0.1256600277906438, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018562600016593933, "kl": 0.0015250964497681707, "learning_rate": 9.7487725798981e-07, "loss": 0.0001, "num_tokens": 74499311.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2713, "step_time": 19.46345605701208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 199.4375, "completions/mean_terminated_length": 199.4375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.43358640372753143, "epoch": 0.12570634553033813, "frac_reward_zero_std": 0.0, "grad_norm": 0.08696886897087097, "kl": 0.003077430708799511, "learning_rate": 9.748679944418712e-07, "loss": 0.0547, "num_tokens": 74525078.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 2714, "step_time": 23.7141883186996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 163.75, "completions/mean_terminated_length": 163.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.3527919724583626, "epoch": 0.12575266327003243, "frac_reward_zero_std": 1.0, "grad_norm": 0.006981628946959972, "kl": 0.0041662033763714135, "learning_rate": 9.748587308939323e-07, "loss": 0.0002, "num_tokens": 74551490.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2715, "step_time": 20.072610300034285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 156.25, "completions/mean_terminated_length": 156.25, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.35750482976436615, "epoch": 0.12579898100972672, "frac_reward_zero_std": 1.0, "grad_norm": 0.005137501284480095, "kl": 0.0038633313379250467, "learning_rate": 9.748494673459934e-07, "loss": 0.0002, "num_tokens": 74571638.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2716, "step_time": 16.836354076862335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 157.9375, "completions/mean_terminated_length": 157.9375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.42012248933315277, "epoch": 0.12584529874942102, "frac_reward_zero_std": 1.0, "grad_norm": 0.003115291241556406, "kl": 0.002565705159213394, "learning_rate": 9.748402037980545e-07, "loss": 0.0001, "num_tokens": 74592533.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2717, "step_time": 16.934609431773424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 194.625, "completions/mean_terminated_length": 194.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.3938866928219795, "epoch": 0.12589161648911534, "frac_reward_zero_std": 0.0, "grad_norm": 0.11733521521091461, "kl": 0.005466120084747672, "learning_rate": 9.748309402501157e-07, "loss": -0.0075, "num_tokens": 74630127.0, "reward": 0.20493784546852112, "reward_std": 0.3782358467578888, "rewards/reward_func/mean": 0.20493784546852112, "rewards/reward_func/std": 0.3782358765602112, "step": 2718, "step_time": 27.114779822528362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 130.125, "completions/mean_terminated_length": 130.125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.24157185852527618, "epoch": 0.12593793422880964, "frac_reward_zero_std": 1.0, "grad_norm": 0.003785189939662814, "kl": 0.0017986957682296634, "learning_rate": 9.748216767021768e-07, "loss": 0.0001, "num_tokens": 74649697.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2719, "step_time": 14.310607746243477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 119.5, "completions/mean_terminated_length": 119.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.31972789764404297, "epoch": 0.12598425196850394, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024709198623895645, "kl": 0.0017789184930734336, "learning_rate": 9.74812413154238e-07, "loss": 0.0001, "num_tokens": 74672169.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2720, "step_time": 13.845789514482021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 175.25, "completions/mean_terminated_length": 175.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.26165006309747696, "epoch": 0.12603056970819823, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035965414717793465, "kl": 0.002663189312443137, "learning_rate": 9.748031496062993e-07, "loss": 0.0001, "num_tokens": 74693933.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2721, "step_time": 17.649345494806767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 121.5, "completions/mean_terminated_length": 121.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2825714349746704, "epoch": 0.12607688744789255, "frac_reward_zero_std": 1.0, "grad_norm": 0.00237577548250556, "kl": 0.0020054624474141747, "learning_rate": 9.747938860583604e-07, "loss": 0.0001, "num_tokens": 74720757.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2722, "step_time": 14.613527905195951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 143.6875, "completions/mean_terminated_length": 143.6875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.22826269268989563, "epoch": 0.12612320518758685, "frac_reward_zero_std": 1.0, "grad_norm": 0.00878122914582491, "kl": 0.00345516212109942, "learning_rate": 9.747846225104215e-07, "loss": 0.0002, "num_tokens": 74743328.0, "reward": 0.894839346408844, "reward_std": 0.0, "rewards/reward_func/mean": 0.894839346408844, "rewards/reward_func/std": 0.0, "step": 2723, "step_time": 16.88019995391369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 144.25, "completions/mean_terminated_length": 144.25, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.29919153451919556, "epoch": 0.12616952292728115, "frac_reward_zero_std": 1.0, "grad_norm": 0.005003768019378185, "kl": 0.0022477228776551783, "learning_rate": 9.747753589624826e-07, "loss": 0.0001, "num_tokens": 74779412.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2724, "step_time": 18.6339902728796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 123.25, "completions/mean_terminated_length": 123.25, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.24651046097278595, "epoch": 0.12621584066697544, "frac_reward_zero_std": 1.0, "grad_norm": 0.004378539510071278, "kl": 0.0019843547779601067, "learning_rate": 9.747660954145438e-07, "loss": 0.0001, "num_tokens": 74798776.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2725, "step_time": 13.920933213084936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 138.3125, "completions/mean_terminated_length": 138.3125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.26773545891046524, "epoch": 0.12626215840666977, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020556682720780373, "kl": 0.0015064789913594723, "learning_rate": 9.747568318666049e-07, "loss": 0.0001, "num_tokens": 74820461.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2726, "step_time": 16.31049646437168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 150.1875, "completions/mean_terminated_length": 150.1875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3176315873861313, "epoch": 0.12630847614636406, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022769500501453876, "kl": 0.0014689369418192655, "learning_rate": 9.74747568318666e-07, "loss": 0.0001, "num_tokens": 74851584.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2727, "step_time": 18.583209179341793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 127.25, "completions/mean_terminated_length": 127.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3354658707976341, "epoch": 0.12635479388605836, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017642880557104945, "kl": 0.0014925982104614377, "learning_rate": 9.747383047707271e-07, "loss": 0.0001, "num_tokens": 74878756.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2728, "step_time": 15.775880549103022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 205.875, "completions/mean_terminated_length": 205.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.38607513904571533, "epoch": 0.12640111162575265, "frac_reward_zero_std": 0.0, "grad_norm": 0.12646599113941193, "kl": 0.0063538794638589025, "learning_rate": 9.747290412227883e-07, "loss": -0.164, "num_tokens": 74901282.0, "reward": 0.38645732402801514, "reward_std": 0.4928838312625885, "rewards/reward_func/mean": 0.38645732402801514, "rewards/reward_func/std": 0.4928838610649109, "step": 2729, "step_time": 24.98566211387515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 193.25, "completions/mean_terminated_length": 193.25, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.2186572328209877, "epoch": 0.12644742936544698, "frac_reward_zero_std": 0.0, "grad_norm": 0.10843881964683533, "kl": 0.0016772676608525217, "learning_rate": 9.747197776748494e-07, "loss": -0.019, "num_tokens": 74931030.0, "reward": 0.9276199340820312, "reward_std": 0.050811730325222015, "rewards/reward_func/mean": 0.9276199340820312, "rewards/reward_func/std": 0.05081172287464142, "step": 2730, "step_time": 20.78960970044136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 189.25, "completions/mean_terminated_length": 189.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.2189238965511322, "epoch": 0.12649374710514127, "frac_reward_zero_std": 0.0, "grad_norm": 0.11478737741708755, "kl": 0.002773190673906356, "learning_rate": 9.747105141269105e-07, "loss": -0.0107, "num_tokens": 74961258.0, "reward": 0.29867756366729736, "reward_std": 0.14011363685131073, "rewards/reward_func/mean": 0.29867756366729736, "rewards/reward_func/std": 0.14011362195014954, "step": 2731, "step_time": 20.48701937869191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 174.75, "completions/mean_terminated_length": 174.75, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.1975894197821617, "epoch": 0.12654006484483557, "frac_reward_zero_std": 0.0, "grad_norm": 0.14189352095127106, "kl": 0.0022395200503524393, "learning_rate": 9.747012505789716e-07, "loss": 0.0182, "num_tokens": 74991046.0, "reward": 0.9431997537612915, "reward_std": 0.015146732330322266, "rewards/reward_func/mean": 0.9431997537612915, "rewards/reward_func/std": 0.015146732330322266, "step": 2732, "step_time": 20.003262981772423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 184.3125, "completions/mean_terminated_length": 184.3125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.1383097842335701, "epoch": 0.12658638258452987, "frac_reward_zero_std": 1.0, "grad_norm": 0.005424704868346453, "kl": 0.0015339643141487613, "learning_rate": 9.746919870310328e-07, "loss": 0.0001, "num_tokens": 75014795.0, "reward": 0.8817122578620911, "reward_std": 0.0, "rewards/reward_func/mean": 0.8817122578620911, "rewards/reward_func/std": 0.0, "step": 2733, "step_time": 19.45877280086279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 169.8125, "completions/mean_terminated_length": 169.8125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.23734011128544807, "epoch": 0.1266327003242242, "frac_reward_zero_std": 0.0, "grad_norm": 0.114932119846344, "kl": 0.003702098852954805, "learning_rate": 9.74682723483094e-07, "loss": 0.0547, "num_tokens": 75036504.0, "reward": 0.8614631295204163, "reward_std": 0.23074392974376678, "rewards/reward_func/mean": 0.8614631295204163, "rewards/reward_func/std": 0.23074392974376678, "step": 2734, "step_time": 18.45914290472865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 224.3125, "completions/mean_terminated_length": 224.3125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.4532436206936836, "epoch": 0.12667901806391849, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021933862008154392, "kl": 0.002340391860343516, "learning_rate": 9.746734599351552e-07, "loss": 0.0001, "num_tokens": 75057789.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2735, "step_time": 26.696069829165936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 247.125, "completions/mean_terminated_length": 247.125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.32509417086839676, "epoch": 0.12672533580361278, "frac_reward_zero_std": 0.0, "grad_norm": 0.07923617213964462, "kl": 0.005715687992051244, "learning_rate": 9.746641963872163e-07, "loss": -0.0206, "num_tokens": 75086783.0, "reward": 0.26315927505493164, "reward_std": 0.3508790135383606, "rewards/reward_func/mean": 0.26315927505493164, "rewards/reward_func/std": 0.3508790135383606, "step": 2736, "step_time": 28.975221525877714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 208.125, "completions/mean_terminated_length": 208.125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.17860662564635277, "epoch": 0.12677165354330708, "frac_reward_zero_std": 0.0, "grad_norm": 0.12073924392461777, "kl": 0.005260401172563434, "learning_rate": 9.746549328392773e-07, "loss": -0.0395, "num_tokens": 75123793.0, "reward": 0.5995413064956665, "reward_std": 0.20553939044475555, "rewards/reward_func/mean": 0.5995413064956665, "rewards/reward_func/std": 0.20553939044475555, "step": 2737, "step_time": 23.80088046193123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 133.5625, "completions/mean_terminated_length": 133.5625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3274148404598236, "epoch": 0.1268179712830014, "frac_reward_zero_std": 1.0, "grad_norm": 0.003784723812714219, "kl": 0.0029381680069491267, "learning_rate": 9.746456692913386e-07, "loss": 0.0001, "num_tokens": 75146986.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2738, "step_time": 14.571484304964542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 161.4375, "completions/mean_terminated_length": 161.4375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.34107372909784317, "epoch": 0.1268642890226957, "frac_reward_zero_std": 1.0, "grad_norm": 0.012313502840697765, "kl": 0.007771961740218103, "learning_rate": 9.746364057433997e-07, "loss": 0.0004, "num_tokens": 75173985.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2739, "step_time": 22.20461354777217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 181.1875, "completions/mean_terminated_length": 181.1875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.3508002907037735, "epoch": 0.12691060676239, "frac_reward_zero_std": 1.0, "grad_norm": 0.004565728362649679, "kl": 0.0035977655788883567, "learning_rate": 9.746271421954608e-07, "loss": 0.0002, "num_tokens": 75195332.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2740, "step_time": 19.280513919889927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 204.6875, "completions/mean_terminated_length": 204.6875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.44792069494724274, "epoch": 0.1269569245020843, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020373642910271883, "kl": 0.0022946663666516542, "learning_rate": 9.74617878647522e-07, "loss": 0.0001, "num_tokens": 75216655.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2741, "step_time": 22.429401483386755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.38756484538316727, "epoch": 0.1270032422417786, "frac_reward_zero_std": 1.0, "grad_norm": 0.012684179469943047, "kl": 0.011083966586738825, "learning_rate": 9.74608615099583e-07, "loss": 0.0005, "num_tokens": 75237313.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2742, "step_time": 18.614252384752035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 262.6875, "completions/mean_terminated_length": 262.6875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.21404609829187393, "epoch": 0.1270495599814729, "frac_reward_zero_std": 1.0, "grad_norm": 0.006447000429034233, "kl": 0.0054478979436680675, "learning_rate": 9.745993515516442e-07, "loss": 0.0003, "num_tokens": 75272684.0, "reward": 0.687289297580719, "reward_std": 0.0, "rewards/reward_func/mean": 0.687289297580719, "rewards/reward_func/std": 0.0, "step": 2743, "step_time": 28.914159759879112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 157.6875, "completions/mean_terminated_length": 157.6875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.20668013021349907, "epoch": 0.1270958777211672, "frac_reward_zero_std": 1.0, "grad_norm": 0.00394948897883296, "kl": 0.0021256963373161852, "learning_rate": 9.745900880037053e-07, "loss": 0.0001, "num_tokens": 75293399.0, "reward": 0.2817692756652832, "reward_std": 0.0, "rewards/reward_func/mean": 0.2817692756652832, "rewards/reward_func/std": 0.0, "step": 2744, "step_time": 16.90724340826273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 186.4375, "completions/mean_terminated_length": 186.4375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3060798943042755, "epoch": 0.1271421954608615, "frac_reward_zero_std": 0.0, "grad_norm": 0.09296601265668869, "kl": 0.0064141659531742334, "learning_rate": 9.745808244557665e-07, "loss": 0.056, "num_tokens": 75316430.0, "reward": 0.1687968224287033, "reward_std": 0.19767460227012634, "rewards/reward_func/mean": 0.1687968224287033, "rewards/reward_func/std": 0.19767460227012634, "step": 2745, "step_time": 20.155911333858967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 117.0625, "completions/mean_terminated_length": 117.0625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2240525707602501, "epoch": 0.12718851320055582, "frac_reward_zero_std": 1.0, "grad_norm": 0.020976727828383446, "kl": 0.004377532051876187, "learning_rate": 9.745715609078276e-07, "loss": 0.0002, "num_tokens": 75335615.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2746, "step_time": 14.6689806394279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 193.75, "completions/mean_terminated_length": 193.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.39229030907154083, "epoch": 0.12723483094025012, "frac_reward_zero_std": 1.0, "grad_norm": 0.002199604408815503, "kl": 0.002196538494899869, "learning_rate": 9.745622973598887e-07, "loss": 0.0001, "num_tokens": 75392315.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2747, "step_time": 29.404766894876957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 185.5, "completions/mean_terminated_length": 185.5, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.35546116530895233, "epoch": 0.12728114867994442, "frac_reward_zero_std": 0.0, "grad_norm": 0.11322785168886185, "kl": 0.0035763249616138637, "learning_rate": 9.7455303381195e-07, "loss": -0.0153, "num_tokens": 75420019.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 2748, "step_time": 21.715355332940817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 164.25, "completions/mean_terminated_length": 164.25, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.1605934351682663, "epoch": 0.1273274664196387, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016634712228551507, "kl": 0.000984200305538252, "learning_rate": 9.74543770264011e-07, "loss": 0.0, "num_tokens": 75454071.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 2749, "step_time": 18.8595955632627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 132.25, "completions/mean_terminated_length": 132.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.397569015622139, "epoch": 0.12737378415933304, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038190491031855345, "kl": 0.0030509496573358774, "learning_rate": 9.74534506716072e-07, "loss": 0.0002, "num_tokens": 75485019.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2750, "step_time": 17.797695234417915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 168.5625, "completions/mean_terminated_length": 168.5625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.35855329781770706, "epoch": 0.12742010189902733, "frac_reward_zero_std": 1.0, "grad_norm": 0.003678632667288184, "kl": 0.002897813101299107, "learning_rate": 9.745252431681334e-07, "loss": 0.0001, "num_tokens": 75505780.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2751, "step_time": 19.002348955720663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 140.3125, "completions/mean_terminated_length": 140.3125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.29185565561056137, "epoch": 0.12746641963872163, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033416072838008404, "kl": 0.0021536298736464232, "learning_rate": 9.745159796201946e-07, "loss": 0.0001, "num_tokens": 75526985.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2752, "step_time": 16.202499013394117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 182.6875, "completions/mean_terminated_length": 182.6875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.17059067636728287, "epoch": 0.12751273737841592, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015629567205905914, "kl": 0.0010609637683955953, "learning_rate": 9.745067160722557e-07, "loss": 0.0001, "num_tokens": 75574436.0, "reward": 0.9000876545906067, "reward_std": 0.0, "rewards/reward_func/mean": 0.9000876545906067, "rewards/reward_func/std": 0.0, "step": 2753, "step_time": 23.694838035851717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 171.9375, "completions/mean_terminated_length": 171.9375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.14076201617717743, "epoch": 0.12755905511811025, "frac_reward_zero_std": 1.0, "grad_norm": 0.001567707397043705, "kl": 0.0011508175230119377, "learning_rate": 9.744974525243168e-07, "loss": 0.0001, "num_tokens": 75595683.0, "reward": 0.9555630087852478, "reward_std": 0.0, "rewards/reward_func/mean": 0.9555630087852478, "rewards/reward_func/std": 0.0, "step": 2754, "step_time": 16.34590784087777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 157.3125, "completions/mean_terminated_length": 157.3125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4654070883989334, "epoch": 0.12760537285780454, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027254438027739525, "kl": 0.0026476834318600595, "learning_rate": 9.74488188976378e-07, "loss": 0.0001, "num_tokens": 75630792.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2755, "step_time": 20.714604150503874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 215.8125, "completions/mean_terminated_length": 215.8125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.22874604910612106, "epoch": 0.12765169059749884, "frac_reward_zero_std": 0.0, "grad_norm": 0.09279012680053711, "kl": 0.007919730502180755, "learning_rate": 9.74478925428439e-07, "loss": -0.0144, "num_tokens": 75652917.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 2756, "step_time": 20.54040264710784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 124.1875, "completions/mean_terminated_length": 124.1875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2796595022082329, "epoch": 0.12769800833719314, "frac_reward_zero_std": 1.0, "grad_norm": 0.001744032371789217, "kl": 0.0013569175789598376, "learning_rate": 9.744696618805002e-07, "loss": 0.0001, "num_tokens": 75674504.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2757, "step_time": 13.419739801436663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 122.75, "completions/mean_terminated_length": 122.75, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3256135880947113, "epoch": 0.12774432607688746, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016883771168068051, "kl": 0.0018302177195437253, "learning_rate": 9.744603983325613e-07, "loss": 0.0001, "num_tokens": 75699268.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2758, "step_time": 14.83268203213811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 137.75, "completions/mean_terminated_length": 137.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2979929596185684, "epoch": 0.12779064381658176, "frac_reward_zero_std": 1.0, "grad_norm": 0.001972433878108859, "kl": 0.001491810631705448, "learning_rate": 9.744511347846224e-07, "loss": 0.0001, "num_tokens": 75727760.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2759, "step_time": 16.303051222115755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.36969175189733505, "epoch": 0.12783696155627605, "frac_reward_zero_std": 1.0, "grad_norm": 0.004510269034653902, "kl": 0.003775397897697985, "learning_rate": 9.744418712366836e-07, "loss": 0.0002, "num_tokens": 75750378.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2760, "step_time": 18.004489295184612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 186.8125, "completions/mean_terminated_length": 186.8125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.32959359884262085, "epoch": 0.12788327929597035, "frac_reward_zero_std": 0.0, "grad_norm": 0.11003851890563965, "kl": 0.006389186019077897, "learning_rate": 9.744326076887447e-07, "loss": -0.0182, "num_tokens": 75773303.0, "reward": 0.8806997537612915, "reward_std": 0.23485326766967773, "rewards/reward_func/mean": 0.8806997537612915, "rewards/reward_func/std": 0.23485328257083893, "step": 2761, "step_time": 21.36713733896613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 164.8125, "completions/mean_terminated_length": 164.8125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.2286955863237381, "epoch": 0.12792959703566467, "frac_reward_zero_std": 0.0, "grad_norm": 0.1614483892917633, "kl": 0.012991887633688748, "learning_rate": 9.744233441408058e-07, "loss": -0.0101, "num_tokens": 75794740.0, "reward": 0.9262244701385498, "reward_std": 0.15861256420612335, "rewards/reward_func/mean": 0.9262244701385498, "rewards/reward_func/std": 0.15861256420612335, "step": 2762, "step_time": 16.983039274811745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 124.3125, "completions/mean_terminated_length": 124.3125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2735665291547775, "epoch": 0.12797591477535897, "frac_reward_zero_std": 1.0, "grad_norm": 0.002134870272129774, "kl": 0.0016807553183753043, "learning_rate": 9.74414080592867e-07, "loss": 0.0001, "num_tokens": 75815449.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2763, "step_time": 13.364386174827814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 228.25, "completions/mean_terminated_length": 228.25, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.24697228893637657, "epoch": 0.12802223251505326, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022184564732015133, "kl": 0.0017724183562677354, "learning_rate": 9.744048170449283e-07, "loss": 0.0001, "num_tokens": 75840365.0, "reward": 0.9648571610450745, "reward_std": 0.0, "rewards/reward_func/mean": 0.9648571610450745, "rewards/reward_func/std": 0.0, "step": 2764, "step_time": 22.50494721531868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 127.3125, "completions/mean_terminated_length": 127.3125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2863641083240509, "epoch": 0.12806855025474756, "frac_reward_zero_std": 1.0, "grad_norm": 0.002310654614120722, "kl": 0.0019112858863081783, "learning_rate": 9.743955534969894e-07, "loss": 0.0001, "num_tokens": 75863650.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2765, "step_time": 14.678524300456047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 227.875, "completions/mean_terminated_length": 227.875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.19780412316322327, "epoch": 0.12811486799444188, "frac_reward_zero_std": 1.0, "grad_norm": 0.004414843861013651, "kl": 0.0038772448897361755, "learning_rate": 9.743862899490505e-07, "loss": 0.0002, "num_tokens": 75889392.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2766, "step_time": 21.383998308330774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 126.875, "completions/mean_terminated_length": 126.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2553219012916088, "epoch": 0.12816118573413618, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018279865616932511, "kl": 0.0014974544756114483, "learning_rate": 9.743770264011116e-07, "loss": 0.0001, "num_tokens": 75908750.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2767, "step_time": 14.12143798545003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 180.9375, "completions/mean_terminated_length": 180.9375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.2772565595805645, "epoch": 0.12820750347383048, "frac_reward_zero_std": 0.0, "grad_norm": 0.09665929526090622, "kl": 0.010590121382847428, "learning_rate": 9.743677628531728e-07, "loss": -0.0762, "num_tokens": 75929837.0, "reward": 0.548355758190155, "reward_std": 0.4709661900997162, "rewards/reward_func/mean": 0.548355758190155, "rewards/reward_func/std": 0.4709661900997162, "step": 2768, "step_time": 21.317594226449728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 106.3125, "completions/mean_terminated_length": 106.3125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2962646558880806, "epoch": 0.12825382121352477, "frac_reward_zero_std": 1.0, "grad_norm": 0.0066306861117482185, "kl": 0.0021396586671471596, "learning_rate": 9.743584993052339e-07, "loss": 0.0001, "num_tokens": 75951442.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2769, "step_time": 12.336755074560642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 180.5625, "completions/mean_terminated_length": 180.5625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.3718944415450096, "epoch": 0.1283001389532191, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036362269893288612, "kl": 0.002736643247772008, "learning_rate": 9.74349235757295e-07, "loss": 0.0001, "num_tokens": 75974187.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2770, "step_time": 19.46478195488453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 169.4375, "completions/mean_terminated_length": 169.4375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.39902693778276443, "epoch": 0.1283464566929134, "frac_reward_zero_std": 1.0, "grad_norm": 0.00557924248278141, "kl": 0.0039357165223918855, "learning_rate": 9.743399722093561e-07, "loss": 0.0002, "num_tokens": 76011122.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2771, "step_time": 20.970091186463833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 144.25, "completions/mean_terminated_length": 144.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.21097231283783913, "epoch": 0.1283927744326077, "frac_reward_zero_std": 0.0, "grad_norm": 0.12273252010345459, "kl": 0.0033145806519314647, "learning_rate": 9.743307086614173e-07, "loss": -0.0722, "num_tokens": 76036102.0, "reward": 0.11289691925048828, "reward_std": 0.23656082153320312, "rewards/reward_func/mean": 0.11289691925048828, "rewards/reward_func/std": 0.23656083643436432, "step": 2772, "step_time": 17.545169688761234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 181.5625, "completions/mean_terminated_length": 181.5625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.24039329588413239, "epoch": 0.12843909217230198, "frac_reward_zero_std": 1.0, "grad_norm": 0.004032476805150509, "kl": 0.0030674479785375297, "learning_rate": 9.743214451134784e-07, "loss": 0.0002, "num_tokens": 76057743.0, "reward": 0.31760504841804504, "reward_std": 0.0, "rewards/reward_func/mean": 0.31760504841804504, "rewards/reward_func/std": 0.0, "step": 2773, "step_time": 18.22195716202259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 172.625, "completions/mean_terminated_length": 172.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.37091164290905, "epoch": 0.1284854099119963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025031070690602064, "kl": 0.002361326478421688, "learning_rate": 9.743121815655395e-07, "loss": 0.0001, "num_tokens": 76085465.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2774, "step_time": 20.650371208786964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 165.25, "completions/mean_terminated_length": 165.25, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4116973876953125, "epoch": 0.1285317276516906, "frac_reward_zero_std": 1.0, "grad_norm": 0.002033612225204706, "kl": 0.002104911662172526, "learning_rate": 9.743029180176006e-07, "loss": 0.0001, "num_tokens": 76130317.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2775, "step_time": 22.69060404598713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 178.3125, "completions/mean_terminated_length": 178.3125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.321686789393425, "epoch": 0.1285780453913849, "frac_reward_zero_std": 0.0, "grad_norm": 0.12945276498794556, "kl": 0.006211111904121935, "learning_rate": 9.742936544696618e-07, "loss": 0.0791, "num_tokens": 76151330.0, "reward": 0.008539922535419464, "reward_std": 0.0022773125674575567, "rewards/reward_func/mean": 0.008539922535419464, "rewards/reward_func/std": 0.0022773125674575567, "step": 2776, "step_time": 20.260665953159332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 117.5625, "completions/mean_terminated_length": 117.5625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.29922936111688614, "epoch": 0.1286243631310792, "frac_reward_zero_std": 1.0, "grad_norm": 0.006918784696608782, "kl": 0.0030423023272305727, "learning_rate": 9.742843909217229e-07, "loss": 0.0001, "num_tokens": 76175003.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2777, "step_time": 14.081242229789495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 135.375, "completions/mean_terminated_length": 135.375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.32472098618745804, "epoch": 0.12867068087077352, "frac_reward_zero_std": 1.0, "grad_norm": 0.003332186955958605, "kl": 0.0022369700018316507, "learning_rate": 9.742751273737842e-07, "loss": 0.0001, "num_tokens": 76199105.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2778, "step_time": 15.179923698306084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 233.0, "completions/mean_terminated_length": 233.0, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.3663220778107643, "epoch": 0.12871699861046781, "frac_reward_zero_std": 1.0, "grad_norm": 0.01513900700956583, "kl": 0.011680921306833625, "learning_rate": 9.742658638258453e-07, "loss": 0.0006, "num_tokens": 76226449.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2779, "step_time": 24.165780186653137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.27411989122629166, "epoch": 0.1287633163501621, "frac_reward_zero_std": 0.0, "grad_norm": 0.10122770816087723, "kl": 0.006216954789124429, "learning_rate": 9.742566002779063e-07, "loss": -0.0375, "num_tokens": 76264061.0, "reward": 0.34300726652145386, "reward_std": 0.2744058072566986, "rewards/reward_func/mean": 0.34300726652145386, "rewards/reward_func/std": 0.2744058072566986, "step": 2780, "step_time": 24.751993499696255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 150.125, "completions/mean_terminated_length": 150.125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.1366775520145893, "epoch": 0.1288096340898564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009560997714288533, "kl": 0.0007964491378515959, "learning_rate": 9.742473367299676e-07, "loss": 0.0, "num_tokens": 76285807.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2781, "step_time": 17.288374941796064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 122.3125, "completions/mean_terminated_length": 122.3125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2840088978409767, "epoch": 0.12885595182955073, "frac_reward_zero_std": 1.0, "grad_norm": 0.002545348834246397, "kl": 0.0017528543248772621, "learning_rate": 9.742380731820287e-07, "loss": 0.0001, "num_tokens": 76306932.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2782, "step_time": 13.426001753658056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 119.5, "completions/mean_terminated_length": 119.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.27867213636636734, "epoch": 0.12890226956924503, "frac_reward_zero_std": 1.0, "grad_norm": 0.002323121763765812, "kl": 0.00176383025245741, "learning_rate": 9.742288096340898e-07, "loss": 0.0001, "num_tokens": 76331020.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2783, "step_time": 14.70707942545414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 168.75, "completions/mean_terminated_length": 168.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.19457445666193962, "epoch": 0.12894858730893932, "frac_reward_zero_std": 0.0, "grad_norm": 0.11523640155792236, "kl": 0.00239378004334867, "learning_rate": 9.74219546086151e-07, "loss": -0.0114, "num_tokens": 76354584.0, "reward": 0.9422565698623657, "reward_std": 0.23097383975982666, "rewards/reward_func/mean": 0.9422565698623657, "rewards/reward_func/std": 0.23097383975982666, "step": 2784, "step_time": 18.583884596824646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 134.4375, "completions/mean_terminated_length": 134.4375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.26692158728837967, "epoch": 0.12899490504863362, "frac_reward_zero_std": 1.0, "grad_norm": 0.010094476863741875, "kl": 0.0031349931086879224, "learning_rate": 9.74210282538212e-07, "loss": 0.0002, "num_tokens": 76381455.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2785, "step_time": 16.40631280466914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 139.0625, "completions/mean_terminated_length": 139.0625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2784252278506756, "epoch": 0.12904122278832794, "frac_reward_zero_std": 0.0, "grad_norm": 0.19294673204421997, "kl": 0.004295072401873767, "learning_rate": 9.742010189902732e-07, "loss": 0.0017, "num_tokens": 76416352.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.3415650427341461, "step": 2786, "step_time": 19.347070194780827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 146.9375, "completions/mean_terminated_length": 146.9375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.38738276809453964, "epoch": 0.12908754052802224, "frac_reward_zero_std": 1.0, "grad_norm": 0.003739331616088748, "kl": 0.0035139237297698855, "learning_rate": 9.741917554423343e-07, "loss": 0.0002, "num_tokens": 76448223.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2787, "step_time": 18.148137751966715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 159.4375, "completions/mean_terminated_length": 159.4375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.19509608671069145, "epoch": 0.12913385826771653, "frac_reward_zero_std": 1.0, "grad_norm": 0.003527343040332198, "kl": 0.001734732068143785, "learning_rate": 9.741824918943955e-07, "loss": 0.0001, "num_tokens": 76470230.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2788, "step_time": 15.995738919824362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 167.375, "completions/mean_terminated_length": 167.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.343943752348423, "epoch": 0.12918017600741083, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020726339425891638, "kl": 0.001907685917103663, "learning_rate": 9.741732283464566e-07, "loss": 0.0001, "num_tokens": 76492156.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2789, "step_time": 18.224940542131662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 175.0625, "completions/mean_terminated_length": 175.0625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.4176969677209854, "epoch": 0.12922649374710515, "frac_reward_zero_std": 1.0, "grad_norm": 0.011609376408159733, "kl": 0.00793652969878167, "learning_rate": 9.741639647985177e-07, "loss": 0.0004, "num_tokens": 76516781.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2790, "step_time": 20.1127092204988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 158.3125, "completions/mean_terminated_length": 158.3125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.36189454793930054, "epoch": 0.12927281148679945, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036154116969555616, "kl": 0.003362747753271833, "learning_rate": 9.74154701250579e-07, "loss": 0.0002, "num_tokens": 76548994.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2791, "step_time": 19.28479090332985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 189.1875, "completions/mean_terminated_length": 189.1875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.4514002129435539, "epoch": 0.12931912922649375, "frac_reward_zero_std": 1.0, "grad_norm": 0.006092879921197891, "kl": 0.004910267889499664, "learning_rate": 9.7414543770264e-07, "loss": 0.0002, "num_tokens": 76594005.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2792, "step_time": 27.47137390822172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 144.625, "completions/mean_terminated_length": 144.625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.35142024606466293, "epoch": 0.12936544696618804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029565657023340464, "kl": 0.002456948481267318, "learning_rate": 9.74136174154701e-07, "loss": 0.0001, "num_tokens": 76617183.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2793, "step_time": 17.76403970271349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 166.0625, "completions/mean_terminated_length": 166.0625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.290935218334198, "epoch": 0.12941176470588237, "frac_reward_zero_std": 1.0, "grad_norm": 0.005273391027003527, "kl": 0.0038551719044335186, "learning_rate": 9.741269106067624e-07, "loss": 0.0002, "num_tokens": 76642080.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2794, "step_time": 18.212548714131117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 146.5625, "completions/mean_terminated_length": 146.5625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3950282782316208, "epoch": 0.12945808244557666, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018907836638391018, "kl": 0.0021647471003234386, "learning_rate": 9.741176470588236e-07, "loss": 0.0001, "num_tokens": 76671529.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2795, "step_time": 18.63210655003786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 111.5, "completions/mean_terminated_length": 111.5, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.23798342049121857, "epoch": 0.12950440018527096, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018192932475358248, "kl": 0.0012378752580843866, "learning_rate": 9.741083835108847e-07, "loss": 0.0001, "num_tokens": 76691969.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2796, "step_time": 13.495010670274496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 135.5, "completions/mean_terminated_length": 135.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.20825041830539703, "epoch": 0.12955071792496525, "frac_reward_zero_std": 1.0, "grad_norm": 0.002133691916242242, "kl": 0.0016764855245128274, "learning_rate": 9.740991199629458e-07, "loss": 0.0001, "num_tokens": 76712009.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2797, "step_time": 15.695982314646244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 119.0, "completions/max_terminated_length": 119.0, "completions/mean_length": 109.375, "completions/mean_terminated_length": 109.375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.22557823359966278, "epoch": 0.12959703566465958, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020180430728942156, "kl": 0.0015038259298307821, "learning_rate": 9.74089856415007e-07, "loss": 0.0001, "num_tokens": 76731167.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2798, "step_time": 11.681996446102858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 143.0625, "completions/mean_terminated_length": 143.0625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.14772458374500275, "epoch": 0.12964335340435387, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011993461521342397, "kl": 0.0010793707915581763, "learning_rate": 9.74080592867068e-07, "loss": 0.0001, "num_tokens": 76777584.0, "reward": 0.796358048915863, "reward_std": 0.0, "rewards/reward_func/mean": 0.796358048915863, "rewards/reward_func/std": 0.0, "step": 2799, "step_time": 22.49989054724574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 121.6875, "completions/mean_terminated_length": 121.6875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2562808692455292, "epoch": 0.12968967114404817, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010770438238978386, "kl": 0.0009614722366677597, "learning_rate": 9.740713293191292e-07, "loss": 0.0, "num_tokens": 76806011.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2800, "step_time": 14.480131164193153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 133.625, "completions/mean_terminated_length": 133.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3112131878733635, "epoch": 0.12973598888374246, "frac_reward_zero_std": 1.0, "grad_norm": 0.002840921049937606, "kl": 0.0020887310383841395, "learning_rate": 9.740620657711903e-07, "loss": 0.0001, "num_tokens": 76830293.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2801, "step_time": 15.810532130300999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 127.5, "completions/mean_terminated_length": 127.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3131905794143677, "epoch": 0.1297823066234368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026322880294173956, "kl": 0.0021417181997094303, "learning_rate": 9.740528022232514e-07, "loss": 0.0001, "num_tokens": 76854029.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2802, "step_time": 14.700319416821003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 159.5, "completions/mean_terminated_length": 159.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.19162605702877045, "epoch": 0.12982862436313108, "frac_reward_zero_std": 1.0, "grad_norm": 0.009328178130090237, "kl": 0.005718842032365501, "learning_rate": 9.740435386753126e-07, "loss": 0.0003, "num_tokens": 76884373.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2803, "step_time": 19.34447018429637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 192.375, "completions/mean_terminated_length": 192.375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.3919840082526207, "epoch": 0.12987494210282538, "frac_reward_zero_std": 0.0, "grad_norm": 0.10269218683242798, "kl": 0.005773777607828379, "learning_rate": 9.740342751273739e-07, "loss": -0.1533, "num_tokens": 76909995.0, "reward": 0.10943973064422607, "reward_std": 0.1957717388868332, "rewards/reward_func/mean": 0.10943973064422607, "rewards/reward_func/std": 0.1957717388868332, "step": 2804, "step_time": 22.360922671854496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 128.4375, "completions/mean_terminated_length": 128.4375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.27369359880685806, "epoch": 0.12992125984251968, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034899504389613867, "kl": 0.002289213822223246, "learning_rate": 9.740250115794348e-07, "loss": 0.0001, "num_tokens": 76930626.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2805, "step_time": 14.012876734137535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 144.625, "completions/mean_terminated_length": 144.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.35228947550058365, "epoch": 0.129967577582214, "frac_reward_zero_std": 0.0, "grad_norm": 0.16344957053661346, "kl": 0.006582444650121033, "learning_rate": 9.74015748031496e-07, "loss": -0.104, "num_tokens": 76954972.0, "reward": 0.04720311611890793, "reward_std": 0.1888124644756317, "rewards/reward_func/mean": 0.04720311611890793, "rewards/reward_func/std": 0.1888124793767929, "step": 2806, "step_time": 18.877719160169363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 191.9375, "completions/mean_terminated_length": 191.9375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.39682287722826004, "epoch": 0.1300138953219083, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019119007047265768, "kl": 0.0019481416675262153, "learning_rate": 9.74006484483557e-07, "loss": 0.0001, "num_tokens": 76985995.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2807, "step_time": 21.04789998009801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 224.5625, "completions/mean_terminated_length": 224.5625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.4276200234889984, "epoch": 0.1300602130616026, "frac_reward_zero_std": 0.0, "grad_norm": 0.11104413121938705, "kl": 0.009202176705002785, "learning_rate": 9.739972209356184e-07, "loss": -0.117, "num_tokens": 77018948.0, "reward": 0.0025933170691132545, "reward_std": 0.003972659353166819, "rewards/reward_func/mean": 0.0025933170691132545, "rewards/reward_func/std": 0.003972659818828106, "step": 2808, "step_time": 28.487824976444244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 175.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.40838924795389175, "epoch": 0.1301065308012969, "frac_reward_zero_std": 1.0, "grad_norm": 0.014654034748673439, "kl": 0.006727033876813948, "learning_rate": 9.739879573876795e-07, "loss": 0.0003, "num_tokens": 77048420.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2809, "step_time": 21.227949671447277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 199.25, "completions/mean_terminated_length": 199.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.4511312171816826, "epoch": 0.1301528485409912, "frac_reward_zero_std": 1.0, "grad_norm": 0.002993133617565036, "kl": 0.0028400610899552703, "learning_rate": 9.739786938397406e-07, "loss": 0.0001, "num_tokens": 77074344.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2810, "step_time": 22.26493389904499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 194.4375, "completions/mean_terminated_length": 194.4375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.2299153245985508, "epoch": 0.1301991662806855, "frac_reward_zero_std": 0.0, "grad_norm": 0.1469038426876068, "kl": 0.012816822156310081, "learning_rate": 9.739694302918018e-07, "loss": 0.0382, "num_tokens": 77107743.0, "reward": 0.48003360629081726, "reward_std": 0.23582106828689575, "rewards/reward_func/mean": 0.48003360629081726, "rewards/reward_func/std": 0.23582108318805695, "step": 2811, "step_time": 25.016322780400515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 118.8125, "completions/mean_terminated_length": 118.8125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2408006638288498, "epoch": 0.1302454840203798, "frac_reward_zero_std": 1.0, "grad_norm": 0.00309711042791605, "kl": 0.0018486992630641907, "learning_rate": 9.739601667438629e-07, "loss": 0.0001, "num_tokens": 77127020.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2812, "step_time": 13.587357494980097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 176.0625, "completions/mean_terminated_length": 176.0625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.20126453414559364, "epoch": 0.1302918017600741, "frac_reward_zero_std": 0.0, "grad_norm": 0.12710730731487274, "kl": 0.00843966007232666, "learning_rate": 9.73950903195924e-07, "loss": -0.0621, "num_tokens": 77150157.0, "reward": 0.6890777349472046, "reward_std": 0.19096322357654572, "rewards/reward_func/mean": 0.6890777349472046, "rewards/reward_func/std": 0.19096322357654572, "step": 2813, "step_time": 19.778738137334585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 224.4375, "completions/mean_terminated_length": 224.4375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.48743831366300583, "epoch": 0.13033811949976842, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027950266376137733, "kl": 0.0023856922634877264, "learning_rate": 9.739416396479851e-07, "loss": 0.0001, "num_tokens": 77184212.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2814, "step_time": 25.894040428102016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 178.5625, "completions/mean_terminated_length": 178.5625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.25779133290052414, "epoch": 0.13038443723946272, "frac_reward_zero_std": 0.0, "grad_norm": 0.08911740034818649, "kl": 0.0016971510485745966, "learning_rate": 9.739323761000463e-07, "loss": -0.0071, "num_tokens": 77219549.0, "reward": 0.8511500954627991, "reward_std": 0.0016676903469488025, "rewards/reward_func/mean": 0.8511500954627991, "rewards/reward_func/std": 0.0016676932573318481, "step": 2815, "step_time": 22.849760118871927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 154.0, "completions/mean_terminated_length": 154.0, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.2913820743560791, "epoch": 0.13043075497915702, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025024309288710356, "kl": 0.0018942720780614763, "learning_rate": 9.739231125521074e-07, "loss": 0.0001, "num_tokens": 77249405.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2816, "step_time": 18.067398082464933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 298.3125, "completions/mean_terminated_length": 298.3125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "entropy": 0.35839635133743286, "epoch": 0.1304770727188513, "frac_reward_zero_std": 0.0, "grad_norm": 0.07766842842102051, "kl": 0.004964290070347488, "learning_rate": 9.739138490041685e-07, "loss": -0.1697, "num_tokens": 77289698.0, "reward": 0.23419679701328278, "reward_std": 0.35474202036857605, "rewards/reward_func/mean": 0.23419679701328278, "rewards/reward_func/std": 0.35474202036857605, "step": 2817, "step_time": 38.30240035802126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 132.25, "completions/mean_terminated_length": 132.25, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3360302150249481, "epoch": 0.13052339045854564, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030505871400237083, "kl": 0.002078228397294879, "learning_rate": 9.739045854562296e-07, "loss": 0.0001, "num_tokens": 77325526.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2818, "step_time": 19.013024352490902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 193.4375, "completions/mean_terminated_length": 193.4375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.3325582519173622, "epoch": 0.13056970819823993, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017353898147121072, "kl": 0.001587057369761169, "learning_rate": 9.738953219082908e-07, "loss": 0.0001, "num_tokens": 77352685.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2819, "step_time": 20.33905030414462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 215.375, "completions/mean_terminated_length": 215.375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.2956802621483803, "epoch": 0.13061602593793423, "frac_reward_zero_std": 0.0, "grad_norm": 0.09080685675144196, "kl": 0.005979874520562589, "learning_rate": 9.738860583603519e-07, "loss": 0.0197, "num_tokens": 77376211.0, "reward": 0.1551075428724289, "reward_std": 0.036326441913843155, "rewards/reward_func/mean": 0.1551075428724289, "rewards/reward_func/std": 0.03632644936442375, "step": 2820, "step_time": 21.90963600203395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 150.9375, "completions/mean_terminated_length": 150.9375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.38743866980075836, "epoch": 0.13066234367762852, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015383700374513865, "kl": 0.0016945595853030682, "learning_rate": 9.738767948124132e-07, "loss": 0.0001, "num_tokens": 77419986.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2821, "step_time": 21.644740346819162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 162.8125, "completions/mean_terminated_length": 162.8125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.26039203628897667, "epoch": 0.13070866141732285, "frac_reward_zero_std": 0.0, "grad_norm": 0.11434976756572723, "kl": 0.0033649375254753977, "learning_rate": 9.738675312644744e-07, "loss": -0.0345, "num_tokens": 77441775.0, "reward": 0.9293943643569946, "reward_std": 0.035030219703912735, "rewards/reward_func/mean": 0.9293943643569946, "rewards/reward_func/std": 0.03503022342920303, "step": 2822, "step_time": 18.52336959913373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 128.4375, "completions/mean_terminated_length": 128.4375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3027849420905113, "epoch": 0.13075497915701714, "frac_reward_zero_std": 1.0, "grad_norm": 0.005467102862894535, "kl": 0.0026550963521003723, "learning_rate": 9.738582677165353e-07, "loss": 0.0001, "num_tokens": 77465526.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2823, "step_time": 15.159585032612085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 169.4375, "completions/mean_terminated_length": 169.4375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.25463734567165375, "epoch": 0.13080129689671144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0048914761282503605, "kl": 0.002183649019571021, "learning_rate": 9.738490041685966e-07, "loss": 0.0001, "num_tokens": 77486445.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2824, "step_time": 17.311506897211075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 199.0, "completions/mean_terminated_length": 199.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3090377524495125, "epoch": 0.13084761463640573, "frac_reward_zero_std": 0.0, "grad_norm": 0.23248831927776337, "kl": 0.005472452496178448, "learning_rate": 9.738397406206577e-07, "loss": -0.019, "num_tokens": 77512237.0, "reward": 0.4834849238395691, "reward_std": 0.4851493239402771, "rewards/reward_func/mean": 0.4834849238395691, "rewards/reward_func/std": 0.4851492941379547, "step": 2825, "step_time": 21.344312380999327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 116.0625, "completions/mean_terminated_length": 116.0625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2596679925918579, "epoch": 0.13089393237610006, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020094362553209066, "kl": 0.001851470791734755, "learning_rate": 9.738304770727189e-07, "loss": 0.0001, "num_tokens": 77533870.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2826, "step_time": 13.552637655287981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 210.5625, "completions/mean_terminated_length": 210.5625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.1898454986512661, "epoch": 0.13094025011579435, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038989358581602573, "kl": 0.002383921411819756, "learning_rate": 9.7382121352478e-07, "loss": 0.0001, "num_tokens": 77571959.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2827, "step_time": 23.55286794155836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 186.125, "completions/mean_terminated_length": 186.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.17277348786592484, "epoch": 0.13098656785548865, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012403902364894748, "kl": 0.0008625778427813202, "learning_rate": 9.73811949976841e-07, "loss": 0.0, "num_tokens": 77604505.0, "reward": 0.8824968934059143, "reward_std": 0.0, "rewards/reward_func/mean": 0.8824968934059143, "rewards/reward_func/std": 0.0, "step": 2828, "step_time": 21.842342231422663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 193.0, "completions/mean_terminated_length": 193.0, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.36300013214349747, "epoch": 0.13103288559518295, "frac_reward_zero_std": 1.0, "grad_norm": 0.005545167252421379, "kl": 0.004219019669108093, "learning_rate": 9.738026864289022e-07, "loss": 0.0002, "num_tokens": 77634169.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2829, "step_time": 21.325227595865726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 143.3125, "completions/mean_terminated_length": 143.3125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3187604248523712, "epoch": 0.13107920333487727, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027888871263712645, "kl": 0.0021906490437686443, "learning_rate": 9.737934228809634e-07, "loss": 0.0001, "num_tokens": 77659806.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2830, "step_time": 16.002289425581694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 152.5, "completions/mean_terminated_length": 152.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.428384892642498, "epoch": 0.13112552107457157, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022219247184693813, "kl": 0.0027715860633179545, "learning_rate": 9.737841593330245e-07, "loss": 0.0001, "num_tokens": 77705062.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2831, "step_time": 22.706428229808807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 137.875, "completions/mean_terminated_length": 137.875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.332115963101387, "epoch": 0.13117183881426586, "frac_reward_zero_std": 1.0, "grad_norm": 0.004188732244074345, "kl": 0.002412432979326695, "learning_rate": 9.737748957850856e-07, "loss": 0.0001, "num_tokens": 77732244.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2832, "step_time": 17.01812907680869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 147.6875, "completions/mean_terminated_length": 147.6875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3853449895977974, "epoch": 0.13121815655396016, "frac_reward_zero_std": 1.0, "grad_norm": 0.004720553755760193, "kl": 0.003017849929165095, "learning_rate": 9.737656322371467e-07, "loss": 0.0002, "num_tokens": 77763391.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2833, "step_time": 17.235878136008978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 127.4375, "completions/mean_terminated_length": 127.4375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.31607072055339813, "epoch": 0.13126447429365448, "frac_reward_zero_std": 1.0, "grad_norm": 0.0046013761311769485, "kl": 0.002206444158218801, "learning_rate": 9.73756368689208e-07, "loss": 0.0001, "num_tokens": 77791814.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2834, "step_time": 15.14806305989623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 128.0, "completions/mean_terminated_length": 128.0, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.29682984203100204, "epoch": 0.13131079203334878, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023793757427483797, "kl": 0.001989031967241317, "learning_rate": 9.73747105141269e-07, "loss": 0.0001, "num_tokens": 77812534.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2835, "step_time": 14.895387556403875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3648112565279007, "epoch": 0.13135710977304307, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023736937437206507, "kl": 0.002105565945385024, "learning_rate": 9.7373784159333e-07, "loss": 0.0001, "num_tokens": 77838938.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2836, "step_time": 17.144540406763554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 175.875, "completions/mean_terminated_length": 175.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.37220189720392227, "epoch": 0.13140342751273737, "frac_reward_zero_std": 0.0, "grad_norm": 0.03227676451206207, "kl": 0.011503327172249556, "learning_rate": 9.737285780453912e-07, "loss": 0.0039, "num_tokens": 77861016.0, "reward": 2.237048465758562e-05, "reward_std": 2.982731348311063e-05, "rewards/reward_func/mean": 2.237048465758562e-05, "rewards/reward_func/std": 2.9827315302100033e-05, "step": 2837, "step_time": 19.155315332114697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 144.625, "completions/mean_terminated_length": 144.625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3582801818847656, "epoch": 0.1314497452524317, "frac_reward_zero_std": 1.0, "grad_norm": 0.003145157126709819, "kl": 0.002566359471529722, "learning_rate": 9.737193144974526e-07, "loss": 0.0001, "num_tokens": 77884482.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2838, "step_time": 16.08520222082734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 171.75, "completions/mean_terminated_length": 171.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.37275414168834686, "epoch": 0.131496062992126, "frac_reward_zero_std": 1.0, "grad_norm": 0.0041363718919456005, "kl": 0.002808906661812216, "learning_rate": 9.737100509495137e-07, "loss": 0.0001, "num_tokens": 77908030.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2839, "step_time": 18.000150248408318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 205.0625, "completions/mean_terminated_length": 205.0625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.2423257678747177, "epoch": 0.13154238073182029, "frac_reward_zero_std": 0.0, "grad_norm": 0.0757722333073616, "kl": 0.003921059018466622, "learning_rate": 9.737007874015748e-07, "loss": -0.0097, "num_tokens": 77938927.0, "reward": 0.6516474485397339, "reward_std": 0.012865008786320686, "rewards/reward_func/mean": 0.6516474485397339, "rewards/reward_func/std": 0.012865009717643261, "step": 2840, "step_time": 24.081785764545202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 396.375, "completions/mean_terminated_length": 396.375, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "entropy": 0.2218848541378975, "epoch": 0.13158869847151458, "frac_reward_zero_std": 0.0, "grad_norm": 0.06507100909948349, "kl": 0.003597293747588992, "learning_rate": 9.73691523853636e-07, "loss": -0.0362, "num_tokens": 77971877.0, "reward": 0.939376711845398, "reward_std": 0.09530484676361084, "rewards/reward_func/mean": 0.939376711845398, "rewards/reward_func/std": 0.09530485421419144, "step": 2841, "step_time": 37.56795885413885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 170.5625, "completions/mean_terminated_length": 170.5625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.2796224169433117, "epoch": 0.1316350162112089, "frac_reward_zero_std": 0.0, "grad_norm": 0.11485463380813599, "kl": 0.006121489219367504, "learning_rate": 9.73682260305697e-07, "loss": -0.0539, "num_tokens": 77993054.0, "reward": 0.9737098217010498, "reward_std": 0.0470292828977108, "rewards/reward_func/mean": 0.9737098217010498, "rewards/reward_func/std": 0.0470292754471302, "step": 2842, "step_time": 18.752591751515865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 110.5625, "completions/mean_terminated_length": 110.5625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2667034789919853, "epoch": 0.1316813339509032, "frac_reward_zero_std": 1.0, "grad_norm": 0.004681343678385019, "kl": 0.0029791942215524614, "learning_rate": 9.736729967577582e-07, "loss": 0.0001, "num_tokens": 78013623.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2843, "step_time": 13.74761088192463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 168.5, "completions/mean_terminated_length": 168.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3632110729813576, "epoch": 0.1317276516905975, "frac_reward_zero_std": 1.0, "grad_norm": 0.005398222245275974, "kl": 0.003484962333459407, "learning_rate": 9.736637332098193e-07, "loss": 0.0002, "num_tokens": 78061759.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2844, "step_time": 23.896695479750633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 159.75, "completions/mean_terminated_length": 159.75, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.45029694586992264, "epoch": 0.1317739694302918, "frac_reward_zero_std": 1.0, "grad_norm": 0.002244369825348258, "kl": 0.0021674801246263087, "learning_rate": 9.736544696618804e-07, "loss": 0.0001, "num_tokens": 78094635.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2845, "step_time": 19.787635374814272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 136.125, "completions/mean_terminated_length": 136.125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.2084420546889305, "epoch": 0.13182028716998612, "frac_reward_zero_std": 1.0, "grad_norm": 0.002307872287929058, "kl": 0.0015559589373879135, "learning_rate": 9.736452061139416e-07, "loss": 0.0001, "num_tokens": 78114221.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2846, "step_time": 13.526204243302345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 264.25, "completions/mean_terminated_length": 264.25, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.25473516434431076, "epoch": 0.1318666049096804, "frac_reward_zero_std": 0.0, "grad_norm": 0.11678625643253326, "kl": 0.005555804818868637, "learning_rate": 9.736359425660027e-07, "loss": -0.0889, "num_tokens": 78146945.0, "reward": 0.9171520471572876, "reward_std": 0.2485218346118927, "rewards/reward_func/mean": 0.9171520471572876, "rewards/reward_func/std": 0.2485218197107315, "step": 2847, "step_time": 28.150619812309742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 140.5625, "completions/mean_terminated_length": 140.5625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2401268556714058, "epoch": 0.1319129226493747, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015420548152178526, "kl": 0.0012628434924408793, "learning_rate": 9.736266790180638e-07, "loss": 0.0001, "num_tokens": 78166586.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2848, "step_time": 15.192064005881548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 154.5, "completions/mean_terminated_length": 154.5, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.1445140577852726, "epoch": 0.131959240389069, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025332460645586252, "kl": 0.0010443386563565582, "learning_rate": 9.73617415470125e-07, "loss": 0.0001, "num_tokens": 78204498.0, "reward": 0.9131007194519043, "reward_std": 0.0, "rewards/reward_func/mean": 0.9131007194519043, "rewards/reward_func/std": 0.0, "step": 2849, "step_time": 19.768490180373192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 170.0625, "completions/mean_terminated_length": 170.0625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4105469658970833, "epoch": 0.13200555812876333, "frac_reward_zero_std": 1.0, "grad_norm": 0.004277004394680262, "kl": 0.0022296886891126633, "learning_rate": 9.73608151922186e-07, "loss": 0.0001, "num_tokens": 78238307.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2850, "step_time": 21.153131306171417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 163.6875, "completions/mean_terminated_length": 163.6875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.3839899152517319, "epoch": 0.13205187586845762, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011443643597885966, "kl": 0.001415493810782209, "learning_rate": 9.735988883742474e-07, "loss": 0.0001, "num_tokens": 78295438.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2851, "step_time": 27.310379087924957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 152.9375, "completions/mean_terminated_length": 152.9375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.33014437556266785, "epoch": 0.13209819360815192, "frac_reward_zero_std": 1.0, "grad_norm": 0.010165599174797535, "kl": 0.00636251294054091, "learning_rate": 9.735896248263085e-07, "loss": 0.0003, "num_tokens": 78316093.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2852, "step_time": 17.826702434569597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 123.875, "completions/mean_terminated_length": 123.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.310026116669178, "epoch": 0.13214451134784622, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032589833717793226, "kl": 0.0018544544582255185, "learning_rate": 9.735803612783696e-07, "loss": 0.0001, "num_tokens": 78351867.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2853, "step_time": 17.5997002273798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 283.125, "completions/mean_terminated_length": 283.125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.19570598751306534, "epoch": 0.13219082908754054, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010641536209732294, "kl": 0.0010448120883665979, "learning_rate": 9.735710977304308e-07, "loss": 0.0001, "num_tokens": 78393581.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2854, "step_time": 30.403636183589697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 138.4375, "completions/mean_terminated_length": 138.4375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.43954967707395554, "epoch": 0.13223714682723484, "frac_reward_zero_std": 1.0, "grad_norm": 0.003814935917034745, "kl": 0.0027921597356908023, "learning_rate": 9.73561834182492e-07, "loss": 0.0001, "num_tokens": 78416532.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2855, "step_time": 15.817701667547226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 119.8125, "completions/mean_terminated_length": 119.8125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2750936970114708, "epoch": 0.13228346456692913, "frac_reward_zero_std": 1.0, "grad_norm": 0.00558044109493494, "kl": 0.0032425089739263058, "learning_rate": 9.73552570634553e-07, "loss": 0.0002, "num_tokens": 78436321.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2856, "step_time": 12.657209232449532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 225.375, "completions/mean_terminated_length": 225.375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.26289578527212143, "epoch": 0.13232978230662343, "frac_reward_zero_std": 1.0, "grad_norm": 0.006055313628166914, "kl": 0.0038973866030573845, "learning_rate": 9.735433070866141e-07, "loss": 0.0002, "num_tokens": 78469447.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2857, "step_time": 25.483021415770054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 115.25, "completions/mean_terminated_length": 115.25, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.29793746024370193, "epoch": 0.13237610004631775, "frac_reward_zero_std": 1.0, "grad_norm": 0.006718107964843512, "kl": 0.0022733150981366634, "learning_rate": 9.735340435386753e-07, "loss": 0.0001, "num_tokens": 78492987.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2858, "step_time": 14.312612924724817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 201.875, "completions/mean_terminated_length": 201.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.2685972824692726, "epoch": 0.13242241778601205, "frac_reward_zero_std": 0.0, "grad_norm": 0.09218121320009232, "kl": 0.0030734979663975537, "learning_rate": 9.735247799907364e-07, "loss": 0.0674, "num_tokens": 78515625.0, "reward": 0.2890387773513794, "reward_std": 0.010135901160538197, "rewards/reward_func/mean": 0.2890387773513794, "rewards/reward_func/std": 0.01013590395450592, "step": 2859, "step_time": 19.88095634058118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 140.6875, "completions/mean_terminated_length": 140.6875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.23037349060177803, "epoch": 0.13246873552570634, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024827327579259872, "kl": 0.0014823552628513426, "learning_rate": 9.735155164427975e-07, "loss": 0.0001, "num_tokens": 78535444.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2860, "step_time": 14.67548917979002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 187.1875, "completions/mean_terminated_length": 187.1875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.261336587369442, "epoch": 0.13251505326540064, "frac_reward_zero_std": 0.0, "grad_norm": 0.1303095519542694, "kl": 0.01158877625130117, "learning_rate": 9.735062528948586e-07, "loss": 0.0107, "num_tokens": 78557415.0, "reward": 0.8026753664016724, "reward_std": 0.4001431465148926, "rewards/reward_func/mean": 0.8026753664016724, "rewards/reward_func/std": 0.4001431465148926, "step": 2861, "step_time": 20.49484769254923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 152.4375, "completions/mean_terminated_length": 152.4375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.20846930146217346, "epoch": 0.13256137100509496, "frac_reward_zero_std": 1.0, "grad_norm": 0.003349486505612731, "kl": 0.0025259426329284906, "learning_rate": 9.734969893469198e-07, "loss": 0.0001, "num_tokens": 78579934.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2862, "step_time": 15.887763421982527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.34081466495990753, "epoch": 0.13260768874478926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015075166011229157, "kl": 0.0018579459574539214, "learning_rate": 9.73487725798981e-07, "loss": 0.0001, "num_tokens": 78599766.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2863, "step_time": 13.782018475234509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 156.0, "completions/mean_terminated_length": 156.0, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.19136565178632736, "epoch": 0.13265400648448356, "frac_reward_zero_std": 0.0, "grad_norm": 0.1502605527639389, "kl": 0.005579829099588096, "learning_rate": 9.734784622510422e-07, "loss": -0.0105, "num_tokens": 78626742.0, "reward": 0.8675934076309204, "reward_std": 0.02813946083188057, "rewards/reward_func/mean": 0.8675934076309204, "rewards/reward_func/std": 0.028139453381299973, "step": 2864, "step_time": 18.511396799236536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 156.5625, "completions/mean_terminated_length": 156.5625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.2968406602740288, "epoch": 0.13270032422417785, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017504510469734669, "kl": 0.0013929185806773603, "learning_rate": 9.734691987031034e-07, "loss": 0.0001, "num_tokens": 78648543.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2865, "step_time": 16.743940446525812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 153.125, "completions/mean_terminated_length": 153.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.40339572727680206, "epoch": 0.13274664196387218, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022827195934951305, "kl": 0.0023920593375805765, "learning_rate": 9.734599351551643e-07, "loss": 0.0001, "num_tokens": 78701713.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2866, "step_time": 23.830079551786184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 196.0, "completions/mean_terminated_length": 196.0, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.17589811980724335, "epoch": 0.13279295970356647, "frac_reward_zero_std": 0.0, "grad_norm": 0.17251649498939514, "kl": 0.021240927278995514, "learning_rate": 9.734506716072254e-07, "loss": -0.0168, "num_tokens": 78724929.0, "reward": 0.9665919542312622, "reward_std": 0.13363230228424072, "rewards/reward_func/mean": 0.9665919542312622, "rewards/reward_func/std": 0.13363230228424072, "step": 2867, "step_time": 19.739026203751564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 126.1875, "completions/mean_terminated_length": 126.1875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2696906700730324, "epoch": 0.13283927744326077, "frac_reward_zero_std": 1.0, "grad_norm": 0.004241105634719133, "kl": 0.00217734751640819, "learning_rate": 9.734414080592867e-07, "loss": 0.0001, "num_tokens": 78745620.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2868, "step_time": 13.344957027584314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 202.5, "completions/mean_terminated_length": 202.5, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.1787184216082096, "epoch": 0.13288559518295506, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020049912855029106, "kl": 0.001621152478037402, "learning_rate": 9.734321445113479e-07, "loss": 0.0001, "num_tokens": 78785516.0, "reward": 0.03273354470729828, "reward_std": 0.0, "rewards/reward_func/mean": 0.03273354470729828, "rewards/reward_func/std": 0.0, "step": 2869, "step_time": 24.221180498600006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 132.3125, "completions/mean_terminated_length": 132.3125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.30913008749485016, "epoch": 0.1329319129226494, "frac_reward_zero_std": 1.0, "grad_norm": 0.004960328806191683, "kl": 0.002564389316830784, "learning_rate": 9.73422880963409e-07, "loss": 0.0001, "num_tokens": 78805377.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2870, "step_time": 14.58257419988513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 174.375, "completions/mean_terminated_length": 174.375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.14191536977887154, "epoch": 0.13297823066234368, "frac_reward_zero_std": 1.0, "grad_norm": 0.001973430858924985, "kl": 0.0014234594709705561, "learning_rate": 9.734136174154701e-07, "loss": 0.0001, "num_tokens": 78828215.0, "reward": 0.904837429523468, "reward_std": 0.0, "rewards/reward_func/mean": 0.904837429523468, "rewards/reward_func/std": 0.0, "step": 2871, "step_time": 18.145116105675697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 306.0, "completions/mean_terminated_length": 306.0, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.19115526601672173, "epoch": 0.13302454840203798, "frac_reward_zero_std": 0.0, "grad_norm": 0.09391110390424728, "kl": 0.005062502284999937, "learning_rate": 9.734043538675312e-07, "loss": -0.1028, "num_tokens": 78855943.0, "reward": 0.7210334539413452, "reward_std": 0.20943774282932281, "rewards/reward_func/mean": 0.7210334539413452, "rewards/reward_func/std": 0.20943774282932281, "step": 2872, "step_time": 29.053698629140854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 160.6875, "completions/mean_terminated_length": 160.6875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.17224779352545738, "epoch": 0.13307086614173227, "frac_reward_zero_std": 1.0, "grad_norm": 0.007179053034633398, "kl": 0.004441243247129023, "learning_rate": 9.733950903195924e-07, "loss": 0.0002, "num_tokens": 78888802.0, "reward": 0.38776010274887085, "reward_std": 0.0, "rewards/reward_func/mean": 0.38776010274887085, "rewards/reward_func/std": 0.0, "step": 2873, "step_time": 18.88411708176136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 191.125, "completions/mean_terminated_length": 191.125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.370157815515995, "epoch": 0.1331171838814266, "frac_reward_zero_std": 1.0, "grad_norm": 0.00280299736186862, "kl": 0.002772213250864297, "learning_rate": 9.733858267716535e-07, "loss": 0.0001, "num_tokens": 78919604.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2874, "step_time": 20.773690421134233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 376.5625, "completions/mean_terminated_length": 376.5625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.33804337680339813, "epoch": 0.1331635016211209, "frac_reward_zero_std": 0.0, "grad_norm": 0.07376828789710999, "kl": 0.004882515640929341, "learning_rate": 9.733765632237146e-07, "loss": -0.2977, "num_tokens": 78953805.0, "reward": 0.07358209788799286, "reward_std": 0.1180819422006607, "rewards/reward_func/mean": 0.07358209788799286, "rewards/reward_func/std": 0.1180819496512413, "step": 2875, "step_time": 53.88939857855439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 175.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.18610315024852753, "epoch": 0.1332098193608152, "frac_reward_zero_std": 1.0, "grad_norm": 0.001507364446297288, "kl": 0.0011621343437582254, "learning_rate": 9.733672996757757e-07, "loss": 0.0001, "num_tokens": 78990909.0, "reward": 0.8751733303070068, "reward_std": 0.0, "rewards/reward_func/mean": 0.8751733303070068, "rewards/reward_func/std": 0.0, "step": 2876, "step_time": 21.398020897060633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 144.25, "completions/mean_terminated_length": 144.25, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.15696774795651436, "epoch": 0.1332561371005095, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017617812845855951, "kl": 0.0013164619158487767, "learning_rate": 9.733580361278369e-07, "loss": 0.0001, "num_tokens": 79027377.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2877, "step_time": 19.813029501587152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.26674485206604004, "epoch": 0.1333024548402038, "frac_reward_zero_std": 1.0, "grad_norm": 0.01314203254878521, "kl": 0.0038373583811335266, "learning_rate": 9.73348772579898e-07, "loss": 0.0002, "num_tokens": 79047697.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2878, "step_time": 15.462935660034418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 244.9375, "completions/mean_terminated_length": 244.9375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.3274817541241646, "epoch": 0.1333487725798981, "frac_reward_zero_std": 0.0, "grad_norm": 0.08200172334909439, "kl": 0.017387705855071545, "learning_rate": 9.733395090319591e-07, "loss": 0.0394, "num_tokens": 79085632.0, "reward": 0.5542079210281372, "reward_std": 0.3619919419288635, "rewards/reward_func/mean": 0.5542079210281372, "rewards/reward_func/std": 0.3619919419288635, "step": 2879, "step_time": 33.15012853220105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 165.5, "completions/mean_terminated_length": 165.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.23618333414196968, "epoch": 0.1333950903195924, "frac_reward_zero_std": 0.0, "grad_norm": 0.09502062201499939, "kl": 0.0038849368575029075, "learning_rate": 9.733302454840202e-07, "loss": -0.0665, "num_tokens": 79113304.0, "reward": 0.018163681030273438, "reward_std": 0.05706524848937988, "rewards/reward_func/mean": 0.018163681030273438, "rewards/reward_func/std": 0.057065244764089584, "step": 2880, "step_time": 19.765874680131674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.18659117072820663, "epoch": 0.1334414080592867, "frac_reward_zero_std": 1.0, "grad_norm": 0.001981121487915516, "kl": 0.0014658432046417147, "learning_rate": 9.733209819360816e-07, "loss": 0.0001, "num_tokens": 79153338.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 2881, "step_time": 21.366734340786934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 140.5, "completions/mean_terminated_length": 140.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.25292740762233734, "epoch": 0.13348772579898102, "frac_reward_zero_std": 1.0, "grad_norm": 0.004230554215610027, "kl": 0.0022894427529536188, "learning_rate": 9.733117183881427e-07, "loss": 0.0001, "num_tokens": 79173026.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2882, "step_time": 14.935656115412712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 243.125, "completions/mean_terminated_length": 243.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.16847683116793633, "epoch": 0.13353404353867532, "frac_reward_zero_std": 0.0, "grad_norm": 0.0687066838145256, "kl": 0.0009277532808482647, "learning_rate": 9.733024548402038e-07, "loss": -0.0133, "num_tokens": 79220244.0, "reward": 0.9978815317153931, "reward_std": 0.008473753929138184, "rewards/reward_func/mean": 0.9978815317153931, "rewards/reward_func/std": 0.008473754860460758, "step": 2883, "step_time": 28.099848553538322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 183.25, "completions/mean_terminated_length": 183.25, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.31974637508392334, "epoch": 0.1335803612783696, "frac_reward_zero_std": 0.0, "grad_norm": 0.0881740152835846, "kl": 0.006744670565240085, "learning_rate": 9.73293191292265e-07, "loss": 0.0348, "num_tokens": 79243304.0, "reward": 0.06255227327346802, "reward_std": 0.24998606741428375, "rewards/reward_func/mean": 0.06255227327346802, "rewards/reward_func/std": 0.24998606741428375, "step": 2884, "step_time": 21.40220644325018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 300.4375, "completions/mean_terminated_length": 300.4375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.1885397471487522, "epoch": 0.1336266790180639, "frac_reward_zero_std": 0.0, "grad_norm": 0.05555471405386925, "kl": 0.0034118599724024534, "learning_rate": 9.73283927744326e-07, "loss": -0.0385, "num_tokens": 79283215.0, "reward": 0.9948967695236206, "reward_std": 0.01394471526145935, "rewards/reward_func/mean": 0.9948967695236206, "rewards/reward_func/std": 0.013944721780717373, "step": 2885, "step_time": 30.627264350652695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 175.625, "completions/mean_terminated_length": 175.625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.1396532915532589, "epoch": 0.13367299675775823, "frac_reward_zero_std": 1.0, "grad_norm": 0.006407846696674824, "kl": 0.024945903103798628, "learning_rate": 9.732746641963872e-07, "loss": 0.0012, "num_tokens": 79312025.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2886, "step_time": 19.252801068127155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 173.125, "completions/mean_terminated_length": 173.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.43986694514751434, "epoch": 0.13371931449745253, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020557146053761244, "kl": 0.0019736994290724397, "learning_rate": 9.732654006484483e-07, "loss": 0.0001, "num_tokens": 79357915.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2887, "step_time": 23.858646240085363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 167.375, "completions/mean_terminated_length": 167.375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.1891889125108719, "epoch": 0.13376563223714683, "frac_reward_zero_std": 1.0, "grad_norm": 0.004160380456596613, "kl": 0.003076967317610979, "learning_rate": 9.732561371005094e-07, "loss": 0.0002, "num_tokens": 79379969.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2888, "step_time": 18.755075678229332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 143.0625, "completions/mean_terminated_length": 143.0625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.33918070793151855, "epoch": 0.13381194997684112, "frac_reward_zero_std": 1.0, "grad_norm": 0.002751731313765049, "kl": 0.0021560149907600135, "learning_rate": 9.732468735525706e-07, "loss": 0.0001, "num_tokens": 79416082.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2889, "step_time": 18.360064450651407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4022962525486946, "epoch": 0.13385826771653545, "frac_reward_zero_std": 1.0, "grad_norm": 0.004516348708420992, "kl": 0.0032430451828986406, "learning_rate": 9.732376100046317e-07, "loss": 0.0002, "num_tokens": 79437456.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2890, "step_time": 19.23727685213089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 149.4375, "completions/mean_terminated_length": 149.4375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.45322011411190033, "epoch": 0.13390458545622974, "frac_reward_zero_std": 1.0, "grad_norm": 0.002992769004777074, "kl": 0.0031186541309580207, "learning_rate": 9.732283464566928e-07, "loss": 0.0002, "num_tokens": 79478807.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2891, "step_time": 20.694837115705013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.37650372087955475, "epoch": 0.13395090319592404, "frac_reward_zero_std": 1.0, "grad_norm": 0.00413138372823596, "kl": 0.002810453821439296, "learning_rate": 9.73219082908754e-07, "loss": 0.0001, "num_tokens": 79505987.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2892, "step_time": 19.3064882196486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 279.0, "completions/mean_terminated_length": 279.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.25057096034288406, "epoch": 0.13399722093561833, "frac_reward_zero_std": 0.0, "grad_norm": 0.0678534209728241, "kl": 0.0033562793396413326, "learning_rate": 9.73209819360815e-07, "loss": -0.0279, "num_tokens": 79531731.0, "reward": 0.6970856189727783, "reward_std": 0.14487607777118683, "rewards/reward_func/mean": 0.6970856189727783, "rewards/reward_func/std": 0.14487609267234802, "step": 2893, "step_time": 25.717489823698997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 160.5625, "completions/mean_terminated_length": 160.5625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.21365992724895477, "epoch": 0.13404353867531266, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014544484438374639, "kl": 0.0009965770732378587, "learning_rate": 9.732005558128764e-07, "loss": 0.0, "num_tokens": 79562892.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2894, "step_time": 19.14187029749155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 127.375, "completions/mean_terminated_length": 127.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.22835980728268623, "epoch": 0.13408985641500695, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021027359180152416, "kl": 0.0016507274995092303, "learning_rate": 9.731912922649375e-07, "loss": 0.0001, "num_tokens": 79583170.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2895, "step_time": 15.572048984467983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 127.1875, "completions/mean_terminated_length": 127.1875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2573355510830879, "epoch": 0.13413617415470125, "frac_reward_zero_std": 1.0, "grad_norm": 0.003158706473186612, "kl": 0.0019519127672538161, "learning_rate": 9.731820287169987e-07, "loss": 0.0001, "num_tokens": 79603029.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2896, "step_time": 14.71007889136672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 144.9375, "completions/mean_terminated_length": 144.9375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2831461504101753, "epoch": 0.13418249189439554, "frac_reward_zero_std": 0.0, "grad_norm": 0.10067969560623169, "kl": 0.003846227133180946, "learning_rate": 9.731727651690596e-07, "loss": -0.0102, "num_tokens": 79625508.0, "reward": 0.9107850790023804, "reward_std": 0.023790644481778145, "rewards/reward_func/mean": 0.9107850790023804, "rewards/reward_func/std": 0.023790642619132996, "step": 2897, "step_time": 16.928314816206694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 179.1875, "completions/mean_terminated_length": 179.1875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.43180882185697556, "epoch": 0.13422880963408987, "frac_reward_zero_std": 1.0, "grad_norm": 0.006570231169462204, "kl": 0.00406917673535645, "learning_rate": 9.73163501621121e-07, "loss": 0.0002, "num_tokens": 79651943.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2898, "step_time": 20.983196452260017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 126.0625, "completions/mean_terminated_length": 126.0625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.3219418525695801, "epoch": 0.13427512737378416, "frac_reward_zero_std": 1.0, "grad_norm": 0.006617399863898754, "kl": 0.002424533828161657, "learning_rate": 9.73154238073182e-07, "loss": 0.0001, "num_tokens": 79678040.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2899, "step_time": 16.985364101827145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 129.5625, "completions/mean_terminated_length": 129.5625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.4039051756262779, "epoch": 0.13432144511347846, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031726134475320578, "kl": 0.002316710742888972, "learning_rate": 9.731449745252432e-07, "loss": 0.0001, "num_tokens": 79710337.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2900, "step_time": 15.91283344477415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 167.125, "completions/mean_terminated_length": 167.125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.22085124626755714, "epoch": 0.13436776285317276, "frac_reward_zero_std": 0.0, "grad_norm": 0.11789902299642563, "kl": 0.0022243360290303826, "learning_rate": 9.731357109773043e-07, "loss": 0.0324, "num_tokens": 79731379.0, "reward": 0.8954626321792603, "reward_std": 0.04066455364227295, "rewards/reward_func/mean": 0.8954626321792603, "rewards/reward_func/std": 0.04066455364227295, "step": 2901, "step_time": 17.55325523391366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 241.1875, "completions/mean_terminated_length": 241.1875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.23715194314718246, "epoch": 0.13441408059286708, "frac_reward_zero_std": 1.0, "grad_norm": 0.006826833356171846, "kl": 0.008999955840408802, "learning_rate": 9.731264474293654e-07, "loss": 0.0004, "num_tokens": 79757510.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2902, "step_time": 26.31505984812975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 119.3125, "completions/mean_terminated_length": 119.3125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3403228744864464, "epoch": 0.13446039833256138, "frac_reward_zero_std": 1.0, "grad_norm": 0.004008160438388586, "kl": 0.0024982955947052687, "learning_rate": 9.731171838814265e-07, "loss": 0.0001, "num_tokens": 79777803.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2903, "step_time": 13.930200260132551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 201.25, "completions/mean_terminated_length": 201.25, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.4237442836165428, "epoch": 0.13450671607225567, "frac_reward_zero_std": 1.0, "grad_norm": 0.005169948097318411, "kl": 0.004285382223315537, "learning_rate": 9.731079203334877e-07, "loss": 0.0002, "num_tokens": 79812303.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2904, "step_time": 23.392813712358475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.49060673266649246, "epoch": 0.13455303381194997, "frac_reward_zero_std": 0.0, "grad_norm": 0.07705912739038467, "kl": 0.0030304257525131106, "learning_rate": 9.730986567855488e-07, "loss": 0.089, "num_tokens": 79837057.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.3415650427341461, "step": 2905, "step_time": 27.92110213637352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 130.3125, "completions/mean_terminated_length": 130.3125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2768367826938629, "epoch": 0.1345993515516443, "frac_reward_zero_std": 1.0, "grad_norm": 0.007510158233344555, "kl": 0.003734195663128048, "learning_rate": 9.7308939323761e-07, "loss": 0.0002, "num_tokens": 79857238.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2906, "step_time": 14.888576116412878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 175.1875, "completions/mean_terminated_length": 175.1875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.269774716347456, "epoch": 0.1346456692913386, "frac_reward_zero_std": 1.0, "grad_norm": 0.011981590650975704, "kl": 0.008176260511390865, "learning_rate": 9.73080129689671e-07, "loss": 0.0004, "num_tokens": 79883369.0, "reward": 0.08882806450128555, "reward_std": 0.0, "rewards/reward_func/mean": 0.08882806450128555, "rewards/reward_func/std": 0.0, "step": 2907, "step_time": 18.90694124996662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 206.6875, "completions/mean_terminated_length": 206.6875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.20256155729293823, "epoch": 0.13469198703103288, "frac_reward_zero_std": 1.0, "grad_norm": 0.006589134223759174, "kl": 0.005259062745608389, "learning_rate": 9.730708661417324e-07, "loss": 0.0003, "num_tokens": 79909540.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2908, "step_time": 20.471765104681253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 236.875, "completions/mean_terminated_length": 236.875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.2770218923687935, "epoch": 0.13473830477072718, "frac_reward_zero_std": 0.0, "grad_norm": 0.0778183862566948, "kl": 0.007008157903328538, "learning_rate": 9.730616025937933e-07, "loss": -0.0185, "num_tokens": 79934066.0, "reward": 0.7777365446090698, "reward_std": 0.27077364921569824, "rewards/reward_func/mean": 0.7777365446090698, "rewards/reward_func/std": 0.27077367901802063, "step": 2909, "step_time": 22.851427253335714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.40292152017354965, "epoch": 0.1347846225104215, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015367643209174275, "kl": 0.0018360107787884772, "learning_rate": 9.730523390458544e-07, "loss": 0.0001, "num_tokens": 79970778.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2910, "step_time": 21.00841509178281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 195.0, "completions/mean_terminated_length": 195.0, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.45231635868549347, "epoch": 0.1348309402501158, "frac_reward_zero_std": 1.0, "grad_norm": 0.012612712569534779, "kl": 0.005916845519095659, "learning_rate": 9.730430754979157e-07, "loss": 0.0003, "num_tokens": 80022730.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2911, "step_time": 27.85970949009061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 232.6875, "completions/mean_terminated_length": 232.6875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.32124925404787064, "epoch": 0.1348772579898101, "frac_reward_zero_std": 0.0, "grad_norm": 0.08090087026357651, "kl": 0.002967926091514528, "learning_rate": 9.730338119499769e-07, "loss": 0.0165, "num_tokens": 80045237.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.44721361994743347, "step": 2912, "step_time": 21.96769331395626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 147.3125, "completions/mean_terminated_length": 147.3125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.24899177998304367, "epoch": 0.1349235757295044, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038116220384836197, "kl": 0.0020863708632532507, "learning_rate": 9.73024548402038e-07, "loss": 0.0001, "num_tokens": 80068650.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2913, "step_time": 16.062062311917543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 185.25, "completions/mean_terminated_length": 185.25, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.4191589877009392, "epoch": 0.13496989346919872, "frac_reward_zero_std": 1.0, "grad_norm": 0.005117417778819799, "kl": 0.003462464897893369, "learning_rate": 9.730152848540991e-07, "loss": 0.0002, "num_tokens": 80098574.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2914, "step_time": 21.375920746475458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 318.625, "completions/mean_terminated_length": 318.625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.29498157650232315, "epoch": 0.135016211208893, "frac_reward_zero_std": 0.0, "grad_norm": 0.0670662447810173, "kl": 0.005866886465810239, "learning_rate": 9.730060213061602e-07, "loss": -0.1551, "num_tokens": 80128648.0, "reward": 0.20415154099464417, "reward_std": 0.15054519474506378, "rewards/reward_func/mean": 0.20415154099464417, "rewards/reward_func/std": 0.15054519474506378, "step": 2915, "step_time": 31.901460755616426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 129.0, "completions/mean_terminated_length": 129.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.29677698016166687, "epoch": 0.1350625289485873, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028735266532748938, "kl": 0.0016636458167340606, "learning_rate": 9.729967577582214e-07, "loss": 0.0001, "num_tokens": 80156136.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2916, "step_time": 16.00371688231826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 123.4375, "completions/mean_terminated_length": 123.4375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.30348125100135803, "epoch": 0.1351088466882816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021966758649796247, "kl": 0.001968748401850462, "learning_rate": 9.729874942102825e-07, "loss": 0.0001, "num_tokens": 80180159.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2917, "step_time": 14.408776670694351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.199592687189579, "epoch": 0.13515516442797593, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033022004645317793, "kl": 0.0021382744307629764, "learning_rate": 9.729782306623436e-07, "loss": 0.0001, "num_tokens": 80200947.0, "reward": 0.8507331609725952, "reward_std": 0.0, "rewards/reward_func/mean": 0.8507331609725952, "rewards/reward_func/std": 0.0, "step": 2918, "step_time": 15.433997303247452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 172.9375, "completions/mean_terminated_length": 172.9375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.40478938817977905, "epoch": 0.13520148216767022, "frac_reward_zero_std": 1.0, "grad_norm": 0.0042166030034422874, "kl": 0.0031098597100935876, "learning_rate": 9.729689671144047e-07, "loss": 0.0002, "num_tokens": 80222322.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2919, "step_time": 17.935936015099287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 192.0625, "completions/mean_terminated_length": 192.0625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.24535804614424706, "epoch": 0.13524779990736452, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034478441812098026, "kl": 0.0026309596141800284, "learning_rate": 9.729597035664659e-07, "loss": 0.0001, "num_tokens": 80243715.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2920, "step_time": 18.642922777682543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 140.75, "completions/mean_terminated_length": 140.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.32133422791957855, "epoch": 0.13529411764705881, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026950486935675144, "kl": 0.002045614004600793, "learning_rate": 9.72950440018527e-07, "loss": 0.0001, "num_tokens": 80264239.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2921, "step_time": 15.687923986464739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.34213197976350784, "epoch": 0.13534043538675314, "frac_reward_zero_std": 1.0, "grad_norm": 0.012444699183106422, "kl": 0.0053047959809191525, "learning_rate": 9.729411764705881e-07, "loss": 0.0003, "num_tokens": 80300755.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2922, "step_time": 20.790783379226923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 141.875, "completions/mean_terminated_length": 141.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.1986936368048191, "epoch": 0.13538675312644743, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037879047449678183, "kl": 0.002654906886164099, "learning_rate": 9.729319129226492e-07, "loss": 0.0001, "num_tokens": 80321153.0, "reward": 0.21578796207904816, "reward_std": 0.0, "rewards/reward_func/mean": 0.21578796207904816, "rewards/reward_func/std": 0.0, "step": 2923, "step_time": 15.273275960236788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 136.375, "completions/mean_terminated_length": 136.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.24271875619888306, "epoch": 0.13543307086614173, "frac_reward_zero_std": 1.0, "grad_norm": 0.003306017490103841, "kl": 0.0020956738444510847, "learning_rate": 9.729226493747106e-07, "loss": 0.0001, "num_tokens": 80340743.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2924, "step_time": 14.392927952110767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.19377048686146736, "epoch": 0.13547938860583603, "frac_reward_zero_std": 0.0, "grad_norm": 0.15734557807445526, "kl": 0.0018305819248780608, "learning_rate": 9.729133858267717e-07, "loss": 0.0111, "num_tokens": 80365977.0, "reward": 0.34278643131256104, "reward_std": 0.007662854623049498, "rewards/reward_func/mean": 0.34278643131256104, "rewards/reward_func/std": 0.007662852294743061, "step": 2925, "step_time": 16.507501907646656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 222.75, "completions/mean_terminated_length": 222.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.2721988260746002, "epoch": 0.13552570634553035, "frac_reward_zero_std": 0.0, "grad_norm": 0.0854281559586525, "kl": 0.006537881796248257, "learning_rate": 9.729041222788328e-07, "loss": 0.058, "num_tokens": 80390437.0, "reward": 0.7393002510070801, "reward_std": 0.006071718409657478, "rewards/reward_func/mean": 0.7393002510070801, "rewards/reward_func/std": 0.006071717012673616, "step": 2926, "step_time": 23.67028560861945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 219.375, "completions/mean_terminated_length": 219.375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.4652087688446045, "epoch": 0.13557202408522465, "frac_reward_zero_std": 1.0, "grad_norm": 0.008797567337751389, "kl": 0.006701090256683528, "learning_rate": 9.72894858730894e-07, "loss": 0.0003, "num_tokens": 80421643.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2927, "step_time": 22.19981612637639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 182.375, "completions/mean_terminated_length": 182.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.17625219747424126, "epoch": 0.13561834182491894, "frac_reward_zero_std": 0.0, "grad_norm": 0.08175390213727951, "kl": 0.001726919668726623, "learning_rate": 9.72885595182955e-07, "loss": 0.0101, "num_tokens": 80444321.0, "reward": 0.9493370056152344, "reward_std": 0.013510131277143955, "rewards/reward_func/mean": 0.9493370056152344, "rewards/reward_func/std": 0.013510138727724552, "step": 2928, "step_time": 18.106609422713518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 240.875, "completions/mean_terminated_length": 240.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.3816415071487427, "epoch": 0.13566465956461324, "frac_reward_zero_std": 0.0, "grad_norm": 0.08109243214130402, "kl": 0.00977090816013515, "learning_rate": 9.728763316350162e-07, "loss": -0.0956, "num_tokens": 80482319.0, "reward": 0.44238895177841187, "reward_std": 0.4039229154586792, "rewards/reward_func/mean": 0.44238895177841187, "rewards/reward_func/std": 0.4039229452610016, "step": 2929, "step_time": 30.66715943813324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 166.4375, "completions/mean_terminated_length": 166.4375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.2888646200299263, "epoch": 0.13571097730430756, "frac_reward_zero_std": 0.0, "grad_norm": 0.12389273196458817, "kl": 0.0038977753720246255, "learning_rate": 9.728670680870773e-07, "loss": -0.0277, "num_tokens": 80502486.0, "reward": 0.5625, "reward_std": 0.5123475193977356, "rewards/reward_func/mean": 0.5625, "rewards/reward_func/std": 0.5123475790023804, "step": 2930, "step_time": 16.247186593711376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 137.125, "completions/mean_terminated_length": 137.125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.32616279274225235, "epoch": 0.13575729504400186, "frac_reward_zero_std": 1.0, "grad_norm": 0.002884137211367488, "kl": 0.0022058217437006533, "learning_rate": 9.728578045391384e-07, "loss": 0.0001, "num_tokens": 80522824.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2931, "step_time": 14.713385853916407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 292.625, "completions/mean_terminated_length": 292.625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.3592124730348587, "epoch": 0.13580361278369615, "frac_reward_zero_std": 0.0, "grad_norm": 0.07550641894340515, "kl": 0.0037497011944651604, "learning_rate": 9.728485409911996e-07, "loss": -0.1445, "num_tokens": 80553986.0, "reward": 0.5031079649925232, "reward_std": 0.40248632431030273, "rewards/reward_func/mean": 0.5031079649925232, "rewards/reward_func/std": 0.4024863541126251, "step": 2932, "step_time": 32.35196726769209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 115.5625, "completions/mean_terminated_length": 115.5625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.27754029631614685, "epoch": 0.13584993052339045, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028542224317789078, "kl": 0.0020283262128941715, "learning_rate": 9.728392774432607e-07, "loss": 0.0001, "num_tokens": 80575435.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2933, "step_time": 13.490512564778328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 149.0, "completions/mean_terminated_length": 149.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3260805085301399, "epoch": 0.13589624826308477, "frac_reward_zero_std": 1.0, "grad_norm": 0.003782173851504922, "kl": 0.002757948881480843, "learning_rate": 9.728300138953218e-07, "loss": 0.0001, "num_tokens": 80598459.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2934, "step_time": 16.095183614641428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 100.8125, "completions/mean_terminated_length": 100.8125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.26286860555410385, "epoch": 0.13594256600277907, "frac_reward_zero_std": 1.0, "grad_norm": 0.003110466292127967, "kl": 0.0018810816982295364, "learning_rate": 9.72820750347383e-07, "loss": 0.0001, "num_tokens": 80617816.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2935, "step_time": 11.666656825691462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 154.3125, "completions/mean_terminated_length": 154.3125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.16198613867163658, "epoch": 0.13598888374247337, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013472113059833646, "kl": 0.0010267670004395768, "learning_rate": 9.72811486799444e-07, "loss": 0.0001, "num_tokens": 80649581.0, "reward": 0.25572916865348816, "reward_std": 0.0, "rewards/reward_func/mean": 0.25572916865348816, "rewards/reward_func/std": 0.0, "step": 2936, "step_time": 18.152310617268085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 152.0, "completions/mean_terminated_length": 152.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.4010363295674324, "epoch": 0.13603520148216766, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021539137233048677, "kl": 0.0020854767644777894, "learning_rate": 9.728022232515052e-07, "loss": 0.0001, "num_tokens": 80691981.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2937, "step_time": 23.469403725117445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 190.4375, "completions/mean_terminated_length": 190.4375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.43749643862247467, "epoch": 0.13608151922186199, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036167947109788656, "kl": 0.003282747173216194, "learning_rate": 9.727929597035665e-07, "loss": 0.0002, "num_tokens": 80715572.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2938, "step_time": 20.30497931689024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 154.9375, "completions/mean_terminated_length": 154.9375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.22568828985095024, "epoch": 0.13612783696155628, "frac_reward_zero_std": 1.0, "grad_norm": 0.001811955007724464, "kl": 0.001611333544133231, "learning_rate": 9.727836961556277e-07, "loss": 0.0001, "num_tokens": 80747731.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2939, "step_time": 18.38596522435546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 142.3125, "completions/mean_terminated_length": 142.3125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.32939816266298294, "epoch": 0.13617415470125058, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026357651222497225, "kl": 0.001972697500605136, "learning_rate": 9.727744326076886e-07, "loss": 0.0001, "num_tokens": 80774312.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2940, "step_time": 16.551889911293983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 134.0, "completions/mean_terminated_length": 134.0, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.30944251269102097, "epoch": 0.13622047244094487, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035243777092546225, "kl": 0.002230473415693268, "learning_rate": 9.7276516905975e-07, "loss": 0.0001, "num_tokens": 80800472.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2941, "step_time": 16.00205109268427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 206.75, "completions/mean_terminated_length": 206.75, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.2383054681122303, "epoch": 0.1362667901806392, "frac_reward_zero_std": 1.0, "grad_norm": 0.010360358282923698, "kl": 0.003712862846441567, "learning_rate": 9.72755905511811e-07, "loss": 0.0002, "num_tokens": 80823892.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2942, "step_time": 20.769436541944742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 173.6875, "completions/mean_terminated_length": 173.6875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.1985713616013527, "epoch": 0.1363131079203335, "frac_reward_zero_std": 0.0, "grad_norm": 0.1353738158941269, "kl": 0.002829553181072697, "learning_rate": 9.727466419638722e-07, "loss": -0.0043, "num_tokens": 80851615.0, "reward": 0.571158766746521, "reward_std": 0.0052941846661269665, "rewards/reward_func/mean": 0.571158766746521, "rewards/reward_func/std": 0.005294173490256071, "step": 2943, "step_time": 19.30748025327921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 173.375, "completions/mean_terminated_length": 173.375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.18440937623381615, "epoch": 0.1363594256600278, "frac_reward_zero_std": 0.0, "grad_norm": 0.10002607852220535, "kl": 0.0036318711936473846, "learning_rate": 9.727373784159333e-07, "loss": -0.0119, "num_tokens": 80879685.0, "reward": 0.9840062260627747, "reward_std": 0.06397509574890137, "rewards/reward_func/mean": 0.9840062260627747, "rewards/reward_func/std": 0.06397509574890137, "step": 2944, "step_time": 18.639881521463394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 195.375, "completions/mean_terminated_length": 195.375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.2200978547334671, "epoch": 0.13640574339972208, "frac_reward_zero_std": 1.0, "grad_norm": 0.003600543364882469, "kl": 0.0021891340729780495, "learning_rate": 9.727281148679944e-07, "loss": 0.0001, "num_tokens": 80901147.0, "reward": 0.6976763010025024, "reward_std": 0.0, "rewards/reward_func/mean": 0.6976763010025024, "rewards/reward_func/std": 0.0, "step": 2945, "step_time": 19.604786157608032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 116.75, "completions/mean_terminated_length": 116.75, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.25810839980840683, "epoch": 0.1364520611394164, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025552993174642324, "kl": 0.0018019027484115213, "learning_rate": 9.727188513200555e-07, "loss": 0.0001, "num_tokens": 80924151.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2946, "step_time": 14.32331271842122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 107.5, "completions/mean_terminated_length": 107.5, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2829029783606529, "epoch": 0.1364983788791107, "frac_reward_zero_std": 1.0, "grad_norm": 0.002424074336886406, "kl": 0.0014853333414066583, "learning_rate": 9.727095877721167e-07, "loss": 0.0001, "num_tokens": 80943903.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2947, "step_time": 12.356964226812124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 193.75, "completions/mean_terminated_length": 193.75, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.24649735167622566, "epoch": 0.136544696618805, "frac_reward_zero_std": 0.0, "grad_norm": 0.09459095448255539, "kl": 0.003821521357167512, "learning_rate": 9.727003242241778e-07, "loss": -0.0151, "num_tokens": 80966347.0, "reward": 0.9810665845870972, "reward_std": 0.02900378406047821, "rewards/reward_func/mean": 0.9810665845870972, "rewards/reward_func/std": 0.02900378406047821, "step": 2948, "step_time": 19.565968200564384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.4957779496908188, "epoch": 0.1365910143584993, "frac_reward_zero_std": 0.0, "grad_norm": 0.11119517683982849, "kl": 0.002874163561500609, "learning_rate": 9.72691060676239e-07, "loss": 0.1147, "num_tokens": 80990121.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 2949, "step_time": 25.593505263328552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 124.9375, "completions/mean_terminated_length": 124.9375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.28258949145674706, "epoch": 0.13663733209819362, "frac_reward_zero_std": 1.0, "grad_norm": 0.003415545215830207, "kl": 0.0023273377446457744, "learning_rate": 9.726817971283e-07, "loss": 0.0001, "num_tokens": 81009608.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2950, "step_time": 13.205769643187523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 438.25, "completions/mean_terminated_length": 438.25, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.16405363380908966, "epoch": 0.13668364983788792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016253968933597207, "kl": 0.001384514122037217, "learning_rate": 9.726725335803614e-07, "loss": 0.0001, "num_tokens": 81045068.0, "reward": 0.6524353623390198, "reward_std": 0.0, "rewards/reward_func/mean": 0.6524353623390198, "rewards/reward_func/std": 0.0, "step": 2951, "step_time": 39.95747434720397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 141.5, "completions/mean_terminated_length": 141.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.29256442934274673, "epoch": 0.1367299675775822, "frac_reward_zero_std": 1.0, "grad_norm": 0.003066990291699767, "kl": 0.0021466032485477626, "learning_rate": 9.726632700324223e-07, "loss": 0.0001, "num_tokens": 81068500.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2952, "step_time": 16.507258500903845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 152.5625, "completions/mean_terminated_length": 152.5625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.2603667229413986, "epoch": 0.1367762853172765, "frac_reward_zero_std": 1.0, "grad_norm": 0.016266319900751114, "kl": 0.0039931878563947976, "learning_rate": 9.726540064844834e-07, "loss": 0.0002, "num_tokens": 81089293.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2953, "step_time": 15.947432920336723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 111.0, "completions/mean_terminated_length": 111.0, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.32477306574583054, "epoch": 0.13682260305697083, "frac_reward_zero_std": 1.0, "grad_norm": 0.005752775352448225, "kl": 0.0026571782655082643, "learning_rate": 9.726447429365445e-07, "loss": 0.0001, "num_tokens": 81113885.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2954, "step_time": 14.212970558553934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 172.125, "completions/mean_terminated_length": 172.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.265098437666893, "epoch": 0.13686892079666513, "frac_reward_zero_std": 1.0, "grad_norm": 0.002817682223394513, "kl": 0.0022179307125043124, "learning_rate": 9.726354793886059e-07, "loss": 0.0001, "num_tokens": 81137215.0, "reward": 0.7376042604446411, "reward_std": 0.0, "rewards/reward_func/mean": 0.7376042604446411, "rewards/reward_func/std": 0.0, "step": 2955, "step_time": 18.107690293341875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 130.3125, "completions/mean_terminated_length": 130.3125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.32519863545894623, "epoch": 0.13691523853635942, "frac_reward_zero_std": 1.0, "grad_norm": 0.002365069929510355, "kl": 0.00207607468473725, "learning_rate": 9.72626215840667e-07, "loss": 0.0001, "num_tokens": 81161124.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2956, "step_time": 14.70854127407074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 261.625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.2559128552675247, "epoch": 0.13696155627605372, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018688369309529662, "kl": 0.0015516452840529382, "learning_rate": 9.726169522927281e-07, "loss": 0.0001, "num_tokens": 81188606.0, "reward": 0.7738244533538818, "reward_std": 0.0, "rewards/reward_func/mean": 0.7738244533538818, "rewards/reward_func/std": 0.0, "step": 2957, "step_time": 26.525179404765368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 205.1875, "completions/mean_terminated_length": 205.1875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3751988708972931, "epoch": 0.13700787401574804, "frac_reward_zero_std": 0.0, "grad_norm": 0.10362628102302551, "kl": 0.005429615383036435, "learning_rate": 9.726076887447892e-07, "loss": -0.1505, "num_tokens": 81210945.0, "reward": 0.31229549646377563, "reward_std": 0.41644027829170227, "rewards/reward_func/mean": 0.31229549646377563, "rewards/reward_func/std": 0.41644027829170227, "step": 2958, "step_time": 24.359685085713863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.23295798897743225, "epoch": 0.13705419175544234, "frac_reward_zero_std": 0.0, "grad_norm": 0.11378776282072067, "kl": 0.012075455160811543, "learning_rate": 9.725984251968504e-07, "loss": -0.0314, "num_tokens": 81248777.0, "reward": 0.9636383056640625, "reward_std": 0.09935915470123291, "rewards/reward_func/mean": 0.9636383056640625, "rewards/reward_func/std": 0.09935914725065231, "step": 2959, "step_time": 24.62277100980282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 137.875, "completions/mean_terminated_length": 137.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3242144212126732, "epoch": 0.13710050949513664, "frac_reward_zero_std": 1.0, "grad_norm": 0.004124994855374098, "kl": 0.0021949128131382167, "learning_rate": 9.725891616489115e-07, "loss": 0.0001, "num_tokens": 81274503.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2960, "step_time": 16.611236076802015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 165.625, "completions/mean_terminated_length": 165.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3944648876786232, "epoch": 0.13714682723483093, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020955982618033886, "kl": 0.00184916183934547, "learning_rate": 9.725798981009726e-07, "loss": 0.0001, "num_tokens": 81309185.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2961, "step_time": 19.850964810699224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 177.125, "completions/mean_terminated_length": 177.125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.22054476290941238, "epoch": 0.13719314497452526, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029245137702673674, "kl": 0.0024541362072341144, "learning_rate": 9.725706345530337e-07, "loss": 0.0001, "num_tokens": 81342803.0, "reward": 0.581777811050415, "reward_std": 0.0, "rewards/reward_func/mean": 0.581777811050415, "rewards/reward_func/std": 0.0, "step": 2962, "step_time": 20.51937496289611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 155.375, "completions/mean_terminated_length": 155.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.23320022225379944, "epoch": 0.13723946271421955, "frac_reward_zero_std": 1.0, "grad_norm": 0.005797238554805517, "kl": 0.0036599887534976006, "learning_rate": 9.725613710050949e-07, "loss": 0.0002, "num_tokens": 81371241.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 2963, "step_time": 17.888550620526075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 289.25, "completions/mean_terminated_length": 289.25, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.30964043736457825, "epoch": 0.13728578045391385, "frac_reward_zero_std": 0.0, "grad_norm": 0.07827834784984589, "kl": 0.005486906738951802, "learning_rate": 9.725521074571562e-07, "loss": -0.1153, "num_tokens": 81411757.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 2964, "step_time": 34.58116399124265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 154.5625, "completions/mean_terminated_length": 154.5625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.21548538655042648, "epoch": 0.13733209819360814, "frac_reward_zero_std": 0.0, "grad_norm": 0.13951675593852997, "kl": 0.007275815587490797, "learning_rate": 9.725428439092171e-07, "loss": -0.0208, "num_tokens": 81441718.0, "reward": 0.3506828844547272, "reward_std": 0.0201385710388422, "rewards/reward_func/mean": 0.3506828844547272, "rewards/reward_func/std": 0.020138569176197052, "step": 2965, "step_time": 17.99277178570628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 148.1875, "completions/mean_terminated_length": 148.1875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3910243958234787, "epoch": 0.13737841593330247, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020106330048292875, "kl": 0.002160381118301302, "learning_rate": 9.725335803612782e-07, "loss": 0.0001, "num_tokens": 81495641.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2966, "step_time": 25.105138290673494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 192.125, "completions/mean_terminated_length": 192.125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.3421904370188713, "epoch": 0.13742473367299676, "frac_reward_zero_std": 1.0, "grad_norm": 0.004232991952449083, "kl": 0.003913190448656678, "learning_rate": 9.725243168133394e-07, "loss": 0.0002, "num_tokens": 81518379.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2967, "step_time": 19.195054043084383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 201.375, "completions/mean_terminated_length": 201.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3975019305944443, "epoch": 0.13747105141269106, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021872776560485363, "kl": 0.0022378937574103475, "learning_rate": 9.725150532654007e-07, "loss": 0.0001, "num_tokens": 81555297.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2968, "step_time": 23.513963136821985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 119.0, "completions/mean_terminated_length": 119.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2754397690296173, "epoch": 0.13751736915238535, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023599236737936735, "kl": 0.0016544149548280984, "learning_rate": 9.725057897174618e-07, "loss": 0.0001, "num_tokens": 81577873.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2969, "step_time": 14.453000880777836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 258.625, "completions/mean_terminated_length": 258.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.28134264796972275, "epoch": 0.13756368689207968, "frac_reward_zero_std": 0.0, "grad_norm": 0.08394055813550949, "kl": 0.003864644793793559, "learning_rate": 9.72496526169523e-07, "loss": 0.1294, "num_tokens": 81605547.0, "reward": 0.6885161995887756, "reward_std": 0.2206767350435257, "rewards/reward_func/mean": 0.6885161995887756, "rewards/reward_func/std": 0.2206767350435257, "step": 2970, "step_time": 27.560300171375275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 156.625, "completions/mean_terminated_length": 156.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.2121870256960392, "epoch": 0.13761000463177397, "frac_reward_zero_std": 1.0, "grad_norm": 0.005776191595941782, "kl": 0.003393293300177902, "learning_rate": 9.72487262621584e-07, "loss": 0.0002, "num_tokens": 81626133.0, "reward": 0.904837429523468, "reward_std": 0.0, "rewards/reward_func/mean": 0.904837429523468, "rewards/reward_func/std": 0.0, "step": 2971, "step_time": 15.72008827328682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 155.6875, "completions/mean_terminated_length": 155.6875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3848320543766022, "epoch": 0.13765632237146827, "frac_reward_zero_std": 1.0, "grad_norm": 0.007267037406563759, "kl": 0.0029213608358986676, "learning_rate": 9.724779990736452e-07, "loss": 0.0001, "num_tokens": 81653872.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2972, "step_time": 19.16296986490488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 143.0625, "completions/mean_terminated_length": 143.0625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.2205166332423687, "epoch": 0.13770264011116257, "frac_reward_zero_std": 1.0, "grad_norm": 0.002539976965636015, "kl": 0.0017141374410130084, "learning_rate": 9.724687355257063e-07, "loss": 0.0001, "num_tokens": 81673601.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2973, "step_time": 13.995499208569527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 132.5, "completions/mean_terminated_length": 132.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.252107672393322, "epoch": 0.1377489578508569, "frac_reward_zero_std": 1.0, "grad_norm": 0.003110184334218502, "kl": 0.0016299804265145212, "learning_rate": 9.724594719777675e-07, "loss": 0.0001, "num_tokens": 81702361.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2974, "step_time": 17.1572062112391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 122.6875, "completions/mean_terminated_length": 122.6875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.30890874564647675, "epoch": 0.1377952755905512, "frac_reward_zero_std": 1.0, "grad_norm": 0.00426195515319705, "kl": 0.0026575836818665266, "learning_rate": 9.724502084298286e-07, "loss": 0.0001, "num_tokens": 81723316.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2975, "step_time": 13.773773342370987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4315086305141449, "epoch": 0.13784159333024548, "frac_reward_zero_std": 1.0, "grad_norm": 0.001540576689876616, "kl": 0.0017092112975660712, "learning_rate": 9.724409448818897e-07, "loss": 0.0001, "num_tokens": 81758476.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2976, "step_time": 20.798981700092554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 129.1875, "completions/mean_terminated_length": 129.1875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.27639901265501976, "epoch": 0.13788791106993978, "frac_reward_zero_std": 1.0, "grad_norm": 0.001724769128486514, "kl": 0.001594531029695645, "learning_rate": 9.724316813339508e-07, "loss": 0.0001, "num_tokens": 81778351.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2977, "step_time": 15.520280051976442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 171.4375, "completions/mean_terminated_length": 171.4375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3911156952381134, "epoch": 0.1379342288096341, "frac_reward_zero_std": 1.0, "grad_norm": 0.004675261210650206, "kl": 0.0030932281515561044, "learning_rate": 9.72422417786012e-07, "loss": 0.0002, "num_tokens": 81804390.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2978, "step_time": 20.01363567262888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 196.6875, "completions/mean_terminated_length": 196.6875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.19053053855895996, "epoch": 0.1379805465493284, "frac_reward_zero_std": 1.0, "grad_norm": 0.00747695891186595, "kl": 0.005894030095078051, "learning_rate": 9.72413154238073e-07, "loss": 0.0003, "num_tokens": 81836609.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2979, "step_time": 22.578228179365396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.37082868814468384, "epoch": 0.1380268642890227, "frac_reward_zero_std": 1.0, "grad_norm": 0.00807334017008543, "kl": 0.006216909969225526, "learning_rate": 9.724038906901342e-07, "loss": 0.0003, "num_tokens": 81861853.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2980, "step_time": 17.812676947563887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 265.9375, "completions/mean_terminated_length": 265.9375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.3077727183699608, "epoch": 0.138073182028717, "frac_reward_zero_std": 0.0, "grad_norm": 0.0890994518995285, "kl": 0.009753985330462456, "learning_rate": 9.723946271421955e-07, "loss": -0.0513, "num_tokens": 81884604.0, "reward": 0.5567809343338013, "reward_std": 0.4464142918586731, "rewards/reward_func/mean": 0.5567809343338013, "rewards/reward_func/std": 0.4464143216609955, "step": 2981, "step_time": 28.96948353946209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 199.0, "completions/mean_terminated_length": 199.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.40940602868795395, "epoch": 0.1381194997684113, "frac_reward_zero_std": 1.0, "grad_norm": 0.00440339557826519, "kl": 0.0034140886273235083, "learning_rate": 9.723853635942567e-07, "loss": 0.0002, "num_tokens": 81916268.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 2982, "step_time": 22.532951060682535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 235.4375, "completions/mean_terminated_length": 235.4375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.2550181820988655, "epoch": 0.1381658175081056, "frac_reward_zero_std": 0.0, "grad_norm": 0.07482445240020752, "kl": 0.01346679124981165, "learning_rate": 9.723761000463176e-07, "loss": -0.0532, "num_tokens": 81947235.0, "reward": 0.6716771721839905, "reward_std": 0.26680293679237366, "rewards/reward_func/mean": 0.6716771721839905, "rewards/reward_func/std": 0.26680296659469604, "step": 2983, "step_time": 25.834476247429848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 167.4375, "completions/mean_terminated_length": 167.4375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.1424405835568905, "epoch": 0.1382121352477999, "frac_reward_zero_std": 1.0, "grad_norm": 0.00439409539103508, "kl": 0.0021929975191596895, "learning_rate": 9.723668364983787e-07, "loss": 0.0001, "num_tokens": 81978522.0, "reward": 0.9622687101364136, "reward_std": 0.0, "rewards/reward_func/mean": 0.9622687101364136, "rewards/reward_func/std": 0.0, "step": 2984, "step_time": 20.259456109255552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 159.5625, "completions/mean_terminated_length": 159.5625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.19097694009542465, "epoch": 0.1382584529874942, "frac_reward_zero_std": 0.0, "grad_norm": 0.10251673310995102, "kl": 0.005822888575494289, "learning_rate": 9.7235757295044e-07, "loss": -0.0069, "num_tokens": 82001011.0, "reward": 0.34434574842453003, "reward_std": 0.038803718984127045, "rewards/reward_func/mean": 0.34434574842453003, "rewards/reward_func/std": 0.038803718984127045, "step": 2985, "step_time": 16.958806682378054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 161.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.33455196022987366, "epoch": 0.13830477072718853, "frac_reward_zero_std": 1.0, "grad_norm": 0.0051012057811021805, "kl": 0.0035558182280510664, "learning_rate": 9.723483094025012e-07, "loss": 0.0002, "num_tokens": 82022895.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2986, "step_time": 16.516713060438633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 147.75, "completions/mean_terminated_length": 147.75, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.22480645030736923, "epoch": 0.13835108846688282, "frac_reward_zero_std": 1.0, "grad_norm": 0.010497340932488441, "kl": 0.006787444464862347, "learning_rate": 9.723390458545623e-07, "loss": 0.0004, "num_tokens": 82047019.0, "reward": 0.930604875087738, "reward_std": 0.0, "rewards/reward_func/mean": 0.930604875087738, "rewards/reward_func/std": 0.0, "step": 2987, "step_time": 18.981467500329018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.41501862555742264, "epoch": 0.13839740620657712, "frac_reward_zero_std": 1.0, "grad_norm": 0.006412906106561422, "kl": 0.0036485460004769266, "learning_rate": 9.723297823066234e-07, "loss": 0.0002, "num_tokens": 82069083.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2988, "step_time": 17.08159761875868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 179.375, "completions/mean_terminated_length": 179.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.1863102950155735, "epoch": 0.1384437239462714, "frac_reward_zero_std": 0.0, "grad_norm": 0.12076622247695923, "kl": 0.003980996320024133, "learning_rate": 9.723205187586845e-07, "loss": 0.0116, "num_tokens": 82091249.0, "reward": 0.9447779655456543, "reward_std": 0.01762891374528408, "rewards/reward_func/mean": 0.9447779655456543, "rewards/reward_func/std": 0.017628923058509827, "step": 2989, "step_time": 18.402724485844374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.625, "completions/mean_terminated_length": 124.625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.22548434510827065, "epoch": 0.13849004168596574, "frac_reward_zero_std": 1.0, "grad_norm": 0.00368937267921865, "kl": 0.0017834273457992822, "learning_rate": 9.723112552107457e-07, "loss": 0.0001, "num_tokens": 82110651.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2990, "step_time": 12.86932748556137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 217.3125, "completions/mean_terminated_length": 217.3125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.4416034296154976, "epoch": 0.13853635942566003, "frac_reward_zero_std": 0.0, "grad_norm": 0.11278366297483444, "kl": 0.006657724385149777, "learning_rate": 9.723019916628068e-07, "loss": -0.1936, "num_tokens": 82150832.0, "reward": 0.05823352932929993, "reward_std": 0.2329341173171997, "rewards/reward_func/mean": 0.05823352932929993, "rewards/reward_func/std": 0.2329341322183609, "step": 2991, "step_time": 34.99669820070267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 160.8125, "completions/mean_terminated_length": 160.8125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.18767217174172401, "epoch": 0.13858267716535433, "frac_reward_zero_std": 0.0, "grad_norm": 0.13702774047851562, "kl": 0.005850961548276246, "learning_rate": 9.72292728114868e-07, "loss": 0.0096, "num_tokens": 82172589.0, "reward": 0.5520051717758179, "reward_std": 0.014887908473610878, "rewards/reward_func/mean": 0.5520051717758179, "rewards/reward_func/std": 0.014887906610965729, "step": 2992, "step_time": 16.737485133111477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 131.1875, "completions/mean_terminated_length": 131.1875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.24596243724226952, "epoch": 0.13862899490504862, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016323782037943602, "kl": 0.0014825518592260778, "learning_rate": 9.72283464566929e-07, "loss": 0.0001, "num_tokens": 82192352.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2993, "step_time": 15.041008338332176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 134.125, "completions/mean_terminated_length": 134.125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.20328615233302116, "epoch": 0.13867531264474295, "frac_reward_zero_std": 1.0, "grad_norm": 0.00559780839830637, "kl": 0.00186509671038948, "learning_rate": 9.722742010189904e-07, "loss": 0.0001, "num_tokens": 82216498.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2994, "step_time": 15.585237976163626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 195.75, "completions/mean_terminated_length": 195.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.22220513597130775, "epoch": 0.13872163038443724, "frac_reward_zero_std": 0.0, "grad_norm": 0.1539018750190735, "kl": 0.005877788527868688, "learning_rate": 9.722649374710513e-07, "loss": 0.0223, "num_tokens": 82248190.0, "reward": 0.872715950012207, "reward_std": 0.23345592617988586, "rewards/reward_func/mean": 0.872715950012207, "rewards/reward_func/std": 0.23345592617988586, "step": 2995, "step_time": 23.43818784877658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 121.75, "completions/mean_terminated_length": 121.75, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2709006443619728, "epoch": 0.13876794812413154, "frac_reward_zero_std": 1.0, "grad_norm": 0.003966325893998146, "kl": 0.0022318846022244543, "learning_rate": 9.722556739231124e-07, "loss": 0.0001, "num_tokens": 82269994.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2996, "step_time": 13.280930988490582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 224.4375, "completions/mean_terminated_length": 224.4375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.21429625153541565, "epoch": 0.13881426586382584, "frac_reward_zero_std": 0.0, "grad_norm": 0.14449664950370789, "kl": 0.021888707764446735, "learning_rate": 9.722464103751735e-07, "loss": -0.086, "num_tokens": 82300545.0, "reward": 0.8610855340957642, "reward_std": 0.18172870576381683, "rewards/reward_func/mean": 0.8610855340957642, "rewards/reward_func/std": 0.18172870576381683, "step": 2997, "step_time": 27.55720454081893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 170.0625, "completions/mean_terminated_length": 170.0625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.17517578601837158, "epoch": 0.13886058360352016, "frac_reward_zero_std": 0.0, "grad_norm": 0.09581270813941956, "kl": 0.0014729919785168022, "learning_rate": 9.722371468272349e-07, "loss": 0.0187, "num_tokens": 82322834.0, "reward": 0.941566526889801, "reward_std": 0.03496573492884636, "rewards/reward_func/mean": 0.941566526889801, "rewards/reward_func/std": 0.03496573492884636, "step": 2998, "step_time": 18.05263601243496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 138.25, "completions/mean_terminated_length": 138.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3106953948736191, "epoch": 0.13890690134321446, "frac_reward_zero_std": 1.0, "grad_norm": 0.001961939036846161, "kl": 0.0017858156061265618, "learning_rate": 9.72227883279296e-07, "loss": 0.0001, "num_tokens": 82344886.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 2999, "step_time": 15.224582199007273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 168.3125, "completions/mean_terminated_length": 168.3125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.40341317653656006, "epoch": 0.13895321908290875, "frac_reward_zero_std": 1.0, "grad_norm": 0.005552309099584818, "kl": 0.004683843930251896, "learning_rate": 9.722186197313571e-07, "loss": 0.0002, "num_tokens": 82380395.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3000, "step_time": 21.262337151914835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 135.25, "completions/mean_terminated_length": 135.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3231126293540001, "epoch": 0.13899953682260305, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022516052704304457, "kl": 0.0018037364934571087, "learning_rate": 9.722093561834182e-07, "loss": 0.0001, "num_tokens": 82406431.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3001, "step_time": 15.317341301590204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 312.4375, "completions/mean_terminated_length": 312.4375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.16434377059340477, "epoch": 0.13904585456229737, "frac_reward_zero_std": 1.0, "grad_norm": 0.00212790141813457, "kl": 0.0020398667838890105, "learning_rate": 9.722000926354794e-07, "loss": 0.0001, "num_tokens": 82435622.0, "reward": 0.9813933372497559, "reward_std": 0.0, "rewards/reward_func/mean": 0.9813933372497559, "rewards/reward_func/std": 0.0, "step": 3002, "step_time": 28.26568039879203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 170.5, "completions/mean_terminated_length": 170.5, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.3353980928659439, "epoch": 0.13909217230199167, "frac_reward_zero_std": 1.0, "grad_norm": 0.005667801480740309, "kl": 0.0035694522666744888, "learning_rate": 9.721908290875405e-07, "loss": 0.0002, "num_tokens": 82458446.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3003, "step_time": 17.9529795832932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 219.375, "completions/mean_terminated_length": 219.375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.4704963266849518, "epoch": 0.13913849004168596, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027271786238998175, "kl": 0.0027305830735713243, "learning_rate": 9.721815655396016e-07, "loss": 0.0001, "num_tokens": 82485172.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3004, "step_time": 23.76654951274395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 212.25, "completions/mean_terminated_length": 212.25, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.24829314649105072, "epoch": 0.13918480778138026, "frac_reward_zero_std": 0.0, "grad_norm": 0.07245910912752151, "kl": 0.008992287330329418, "learning_rate": 9.721723019916627e-07, "loss": -0.0269, "num_tokens": 82509176.0, "reward": 0.5142320990562439, "reward_std": 0.07422799617052078, "rewards/reward_func/mean": 0.5142320990562439, "rewards/reward_func/std": 0.07422800362110138, "step": 3005, "step_time": 20.921358436346054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 166.9375, "completions/mean_terminated_length": 166.9375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.3856392726302147, "epoch": 0.13923112552107458, "frac_reward_zero_std": 1.0, "grad_norm": 0.004602343309670687, "kl": 0.002773736574454233, "learning_rate": 9.721630384437239e-07, "loss": 0.0001, "num_tokens": 82541223.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3006, "step_time": 18.90654380246997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 177.375, "completions/mean_terminated_length": 177.375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.3541441410779953, "epoch": 0.13927744326076888, "frac_reward_zero_std": 1.0, "grad_norm": 0.007559177000075579, "kl": 0.005332320695742965, "learning_rate": 9.72153774895785e-07, "loss": 0.0003, "num_tokens": 82563021.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3007, "step_time": 18.53321072459221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 158.125, "completions/mean_terminated_length": 158.125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.2771758511662483, "epoch": 0.13932376100046318, "frac_reward_zero_std": 1.0, "grad_norm": 0.003859475487843156, "kl": 0.0026886623236350715, "learning_rate": 9.721445113478461e-07, "loss": 0.0001, "num_tokens": 82583103.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3008, "step_time": 16.836896754801273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 176.6875, "completions/mean_terminated_length": 176.6875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.1914764828979969, "epoch": 0.13937007874015747, "frac_reward_zero_std": 1.0, "grad_norm": 0.002919860417023301, "kl": 0.0019065296510234475, "learning_rate": 9.721352477999072e-07, "loss": 0.0001, "num_tokens": 82615338.0, "reward": 0.9394130706787109, "reward_std": 0.0, "rewards/reward_func/mean": 0.9394130706787109, "rewards/reward_func/std": 0.0, "step": 3009, "step_time": 21.103497747331858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 112.625, "completions/mean_terminated_length": 112.625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.26232175529003143, "epoch": 0.1394163964798518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018449926283210516, "kl": 0.0015014406526461244, "learning_rate": 9.721259842519684e-07, "loss": 0.0001, "num_tokens": 82635012.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3010, "step_time": 12.229396902024746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 122.5625, "completions/mean_terminated_length": 122.5625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3080074191093445, "epoch": 0.1394627142195461, "frac_reward_zero_std": 1.0, "grad_norm": 0.005197662394493818, "kl": 0.002390563109656796, "learning_rate": 9.721167207040297e-07, "loss": 0.0001, "num_tokens": 82657101.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3011, "step_time": 13.777455564588308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 167.75, "completions/mean_terminated_length": 167.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.35617558658123016, "epoch": 0.1395090319592404, "frac_reward_zero_std": 1.0, "grad_norm": 0.011955607682466507, "kl": 0.008283481933176517, "learning_rate": 9.721074571560908e-07, "loss": 0.0004, "num_tokens": 82677737.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3012, "step_time": 17.563011325895786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2488524466753006, "epoch": 0.13955534969893468, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027501913718879223, "kl": 0.0018833787471521646, "learning_rate": 9.72098193608152e-07, "loss": 0.0001, "num_tokens": 82698519.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3013, "step_time": 15.960894122719765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 178.4375, "completions/mean_terminated_length": 178.4375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.25607532262802124, "epoch": 0.139601667438629, "frac_reward_zero_std": 0.0, "grad_norm": 0.07284007221460342, "kl": 0.0030666259699501097, "learning_rate": 9.720889300602129e-07, "loss": -0.0101, "num_tokens": 82723598.0, "reward": 0.14382196962833405, "reward_std": 0.002884342335164547, "rewards/reward_func/mean": 0.14382196962833405, "rewards/reward_func/std": 0.0028843434993177652, "step": 3014, "step_time": 18.82703233882785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 170.9375, "completions/mean_terminated_length": 170.9375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3732554018497467, "epoch": 0.1396479851783233, "frac_reward_zero_std": 1.0, "grad_norm": 0.001959107117727399, "kl": 0.0018909156206063926, "learning_rate": 9.720796665122742e-07, "loss": 0.0001, "num_tokens": 82759949.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3015, "step_time": 20.609770573675632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 126.0, "completions/mean_terminated_length": 126.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2312699593603611, "epoch": 0.1396943029180176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015609496040269732, "kl": 0.0013426737277768552, "learning_rate": 9.720704029643353e-07, "loss": 0.0001, "num_tokens": 82781757.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3016, "step_time": 14.741418339312077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 148.5625, "completions/mean_terminated_length": 148.5625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.29120461642742157, "epoch": 0.1397406206577119, "frac_reward_zero_std": 1.0, "grad_norm": 0.008427511900663376, "kl": 0.0023643068270757794, "learning_rate": 9.720611394163965e-07, "loss": 0.0001, "num_tokens": 82803526.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3017, "step_time": 15.820613522082567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 191.3125, "completions/mean_terminated_length": 191.3125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.30200475454330444, "epoch": 0.13978693839740622, "frac_reward_zero_std": 1.0, "grad_norm": 0.004149302840232849, "kl": 0.003819424891844392, "learning_rate": 9.720518758684576e-07, "loss": 0.0002, "num_tokens": 82830923.0, "reward": 0.2167416214942932, "reward_std": 0.0, "rewards/reward_func/mean": 0.2167416214942932, "rewards/reward_func/std": 0.0, "step": 3018, "step_time": 20.017596885561943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 145.0, "completions/mean_terminated_length": 145.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3494086414575577, "epoch": 0.13983325613710051, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022782550659030676, "kl": 0.001864958874648437, "learning_rate": 9.720426123205187e-07, "loss": 0.0001, "num_tokens": 82857579.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3019, "step_time": 16.663807447999716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 191.8125, "completions/mean_terminated_length": 191.8125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.35616112500429153, "epoch": 0.1398795738767948, "frac_reward_zero_std": 0.0, "grad_norm": 0.08675806224346161, "kl": 0.0035336354630999267, "learning_rate": 9.720333487725798e-07, "loss": -0.0007, "num_tokens": 82887640.0, "reward": 0.8100361824035645, "reward_std": 0.31683072447776794, "rewards/reward_func/mean": 0.8100361824035645, "rewards/reward_func/std": 0.31683069467544556, "step": 3020, "step_time": 23.310782480984926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.4156198650598526, "epoch": 0.1399258916164891, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019909366965293884, "kl": 0.002192864805692807, "learning_rate": 9.72024085224641e-07, "loss": 0.0001, "num_tokens": 82942036.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3021, "step_time": 26.098113026469946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 116.875, "completions/mean_terminated_length": 116.875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2946057990193367, "epoch": 0.13997220935618343, "frac_reward_zero_std": 1.0, "grad_norm": 0.006412202958017588, "kl": 0.0036119677824899554, "learning_rate": 9.72014821676702e-07, "loss": 0.0002, "num_tokens": 82963410.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3022, "step_time": 13.971223030239344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 153.75, "completions/mean_terminated_length": 153.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.2157832831144333, "epoch": 0.14001852709587773, "frac_reward_zero_std": 1.0, "grad_norm": 0.002061971463263035, "kl": 0.0014133761869743466, "learning_rate": 9.720055581287632e-07, "loss": 0.0001, "num_tokens": 82984206.0, "reward": 0.26742759346961975, "reward_std": 0.0, "rewards/reward_func/mean": 0.26742759346961975, "rewards/reward_func/std": 0.0, "step": 3023, "step_time": 15.735854860395193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 197.1875, "completions/mean_terminated_length": 197.1875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.4383648708462715, "epoch": 0.14006484483557202, "frac_reward_zero_std": 1.0, "grad_norm": 0.005425654351711273, "kl": 0.003583900397643447, "learning_rate": 9.719962945808243e-07, "loss": 0.0002, "num_tokens": 83018753.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3024, "step_time": 23.91798797994852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 168.25, "completions/mean_terminated_length": 168.25, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.39416050910949707, "epoch": 0.14011116257526632, "frac_reward_zero_std": 1.0, "grad_norm": 0.002842653775587678, "kl": 0.0023979249817784876, "learning_rate": 9.719870310328857e-07, "loss": 0.0001, "num_tokens": 83052597.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3025, "step_time": 19.870104629546404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 220.6875, "completions/mean_terminated_length": 220.6875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.21130745857954025, "epoch": 0.14015748031496064, "frac_reward_zero_std": 0.0, "grad_norm": 0.11111432313919067, "kl": 0.0033099165884777904, "learning_rate": 9.719777674849466e-07, "loss": 0.0688, "num_tokens": 83075936.0, "reward": 0.9771252870559692, "reward_std": 0.08604231476783752, "rewards/reward_func/mean": 0.9771252870559692, "rewards/reward_func/std": 0.08604232966899872, "step": 3026, "step_time": 24.006690483540297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 146.0, "completions/mean_terminated_length": 146.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.15849712491035461, "epoch": 0.14020379805465494, "frac_reward_zero_std": 1.0, "grad_norm": 0.053610339760780334, "kl": 0.0011202768073417246, "learning_rate": 9.719685039370077e-07, "loss": 0.0001, "num_tokens": 83112480.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3027, "step_time": 18.192413430660963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 184.875, "completions/mean_terminated_length": 184.875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.44311244040727615, "epoch": 0.14025011579434923, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037516490556299686, "kl": 0.00294655334437266, "learning_rate": 9.71959240389069e-07, "loss": 0.0001, "num_tokens": 83136014.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3028, "step_time": 20.209473069757223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 261.375, "completions/mean_terminated_length": 261.375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.5869688093662262, "epoch": 0.14029643353404353, "frac_reward_zero_std": 0.0, "grad_norm": 0.10678769648075104, "kl": 0.003234754258301109, "learning_rate": 9.719499768411302e-07, "loss": 0.0309, "num_tokens": 83166564.0, "reward": 0.375, "reward_std": 0.5, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5, "step": 3029, "step_time": 30.887698356062174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 131.8125, "completions/mean_terminated_length": 131.8125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3739954084157944, "epoch": 0.14034275127373785, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022352435626089573, "kl": 0.0020131257479079068, "learning_rate": 9.719407132931913e-07, "loss": 0.0001, "num_tokens": 83193073.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3030, "step_time": 15.910360716283321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 119.875, "completions/mean_terminated_length": 119.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.27550504356622696, "epoch": 0.14038906901343215, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017742261989042163, "kl": 0.0014804693055339158, "learning_rate": 9.719314497452524e-07, "loss": 0.0001, "num_tokens": 83213823.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3031, "step_time": 13.649042718112469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 148.4375, "completions/mean_terminated_length": 148.4375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.4004627615213394, "epoch": 0.14043538675312645, "frac_reward_zero_std": 1.0, "grad_norm": 0.002752473810687661, "kl": 0.002842024026904255, "learning_rate": 9.719221861973135e-07, "loss": 0.0001, "num_tokens": 83266678.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3032, "step_time": 23.976348515599966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 180.75, "completions/mean_terminated_length": 180.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.2226003035902977, "epoch": 0.14048170449282074, "frac_reward_zero_std": 0.0, "grad_norm": 0.23777858912944794, "kl": 0.022119770292192698, "learning_rate": 9.719129226493747e-07, "loss": 0.0544, "num_tokens": 83289586.0, "reward": 0.8626243472099304, "reward_std": 0.2472572922706604, "rewards/reward_func/mean": 0.8626243472099304, "rewards/reward_func/std": 0.2472572922706604, "step": 3033, "step_time": 20.435957849025726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 141.5, "completions/mean_terminated_length": 141.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.30680377781391144, "epoch": 0.14052802223251507, "frac_reward_zero_std": 1.0, "grad_norm": 0.008545816875994205, "kl": 0.0029445297259371728, "learning_rate": 9.719036591014358e-07, "loss": 0.0001, "num_tokens": 83318282.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3034, "step_time": 16.973541107028723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 164.9375, "completions/mean_terminated_length": 164.9375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3295881971716881, "epoch": 0.14057433997220936, "frac_reward_zero_std": 1.0, "grad_norm": 0.017239896580576897, "kl": 0.010237524285912514, "learning_rate": 9.71894395553497e-07, "loss": 0.0005, "num_tokens": 83340489.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3035, "step_time": 17.23481447249651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 178.5625, "completions/mean_terminated_length": 178.5625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.35540928691625595, "epoch": 0.14062065771190366, "frac_reward_zero_std": 1.0, "grad_norm": 0.006867921911180019, "kl": 0.004615910118445754, "learning_rate": 9.71885132005558e-07, "loss": 0.0002, "num_tokens": 83372994.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3036, "step_time": 21.024396255612373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 183.625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.172479510307312, "epoch": 0.14066697545159795, "frac_reward_zero_std": 1.0, "grad_norm": 0.004548189230263233, "kl": 0.004860262502916157, "learning_rate": 9.718758684576192e-07, "loss": 0.0002, "num_tokens": 83398716.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3037, "step_time": 19.449104487895966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 197.375, "completions/mean_terminated_length": 197.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.3322830870747566, "epoch": 0.14071329319129228, "frac_reward_zero_std": 1.0, "grad_norm": 0.003146246075630188, "kl": 0.002210581151302904, "learning_rate": 9.718666049096803e-07, "loss": 0.0001, "num_tokens": 83426546.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 3038, "step_time": 21.201699301600456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 137.875, "completions/mean_terminated_length": 137.875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.25694333761930466, "epoch": 0.14075961093098657, "frac_reward_zero_std": 1.0, "grad_norm": 0.008664743974804878, "kl": 0.0029743796912953258, "learning_rate": 9.718573413617414e-07, "loss": 0.0001, "num_tokens": 83446128.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3039, "step_time": 15.962315011769533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 186.4375, "completions/mean_terminated_length": 186.4375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.38639630377292633, "epoch": 0.14080592867068087, "frac_reward_zero_std": 0.0, "grad_norm": 0.14186641573905945, "kl": 0.005497544887475669, "learning_rate": 9.718480778138025e-07, "loss": 0.0188, "num_tokens": 83482887.0, "reward": 0.375, "reward_std": 0.5, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5, "step": 3040, "step_time": 22.51087235286832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 120.8125, "completions/mean_terminated_length": 120.8125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.22201518341898918, "epoch": 0.14085224641037516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010322537273168564, "kl": 0.0010378068109275773, "learning_rate": 9.718388142658639e-07, "loss": 0.0001, "num_tokens": 83507380.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3041, "step_time": 14.335064977407455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 165.0625, "completions/mean_terminated_length": 165.0625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.20901744812726974, "epoch": 0.1408985641500695, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010990627342835069, "kl": 0.0010493967711227015, "learning_rate": 9.71829550717925e-07, "loss": 0.0001, "num_tokens": 83553189.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3042, "step_time": 23.672550708055496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 195.375, "completions/mean_terminated_length": 195.375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.26086245849728584, "epoch": 0.14094488188976378, "frac_reward_zero_std": 0.0, "grad_norm": 0.10829035937786102, "kl": 0.002545473660575226, "learning_rate": 9.718202871699861e-07, "loss": -0.0074, "num_tokens": 83589339.0, "reward": 0.8403646945953369, "reward_std": 0.14294101297855377, "rewards/reward_func/mean": 0.8403646945953369, "rewards/reward_func/std": 0.14294102787971497, "step": 3043, "step_time": 22.47973906993866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 207.875, "completions/mean_terminated_length": 207.875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.24207553267478943, "epoch": 0.14099119962945808, "frac_reward_zero_std": 0.0, "grad_norm": 0.09628209471702576, "kl": 0.010839248541742563, "learning_rate": 9.718110236220473e-07, "loss": -0.0258, "num_tokens": 83618537.0, "reward": 0.8728713393211365, "reward_std": 0.23276567459106445, "rewards/reward_func/mean": 0.8728713393211365, "rewards/reward_func/std": 0.23276568949222565, "step": 3044, "step_time": 22.380034614354372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 115.625, "completions/mean_terminated_length": 115.625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.24875660985708237, "epoch": 0.14103751736915238, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031943044159561396, "kl": 0.0018857441318687052, "learning_rate": 9.718017600741084e-07, "loss": 0.0001, "num_tokens": 83638483.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3045, "step_time": 13.065401766449213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 193.5, "completions/mean_terminated_length": 193.5, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.32689883559942245, "epoch": 0.1410838351088467, "frac_reward_zero_std": 0.0, "grad_norm": 0.11655188351869583, "kl": 0.008323002490215003, "learning_rate": 9.717924965261695e-07, "loss": -0.0041, "num_tokens": 83661307.0, "reward": 0.829888105392456, "reward_std": 0.2213035225868225, "rewards/reward_func/mean": 0.829888105392456, "rewards/reward_func/std": 0.22130350768566132, "step": 3046, "step_time": 21.887813713401556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 201.8125, "completions/mean_terminated_length": 201.8125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.4084246978163719, "epoch": 0.141130152848541, "frac_reward_zero_std": 1.0, "grad_norm": 0.004622376523911953, "kl": 0.003465540474280715, "learning_rate": 9.717832329782306e-07, "loss": 0.0002, "num_tokens": 83689064.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3047, "step_time": 22.28806211799383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 195.625, "completions/mean_terminated_length": 195.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.191276665776968, "epoch": 0.1411764705882353, "frac_reward_zero_std": 0.0, "grad_norm": 0.10191956162452698, "kl": 0.0030786641291342676, "learning_rate": 9.717739694302918e-07, "loss": -0.0201, "num_tokens": 83723202.0, "reward": 0.16149914264678955, "reward_std": 0.12526051700115204, "rewards/reward_func/mean": 0.16149914264678955, "rewards/reward_func/std": 0.12526051700115204, "step": 3048, "step_time": 23.00204337015748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 125.0625, "completions/mean_terminated_length": 125.0625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.28970544785261154, "epoch": 0.1412227883279296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031022813636809587, "kl": 0.0023523358104284853, "learning_rate": 9.717647058823529e-07, "loss": 0.0001, "num_tokens": 83742963.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3049, "step_time": 14.628591068089008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 129.1875, "completions/mean_terminated_length": 129.1875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.23548217117786407, "epoch": 0.1412691060676239, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013360042357817292, "kl": 0.0010569237201707438, "learning_rate": 9.71755442334414e-07, "loss": 0.0001, "num_tokens": 83770214.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3050, "step_time": 15.252948425710201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 161.4375, "completions/mean_terminated_length": 161.4375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.44899706542491913, "epoch": 0.1413154238073182, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018619222100824118, "kl": 0.0023352773278020322, "learning_rate": 9.717461787864751e-07, "loss": 0.0001, "num_tokens": 83825725.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3051, "step_time": 25.797151293605566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 141.0, "completions/mean_terminated_length": 141.0, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.3013594150543213, "epoch": 0.1413617415470125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021030872594565153, "kl": 0.001682311820331961, "learning_rate": 9.717369152385363e-07, "loss": 0.0001, "num_tokens": 83847101.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3052, "step_time": 16.435410499572754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 125.5625, "completions/mean_terminated_length": 125.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2900470197200775, "epoch": 0.1414080592867068, "frac_reward_zero_std": 1.0, "grad_norm": 0.014194217510521412, "kl": 0.0037966840609442443, "learning_rate": 9.717276516905974e-07, "loss": 0.0002, "num_tokens": 83867606.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3053, "step_time": 14.13415927067399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 295.5, "completions/mean_terminated_length": 295.5, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.222489595413208, "epoch": 0.14145437702640112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033659781329333782, "kl": 0.002463035110849887, "learning_rate": 9.717183881426585e-07, "loss": 0.0001, "num_tokens": 83908078.0, "reward": 0.8346666097640991, "reward_std": 0.0, "rewards/reward_func/mean": 0.8346666097640991, "rewards/reward_func/std": 0.0, "step": 3054, "step_time": 31.249343916773796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 191.75, "completions/mean_terminated_length": 191.75, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.21848269924521446, "epoch": 0.14150069476609542, "frac_reward_zero_std": 0.0, "grad_norm": 0.11221415549516678, "kl": 0.0034620476653799415, "learning_rate": 9.717091245947198e-07, "loss": -0.0784, "num_tokens": 83943114.0, "reward": 0.34633660316467285, "reward_std": 0.014082971960306168, "rewards/reward_func/mean": 0.34633660316467285, "rewards/reward_func/std": 0.014082977548241615, "step": 3055, "step_time": 23.131091088056564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 136.0, "completions/mean_terminated_length": 136.0, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2509133219718933, "epoch": 0.14154701250578972, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017673317342996597, "kl": 0.0015014345990493894, "learning_rate": 9.71699861046781e-07, "loss": 0.0001, "num_tokens": 83964682.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3056, "step_time": 14.708979427814484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 181.9375, "completions/mean_terminated_length": 181.9375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.22735528647899628, "epoch": 0.141593330245484, "frac_reward_zero_std": 0.0, "grad_norm": 0.13249163329601288, "kl": 0.003153125464450568, "learning_rate": 9.716905974988419e-07, "loss": 0.0244, "num_tokens": 84017945.0, "reward": 0.9714365601539612, "reward_std": 0.02950017899274826, "rewards/reward_func/mean": 0.9714365601539612, "rewards/reward_func/std": 0.029500195756554604, "step": 3057, "step_time": 28.813837237656116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.2536050006747246, "epoch": 0.14163964798517834, "frac_reward_zero_std": 1.0, "grad_norm": 0.004969343543052673, "kl": 0.003967517986893654, "learning_rate": 9.716813339509032e-07, "loss": 0.0002, "num_tokens": 84038057.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3058, "step_time": 14.99054791033268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 172.5, "completions/mean_terminated_length": 172.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.4103945419192314, "epoch": 0.14168596572487263, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022730007767677307, "kl": 0.002340605657082051, "learning_rate": 9.716720704029643e-07, "loss": 0.0001, "num_tokens": 84070049.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3059, "step_time": 20.31228854879737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 201.0625, "completions/mean_terminated_length": 201.0625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.2669229544699192, "epoch": 0.14173228346456693, "frac_reward_zero_std": 1.0, "grad_norm": 0.00418067304417491, "kl": 0.003562486555892974, "learning_rate": 9.716628068550255e-07, "loss": 0.0002, "num_tokens": 84093106.0, "reward": 0.9672160744667053, "reward_std": 0.0, "rewards/reward_func/mean": 0.9672160744667053, "rewards/reward_func/std": 0.0, "step": 3060, "step_time": 22.047264583408833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 155.1875, "completions/mean_terminated_length": 155.1875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.36747582256793976, "epoch": 0.14177860120426122, "frac_reward_zero_std": 1.0, "grad_norm": 0.002079027472063899, "kl": 0.002176420297473669, "learning_rate": 9.716535433070866e-07, "loss": 0.0001, "num_tokens": 84152261.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3061, "step_time": 26.792435012757778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 249.125, "completions/mean_terminated_length": 249.125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.2686142325401306, "epoch": 0.14182491894395555, "frac_reward_zero_std": 0.0, "grad_norm": 0.10157947987318039, "kl": 0.0017788418917916715, "learning_rate": 9.716442797591477e-07, "loss": -0.0459, "num_tokens": 84191575.0, "reward": 0.12028113752603531, "reward_std": 0.07132308930158615, "rewards/reward_func/mean": 0.12028113752603531, "rewards/reward_func/std": 0.07132309675216675, "step": 3062, "step_time": 29.01181998103857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 164.875, "completions/mean_terminated_length": 164.875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.22781230881810188, "epoch": 0.14187123668364984, "frac_reward_zero_std": 0.0, "grad_norm": 0.10221483558416367, "kl": 0.0018892473890446126, "learning_rate": 9.716350162112088e-07, "loss": 0.0533, "num_tokens": 84217301.0, "reward": 0.699444055557251, "reward_std": 0.07328739017248154, "rewards/reward_func/mean": 0.699444055557251, "rewards/reward_func/std": 0.07328739762306213, "step": 3063, "step_time": 19.34302917867899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 173.125, "completions/mean_terminated_length": 173.125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.42125069350004196, "epoch": 0.14191755442334414, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018473091768100858, "kl": 0.001721715903840959, "learning_rate": 9.7162575266327e-07, "loss": 0.0001, "num_tokens": 84250935.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3064, "step_time": 20.535016171634197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 198.0625, "completions/mean_terminated_length": 198.0625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.3379729837179184, "epoch": 0.14196387216303843, "frac_reward_zero_std": 0.0, "grad_norm": 0.10586327314376831, "kl": 0.0037930503604002297, "learning_rate": 9.71616489115331e-07, "loss": -0.0289, "num_tokens": 84288568.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.44721361994743347, "step": 3065, "step_time": 24.85549681261182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 151.125, "completions/mean_terminated_length": 151.125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.4226868078112602, "epoch": 0.14201018990273276, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013798902509734035, "kl": 0.00155748357065022, "learning_rate": 9.716072255673922e-07, "loss": 0.0001, "num_tokens": 84323210.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3066, "step_time": 19.625557269901037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 140.9375, "completions/mean_terminated_length": 140.9375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.34594182670116425, "epoch": 0.14205650764242705, "frac_reward_zero_std": 1.0, "grad_norm": 0.011119349859654903, "kl": 0.00644524663221091, "learning_rate": 9.715979620194533e-07, "loss": 0.0003, "num_tokens": 84343801.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3067, "step_time": 14.647478591650724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 155.125, "completions/mean_terminated_length": 155.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.39974554628133774, "epoch": 0.14210282538212135, "frac_reward_zero_std": 1.0, "grad_norm": 0.002050853567197919, "kl": 0.0018190699338447303, "learning_rate": 9.715886984715147e-07, "loss": 0.0001, "num_tokens": 84377019.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3068, "step_time": 20.185601744800806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 132.6875, "completions/mean_terminated_length": 132.6875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.27893398702144623, "epoch": 0.14214914312181565, "frac_reward_zero_std": 0.0, "grad_norm": 0.19752110540866852, "kl": 0.005182704247999936, "learning_rate": 9.715794349235756e-07, "loss": -0.1375, "num_tokens": 84397590.0, "reward": 0.1875, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.1875, "rewards/reward_func/std": 0.40311288833618164, "step": 3069, "step_time": 16.552004102617502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 167.1875, "completions/mean_terminated_length": 167.1875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.14356551691889763, "epoch": 0.14219546086150997, "frac_reward_zero_std": 1.0, "grad_norm": 0.001284717465750873, "kl": 0.000879270868608728, "learning_rate": 9.715701713756367e-07, "loss": 0.0, "num_tokens": 84426473.0, "reward": 0.904837429523468, "reward_std": 0.0, "rewards/reward_func/mean": 0.904837429523468, "rewards/reward_func/std": 0.0, "step": 3070, "step_time": 18.244757778942585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 157.5625, "completions/mean_terminated_length": 157.5625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.4129192754626274, "epoch": 0.14224177860120427, "frac_reward_zero_std": 1.0, "grad_norm": 0.00182828470133245, "kl": 0.002239855588413775, "learning_rate": 9.71560907827698e-07, "loss": 0.0001, "num_tokens": 84477298.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3071, "step_time": 23.213310711085796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 246.0, "completions/mean_terminated_length": 246.0, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.4551624208688736, "epoch": 0.14228809634089856, "frac_reward_zero_std": 0.0, "grad_norm": 0.07563548535108566, "kl": 0.002163757977541536, "learning_rate": 9.715516442797592e-07, "loss": 0.0621, "num_tokens": 84506498.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 3072, "step_time": 26.718425411731005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 167.0, "completions/mean_terminated_length": 167.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.27576783671975136, "epoch": 0.14233441408059286, "frac_reward_zero_std": 0.0, "grad_norm": 0.09983588755130768, "kl": 0.001499654186773114, "learning_rate": 9.715423807318203e-07, "loss": 0.0869, "num_tokens": 84528866.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 3073, "step_time": 21.365447714924812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 173.3125, "completions/mean_terminated_length": 173.3125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.1853836365044117, "epoch": 0.14238073182028718, "frac_reward_zero_std": 1.0, "grad_norm": 0.004038714803755283, "kl": 0.0018410694028716534, "learning_rate": 9.715331171838814e-07, "loss": 0.0001, "num_tokens": 84567847.0, "reward": 0.8507331609725952, "reward_std": 0.0, "rewards/reward_func/mean": 0.8507331609725952, "rewards/reward_func/std": 0.0, "step": 3074, "step_time": 22.511045575141907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 129.75, "completions/mean_terminated_length": 129.75, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.35022301971912384, "epoch": 0.14242704955998148, "frac_reward_zero_std": 1.0, "grad_norm": 0.002216340508311987, "kl": 0.0017985946033149958, "learning_rate": 9.715238536359426e-07, "loss": 0.0001, "num_tokens": 84592435.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3075, "step_time": 14.913249608129263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 164.875, "completions/mean_terminated_length": 164.875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.2580137923359871, "epoch": 0.14247336729967577, "frac_reward_zero_std": 0.0, "grad_norm": 0.11763420701026917, "kl": 0.004678957397118211, "learning_rate": 9.715145900880037e-07, "loss": -0.0676, "num_tokens": 84612897.0, "reward": 0.7995544672012329, "reward_std": 0.051335595548152924, "rewards/reward_func/mean": 0.7995544672012329, "rewards/reward_func/std": 0.05133558437228203, "step": 3076, "step_time": 19.429770436137915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 135.0, "completions/mean_terminated_length": 135.0, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2920405864715576, "epoch": 0.14251968503937007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034133740700781345, "kl": 0.0020403833768796176, "learning_rate": 9.715053265400648e-07, "loss": 0.0001, "num_tokens": 84649185.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3077, "step_time": 18.52658887952566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 125.5625, "completions/mean_terminated_length": 125.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.28933966904878616, "epoch": 0.1425660027790644, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011805701069533825, "kl": 0.0012496154668042436, "learning_rate": 9.71496062992126e-07, "loss": 0.0001, "num_tokens": 84683962.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3078, "step_time": 17.53727101162076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 200.6875, "completions/mean_terminated_length": 200.6875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.46959687769412994, "epoch": 0.1426123205187587, "frac_reward_zero_std": 1.0, "grad_norm": 0.004442025441676378, "kl": 0.0035442839143797755, "learning_rate": 9.71486799444187e-07, "loss": 0.0002, "num_tokens": 84708981.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3079, "step_time": 21.380993772298098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 143.875, "completions/mean_terminated_length": 143.875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3721555396914482, "epoch": 0.14265863825845299, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020447385031729937, "kl": 0.001785428379662335, "learning_rate": 9.714775358962482e-07, "loss": 0.0001, "num_tokens": 84733059.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3080, "step_time": 16.2989693954587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 153.3125, "completions/mean_terminated_length": 153.3125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.1326531060039997, "epoch": 0.14270495599814728, "frac_reward_zero_std": 0.0, "grad_norm": 0.12060035765171051, "kl": 0.0008723430219106376, "learning_rate": 9.714682723483093e-07, "loss": -0.0011, "num_tokens": 84767000.0, "reward": 0.9655313491821289, "reward_std": 0.03559904173016548, "rewards/reward_func/mean": 0.9655313491821289, "rewards/reward_func/std": 0.03559904173016548, "step": 3081, "step_time": 18.66436466574669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 151.8125, "completions/mean_terminated_length": 151.8125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.18006207048892975, "epoch": 0.1427512737378416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012774410424754024, "kl": 0.0011981050483882427, "learning_rate": 9.714590088003704e-07, "loss": 0.0001, "num_tokens": 84787637.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 3082, "step_time": 15.84832838922739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 179.5625, "completions/mean_terminated_length": 179.5625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.37587346881628036, "epoch": 0.1427975914775359, "frac_reward_zero_std": 0.0, "grad_norm": 0.16464351117610931, "kl": 0.011520389234647155, "learning_rate": 9.714497452524315e-07, "loss": -0.0793, "num_tokens": 84809838.0, "reward": 0.3690481185913086, "reward_std": 0.49216657876968384, "rewards/reward_func/mean": 0.3690481185913086, "rewards/reward_func/std": 0.49216654896736145, "step": 3083, "step_time": 19.75730662792921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 170.25, "completions/mean_terminated_length": 170.25, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4154827445745468, "epoch": 0.1428439092172302, "frac_reward_zero_std": 1.0, "grad_norm": 0.00961045641452074, "kl": 0.007004943443462253, "learning_rate": 9.714404817044927e-07, "loss": 0.0003, "num_tokens": 84831442.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3084, "step_time": 17.125869277864695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 175.875, "completions/mean_terminated_length": 175.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.2482575811445713, "epoch": 0.1428902269569245, "frac_reward_zero_std": 0.0, "grad_norm": 0.09998895227909088, "kl": 0.005964857351500541, "learning_rate": 9.71431218156554e-07, "loss": 0.0462, "num_tokens": 84852048.0, "reward": 0.7368773818016052, "reward_std": 0.3585629165172577, "rewards/reward_func/mean": 0.7368773818016052, "rewards/reward_func/std": 0.3585629463195801, "step": 3085, "step_time": 18.564539909362793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 220.9375, "completions/mean_terminated_length": 220.9375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.38869137316942215, "epoch": 0.14293654469661882, "frac_reward_zero_std": 0.0, "grad_norm": 0.32799142599105835, "kl": 0.0051233284175395966, "learning_rate": 9.714219546086151e-07, "loss": 0.0081, "num_tokens": 84882895.0, "reward": 0.007592645939439535, "reward_std": 0.03037058375775814, "rewards/reward_func/mean": 0.007592645939439535, "rewards/reward_func/std": 0.03037058375775814, "step": 3086, "step_time": 24.329820852726698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 136.5, "completions/mean_terminated_length": 136.5, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2955872640013695, "epoch": 0.1429828624363131, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034672317560762167, "kl": 0.002140417287591845, "learning_rate": 9.714126910606763e-07, "loss": 0.0001, "num_tokens": 84904311.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3087, "step_time": 15.493290606886148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 128.5, "completions/mean_terminated_length": 128.5, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.35423120111227036, "epoch": 0.1430291801760074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031535944435745478, "kl": 0.0023835660540498793, "learning_rate": 9.714034275127374e-07, "loss": 0.0001, "num_tokens": 84927423.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3088, "step_time": 15.894897159188986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.17939773947000504, "epoch": 0.1430754979157017, "frac_reward_zero_std": 0.0, "grad_norm": 0.07509777694940567, "kl": 0.002948031004052609, "learning_rate": 9.713941639647985e-07, "loss": -0.0051, "num_tokens": 84973399.0, "reward": 0.9079843759536743, "reward_std": 0.0359191857278347, "rewards/reward_func/mean": 0.9079843759536743, "rewards/reward_func/std": 0.035919204354286194, "step": 3089, "step_time": 23.080620639026165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 242.5, "completions/mean_terminated_length": 242.5, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.38066281378269196, "epoch": 0.14312181565539603, "frac_reward_zero_std": 0.0, "grad_norm": 0.09729088097810745, "kl": 0.004983256570994854, "learning_rate": 9.713849004168596e-07, "loss": -0.0478, "num_tokens": 85011839.0, "reward": 0.13014303147792816, "reward_std": 0.14395686984062195, "rewards/reward_func/mean": 0.13014303147792816, "rewards/reward_func/std": 0.14395686984062195, "step": 3090, "step_time": 27.270726181566715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 184.125, "completions/mean_terminated_length": 184.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.3863908275961876, "epoch": 0.14316813339509032, "frac_reward_zero_std": 1.0, "grad_norm": 0.006155778653919697, "kl": 0.0033418061793781817, "learning_rate": 9.713756368689208e-07, "loss": 0.0002, "num_tokens": 85048849.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3091, "step_time": 23.163608994334936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 207.125, "completions/mean_terminated_length": 207.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.2691080644726753, "epoch": 0.14321445113478462, "frac_reward_zero_std": 1.0, "grad_norm": 0.003819690551608801, "kl": 0.003386061522178352, "learning_rate": 9.713663733209819e-07, "loss": 0.0002, "num_tokens": 85087027.0, "reward": 0.23965103924274445, "reward_std": 0.0, "rewards/reward_func/mean": 0.23965103924274445, "rewards/reward_func/std": 0.0, "step": 3092, "step_time": 24.239079508930445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 197.0625, "completions/mean_terminated_length": 197.0625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.3097037822008133, "epoch": 0.14326076887447892, "frac_reward_zero_std": 0.0, "grad_norm": 0.09709495306015015, "kl": 0.0023074370983522385, "learning_rate": 9.71357109773043e-07, "loss": -0.0284, "num_tokens": 85119972.0, "reward": 0.8820197582244873, "reward_std": 0.23685505986213684, "rewards/reward_func/mean": 0.8820197582244873, "rewards/reward_func/std": 0.23685505986213684, "step": 3093, "step_time": 22.47984228283167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 184.375, "completions/mean_terminated_length": 184.375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.17485283315181732, "epoch": 0.14330708661417324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019301557913422585, "kl": 0.0016929760458879173, "learning_rate": 9.713478462251041e-07, "loss": 0.0001, "num_tokens": 85157402.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3094, "step_time": 21.515938695520163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 157.8125, "completions/mean_terminated_length": 157.8125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.15712221339344978, "epoch": 0.14335340435386754, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015463732415810227, "kl": 0.0010701666760724038, "learning_rate": 9.713385826771653e-07, "loss": 0.0001, "num_tokens": 85178679.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 3095, "step_time": 15.812082946300507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 216.8125, "completions/mean_terminated_length": 216.8125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.4117165356874466, "epoch": 0.14339972209356183, "frac_reward_zero_std": 0.0, "grad_norm": 0.10295020788908005, "kl": 0.006531993625685573, "learning_rate": 9.713293191292264e-07, "loss": -0.052, "num_tokens": 85208132.0, "reward": 0.43998464941978455, "reward_std": 0.262356162071228, "rewards/reward_func/mean": 0.43998464941978455, "rewards/reward_func/std": 0.2623561918735504, "step": 3096, "step_time": 26.689372658729553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 322.1875, "completions/mean_terminated_length": 322.1875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.21018989011645317, "epoch": 0.14344603983325613, "frac_reward_zero_std": 0.0, "grad_norm": 0.07340876758098602, "kl": 0.006146425555925816, "learning_rate": 9.713200555812875e-07, "loss": -0.1871, "num_tokens": 85242695.0, "reward": 0.6335451006889343, "reward_std": 0.3770582377910614, "rewards/reward_func/mean": 0.6335451006889343, "rewards/reward_func/std": 0.3770582675933838, "step": 3097, "step_time": 34.40815368667245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 145.4375, "completions/mean_terminated_length": 145.4375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.18927786126732826, "epoch": 0.14349235757295045, "frac_reward_zero_std": 1.0, "grad_norm": 0.002579091116786003, "kl": 0.0013895122101530433, "learning_rate": 9.713107920333488e-07, "loss": 0.0001, "num_tokens": 85267214.0, "reward": 0.9131007194519043, "reward_std": 0.0, "rewards/reward_func/mean": 0.9131007194519043, "rewards/reward_func/std": 0.0, "step": 3098, "step_time": 16.404458358883858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 170.0, "completions/mean_terminated_length": 170.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3593042865395546, "epoch": 0.14353867531264475, "frac_reward_zero_std": 1.0, "grad_norm": 0.004546928219497204, "kl": 0.003188308735843748, "learning_rate": 9.7130152848541e-07, "loss": 0.0002, "num_tokens": 85296462.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3099, "step_time": 18.903316736221313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 185.0, "completions/mean_terminated_length": 185.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.35782644152641296, "epoch": 0.14358499305233904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018865115707740188, "kl": 0.00179413627483882, "learning_rate": 9.712922649374709e-07, "loss": 0.0001, "num_tokens": 85336446.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3100, "step_time": 22.833679974079132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 158.625, "completions/mean_terminated_length": 158.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.1481158770620823, "epoch": 0.14363131079203334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017669608350843191, "kl": 0.0011966716556344181, "learning_rate": 9.712830013895322e-07, "loss": 0.0001, "num_tokens": 85361608.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 3101, "step_time": 17.195040185004473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 214.6875, "completions/mean_terminated_length": 214.6875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.25260768085718155, "epoch": 0.14367762853172766, "frac_reward_zero_std": 0.0, "grad_norm": 0.06848274916410446, "kl": 0.002867669682018459, "learning_rate": 9.712737378415933e-07, "loss": -0.006, "num_tokens": 85386307.0, "reward": 0.1186196506023407, "reward_std": 0.0013871828559786081, "rewards/reward_func/mean": 0.1186196506023407, "rewards/reward_func/std": 0.0013871807605028152, "step": 3102, "step_time": 24.05906977877021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 156.75, "completions/mean_terminated_length": 156.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3555886596441269, "epoch": 0.14372394627142196, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028709163889288902, "kl": 0.002633381634950638, "learning_rate": 9.712644742936545e-07, "loss": 0.0001, "num_tokens": 85415695.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3103, "step_time": 18.29762477427721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 110.9375, "completions/mean_terminated_length": 110.9375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2514931708574295, "epoch": 0.14377026401111626, "frac_reward_zero_std": 1.0, "grad_norm": 0.002546251518651843, "kl": 0.0020225835614837706, "learning_rate": 9.712552107457156e-07, "loss": 0.0001, "num_tokens": 85436702.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3104, "step_time": 12.752374984323978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 196.6875, "completions/mean_terminated_length": 196.6875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.3918505311012268, "epoch": 0.14381658175081055, "frac_reward_zero_std": 0.0, "grad_norm": 0.12146206200122833, "kl": 0.005598803400062025, "learning_rate": 9.712459471977767e-07, "loss": -0.0384, "num_tokens": 85460953.0, "reward": 0.0003962897462770343, "reward_std": 0.0015851589851081371, "rewards/reward_func/mean": 0.0003962897462770343, "rewards/reward_func/std": 0.0015851589851081371, "step": 3105, "step_time": 21.550162710249424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 199.6875, "completions/mean_terminated_length": 199.6875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.19410334154963493, "epoch": 0.14386289949050488, "frac_reward_zero_std": 0.0, "grad_norm": 0.10892891883850098, "kl": 0.002881001215428114, "learning_rate": 9.712366836498378e-07, "loss": 0.0104, "num_tokens": 85486452.0, "reward": 0.9705920815467834, "reward_std": 0.017535503953695297, "rewards/reward_func/mean": 0.9705920815467834, "rewards/reward_func/std": 0.0175354965031147, "step": 3106, "step_time": 20.669698297977448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 235.8125, "completions/mean_terminated_length": 235.8125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "entropy": 0.21750912070274353, "epoch": 0.14390921723019917, "frac_reward_zero_std": 0.0, "grad_norm": 0.11343816667795181, "kl": 0.005503826774656773, "learning_rate": 9.71227420101899e-07, "loss": -0.0276, "num_tokens": 85509697.0, "reward": 0.9566006064414978, "reward_std": 0.014243211597204208, "rewards/reward_func/mean": 0.9566006064414978, "rewards/reward_func/std": 0.014243212528526783, "step": 3107, "step_time": 22.1902144998312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 148.8125, "completions/mean_terminated_length": 148.8125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.17570658028125763, "epoch": 0.14395553496989347, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018220599740743637, "kl": 0.001269097119802609, "learning_rate": 9.7121815655396e-07, "loss": 0.0001, "num_tokens": 85532702.0, "reward": 0.9131007194519043, "reward_std": 0.0, "rewards/reward_func/mean": 0.9131007194519043, "rewards/reward_func/std": 0.0, "step": 3108, "step_time": 16.33903457224369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 181.1875, "completions/mean_terminated_length": 181.1875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.23641077429056168, "epoch": 0.14400185270958776, "frac_reward_zero_std": 1.0, "grad_norm": 0.002284695627167821, "kl": 0.0024181325279641896, "learning_rate": 9.712088930060212e-07, "loss": 0.0001, "num_tokens": 85557809.0, "reward": 0.951229453086853, "reward_std": 0.0, "rewards/reward_func/mean": 0.951229453086853, "rewards/reward_func/std": 0.0, "step": 3109, "step_time": 20.062915228307247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 161.8125, "completions/mean_terminated_length": 161.8125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3653111681342125, "epoch": 0.1440481704492821, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038684229366481304, "kl": 0.00288474268745631, "learning_rate": 9.711996294580823e-07, "loss": 0.0001, "num_tokens": 85582062.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3110, "step_time": 17.6112901866436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 259.8125, "completions/mean_terminated_length": 259.8125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.48962145298719406, "epoch": 0.14409448818897638, "frac_reward_zero_std": 0.0, "grad_norm": 0.08644682914018631, "kl": 0.006093928008340299, "learning_rate": 9.711903659101437e-07, "loss": -0.1471, "num_tokens": 85612283.0, "reward": 0.2566387951374054, "reward_std": 0.42777517437934875, "rewards/reward_func/mean": 0.2566387951374054, "rewards/reward_func/std": 0.42777520418167114, "step": 3111, "step_time": 30.46641080826521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 121.0, "completions/mean_terminated_length": 121.0, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.21173788979649544, "epoch": 0.14414080592867068, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032269067596644163, "kl": 0.0016220287652686238, "learning_rate": 9.711811023622046e-07, "loss": 0.0001, "num_tokens": 85631611.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3112, "step_time": 12.906226556748152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 176.875, "completions/mean_terminated_length": 176.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.36915768682956696, "epoch": 0.14418712366836497, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037062610499560833, "kl": 0.0034915836877189577, "learning_rate": 9.711718388142657e-07, "loss": 0.0002, "num_tokens": 85665449.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3113, "step_time": 20.85910550132394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 117.9375, "completions/mean_terminated_length": 117.9375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2525225207209587, "epoch": 0.1442334414080593, "frac_reward_zero_std": 1.0, "grad_norm": 0.003564269980415702, "kl": 0.002008470590226352, "learning_rate": 9.711625752663268e-07, "loss": 0.0001, "num_tokens": 85686072.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3114, "step_time": 13.030921410769224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 177.125, "completions/mean_terminated_length": 177.125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.3929122984409332, "epoch": 0.1442797591477536, "frac_reward_zero_std": 1.0, "grad_norm": 0.006715687923133373, "kl": 0.004389499430544674, "learning_rate": 9.711533117183882e-07, "loss": 0.0002, "num_tokens": 85707482.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3115, "step_time": 18.148340705782175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 132.9375, "completions/mean_terminated_length": 132.9375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3351728692650795, "epoch": 0.1443260768874479, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021768605802208185, "kl": 0.0019831115496344864, "learning_rate": 9.711440481704493e-07, "loss": 0.0001, "num_tokens": 85729785.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3116, "step_time": 14.500599570572376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 308.3125, "completions/mean_terminated_length": 308.3125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.2655940279364586, "epoch": 0.1443723946271422, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026614207308739424, "kl": 0.0020910808816552162, "learning_rate": 9.711347846225104e-07, "loss": 0.0001, "num_tokens": 85765534.0, "reward": 0.8301368355751038, "reward_std": 0.0, "rewards/reward_func/mean": 0.8301368355751038, "rewards/reward_func/std": 0.0, "step": 3117, "step_time": 31.337754849344492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 157.4375, "completions/mean_terminated_length": 157.4375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4436837360262871, "epoch": 0.1444187123668365, "frac_reward_zero_std": 1.0, "grad_norm": 0.002486135810613632, "kl": 0.002556470106355846, "learning_rate": 9.711255210745716e-07, "loss": 0.0001, "num_tokens": 85808661.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3118, "step_time": 21.65603133663535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 141.25, "completions/mean_terminated_length": 141.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.31348419189453125, "epoch": 0.1444650301065308, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027583360206335783, "kl": 0.0016864509670995176, "learning_rate": 9.711162575266327e-07, "loss": 0.0001, "num_tokens": 85829481.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3119, "step_time": 14.478506825864315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 170.125, "completions/mean_terminated_length": 170.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.4057074710726738, "epoch": 0.1445113478462251, "frac_reward_zero_std": 1.0, "grad_norm": 0.0048325881361961365, "kl": 0.003486860659904778, "learning_rate": 9.711069939786938e-07, "loss": 0.0002, "num_tokens": 85851707.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3120, "step_time": 17.688097588717937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 241.1875, "completions/mean_terminated_length": 241.1875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.2984723672270775, "epoch": 0.1445576655859194, "frac_reward_zero_std": 0.0, "grad_norm": 0.07417890429496765, "kl": 0.0043800766579806805, "learning_rate": 9.71097730430755e-07, "loss": -0.0835, "num_tokens": 85881918.0, "reward": 0.05044776573777199, "reward_std": 0.10316508263349533, "rewards/reward_func/mean": 0.05044776573777199, "rewards/reward_func/std": 0.10316507518291473, "step": 3121, "step_time": 28.65020202472806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 155.875, "completions/mean_terminated_length": 155.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.24655034393072128, "epoch": 0.14460398332561372, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035377475433051586, "kl": 0.0030263965018093586, "learning_rate": 9.71088466882816e-07, "loss": 0.0002, "num_tokens": 85904892.0, "reward": 0.010689839720726013, "reward_std": 0.0, "rewards/reward_func/mean": 0.010689839720726013, "rewards/reward_func/std": 0.0, "step": 3122, "step_time": 16.88920145854354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 161.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.4134618043899536, "epoch": 0.14465030106530802, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022563552483916283, "kl": 0.0022778004640713334, "learning_rate": 9.710792033348772e-07, "loss": 0.0001, "num_tokens": 85942616.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3123, "step_time": 20.020273722708225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 246.4375, "completions/mean_terminated_length": 246.4375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "entropy": 0.24199670553207397, "epoch": 0.14469661880500231, "frac_reward_zero_std": 0.0, "grad_norm": 0.06729083508253098, "kl": 0.004408994689583778, "learning_rate": 9.710699397869383e-07, "loss": -0.0398, "num_tokens": 85981791.0, "reward": 0.18782895803451538, "reward_std": 0.17358309030532837, "rewards/reward_func/mean": 0.18782895803451538, "rewards/reward_func/std": 0.17358307540416718, "step": 3124, "step_time": 28.185834880918264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 141.9375, "completions/mean_terminated_length": 141.9375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.2049425132572651, "epoch": 0.1447429365446966, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032749222591519356, "kl": 0.0018695277394726872, "learning_rate": 9.710606762389994e-07, "loss": 0.0001, "num_tokens": 86004142.0, "reward": 0.9394130706787109, "reward_std": 0.0, "rewards/reward_func/mean": 0.9394130706787109, "rewards/reward_func/std": 0.0, "step": 3125, "step_time": 16.163268078118563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 115.5, "completions/mean_terminated_length": 115.5, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3004875108599663, "epoch": 0.14478925428439093, "frac_reward_zero_std": 1.0, "grad_norm": 0.002676882781088352, "kl": 0.0020285207428969443, "learning_rate": 9.710514126910606e-07, "loss": 0.0001, "num_tokens": 86025606.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3126, "step_time": 12.930851683020592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 150.125, "completions/mean_terminated_length": 150.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.40133099257946014, "epoch": 0.14483557202408523, "frac_reward_zero_std": 1.0, "grad_norm": 0.001862586592324078, "kl": 0.0019097040931228548, "learning_rate": 9.710421491431217e-07, "loss": 0.0001, "num_tokens": 86070696.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3127, "step_time": 22.270724210888147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 170.25, "completions/mean_terminated_length": 170.25, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.25261975824832916, "epoch": 0.14488188976377953, "frac_reward_zero_std": 0.0, "grad_norm": 0.13550913333892822, "kl": 0.012046173447743058, "learning_rate": 9.71032885595183e-07, "loss": 0.0374, "num_tokens": 86098908.0, "reward": 0.2242307960987091, "reward_std": 0.06592182070016861, "rewards/reward_func/mean": 0.2242307960987091, "rewards/reward_func/std": 0.06592182070016861, "step": 3128, "step_time": 20.090839847922325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 154.5625, "completions/mean_terminated_length": 154.5625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.38823341578245163, "epoch": 0.14492820750347382, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027239855844527483, "kl": 0.002623922482598573, "learning_rate": 9.710236220472441e-07, "loss": 0.0001, "num_tokens": 86121221.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3129, "step_time": 16.915862929075956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 121.125, "completions/mean_terminated_length": 121.125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.23917216807603836, "epoch": 0.14497452524316815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034652315080165863, "kl": 0.0022992915473878384, "learning_rate": 9.710143584993053e-07, "loss": 0.0001, "num_tokens": 86140695.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3130, "step_time": 14.590781509876251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 160.625, "completions/mean_terminated_length": 160.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.27476590126752853, "epoch": 0.14502084298286244, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033477682154625654, "kl": 0.002532149839680642, "learning_rate": 9.710050949513664e-07, "loss": 0.0001, "num_tokens": 86162145.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 3131, "step_time": 16.802660521119833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 126.0, "completions/mean_terminated_length": 126.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2965754419565201, "epoch": 0.14506716072255674, "frac_reward_zero_std": 1.0, "grad_norm": 0.004325771238654852, "kl": 0.0022911931155249476, "learning_rate": 9.709958314034275e-07, "loss": 0.0001, "num_tokens": 86183601.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3132, "step_time": 13.96632957085967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 132.3125, "completions/mean_terminated_length": 132.3125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.31247203797101974, "epoch": 0.14511347846225103, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023194546811282635, "kl": 0.001823212078306824, "learning_rate": 9.709865678554886e-07, "loss": 0.0001, "num_tokens": 86216150.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3133, "step_time": 17.256319250911474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 125.5625, "completions/mean_terminated_length": 125.5625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.24830194935202599, "epoch": 0.14515979620194536, "frac_reward_zero_std": 1.0, "grad_norm": 0.004885249305516481, "kl": 0.00285379181150347, "learning_rate": 9.709773043075498e-07, "loss": 0.0001, "num_tokens": 86235727.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3134, "step_time": 13.89758824929595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 185.8125, "completions/mean_terminated_length": 185.8125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3852306455373764, "epoch": 0.14520611394163965, "frac_reward_zero_std": 0.0, "grad_norm": 0.18450158834457397, "kl": 0.01743031432852149, "learning_rate": 9.709680407596109e-07, "loss": -0.0536, "num_tokens": 86257100.0, "reward": 0.5301079750061035, "reward_std": 0.4834319055080414, "rewards/reward_func/mean": 0.5301079750061035, "rewards/reward_func/std": 0.48343193531036377, "step": 3135, "step_time": 21.062769904732704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 152.4375, "completions/mean_terminated_length": 152.4375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.18371649459004402, "epoch": 0.14525243168133395, "frac_reward_zero_std": 1.0, "grad_norm": 0.0074413837864995, "kl": 0.003688816213980317, "learning_rate": 9.70958777211672e-07, "loss": 0.0002, "num_tokens": 86292867.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 3136, "step_time": 19.577458258718252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 122.8125, "completions/mean_terminated_length": 122.8125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3455648347735405, "epoch": 0.14529874942102824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024457850959151983, "kl": 0.0016058568144217134, "learning_rate": 9.709495136637331e-07, "loss": 0.0001, "num_tokens": 86328912.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3137, "step_time": 18.181072514504194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 172.5625, "completions/mean_terminated_length": 172.5625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.37562134116888046, "epoch": 0.14534506716072257, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036446305457502604, "kl": 0.0032337330048903823, "learning_rate": 9.709402501157943e-07, "loss": 0.0002, "num_tokens": 86362105.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3138, "step_time": 21.73952941223979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 131.125, "completions/mean_terminated_length": 131.125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3309723660349846, "epoch": 0.14539138490041686, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027175559662282467, "kl": 0.002465819241479039, "learning_rate": 9.709309865678554e-07, "loss": 0.0001, "num_tokens": 86381995.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3139, "step_time": 13.53687021881342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 152.9375, "completions/mean_terminated_length": 152.9375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.18853257596492767, "epoch": 0.14543770264011116, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035405848175287247, "kl": 0.0023557813256047666, "learning_rate": 9.709217230199165e-07, "loss": 0.0001, "num_tokens": 86413770.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3140, "step_time": 18.62023865059018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 139.625, "completions/mean_terminated_length": 139.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.31725310534238815, "epoch": 0.14548402037980546, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017198971472680569, "kl": 0.001457408448914066, "learning_rate": 9.709124594719779e-07, "loss": 0.0001, "num_tokens": 86437156.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3141, "step_time": 15.070104915648699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 134.3125, "completions/mean_terminated_length": 134.3125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3369581550359726, "epoch": 0.14553033811949978, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037650528829544783, "kl": 0.0024292929738294333, "learning_rate": 9.70903195924039e-07, "loss": 0.0001, "num_tokens": 86460057.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3142, "step_time": 15.357313193380833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 188.9375, "completions/mean_terminated_length": 188.9375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.38247305154800415, "epoch": 0.14557665585919408, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012070218799635768, "kl": 0.001781894447049126, "learning_rate": 9.708939323760999e-07, "loss": 0.0001, "num_tokens": 86519736.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3143, "step_time": 28.206488724797964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 210.6875, "completions/mean_terminated_length": 210.6875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.329260416328907, "epoch": 0.14562297359888837, "frac_reward_zero_std": 0.0, "grad_norm": 0.0919196680188179, "kl": 0.003121873422060162, "learning_rate": 9.70884668828161e-07, "loss": 0.0248, "num_tokens": 86552803.0, "reward": 0.012597200460731983, "reward_std": 0.0003570404660422355, "rewards/reward_func/mean": 0.012597200460731983, "rewards/reward_func/std": 0.0003570404078345746, "step": 3144, "step_time": 26.431716088205576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 180.375, "completions/mean_terminated_length": 180.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.17420194670557976, "epoch": 0.14566929133858267, "frac_reward_zero_std": 0.0, "grad_norm": 0.11577330529689789, "kl": 0.00298696500249207, "learning_rate": 9.708754052802224e-07, "loss": -0.0038, "num_tokens": 86594889.0, "reward": 0.9507848024368286, "reward_std": 0.03160060942173004, "rewards/reward_func/mean": 0.9507848024368286, "rewards/reward_func/std": 0.03160062059760094, "step": 3145, "step_time": 22.716472662985325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 126.0, "completions/mean_terminated_length": 126.0, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.2812327370047569, "epoch": 0.145715609078277, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022909201215952635, "kl": 0.0015351980400737375, "learning_rate": 9.708661417322835e-07, "loss": 0.0001, "num_tokens": 86614233.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3146, "step_time": 16.275097895413637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 116.0, "completions/mean_terminated_length": 116.0, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.27915628254413605, "epoch": 0.1457619268179713, "frac_reward_zero_std": 1.0, "grad_norm": 0.003274570917710662, "kl": 0.0017979002150241286, "learning_rate": 9.708568781843446e-07, "loss": 0.0001, "num_tokens": 86633657.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3147, "step_time": 13.511092584580183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 118.75, "completions/mean_terminated_length": 118.75, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2631353735923767, "epoch": 0.14580824455766558, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030122355092316866, "kl": 0.002176519948989153, "learning_rate": 9.708476146364057e-07, "loss": 0.0001, "num_tokens": 86653973.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3148, "step_time": 13.861914370208979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 133.3125, "completions/mean_terminated_length": 133.3125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.21980035305023193, "epoch": 0.14585456229735988, "frac_reward_zero_std": 1.0, "grad_norm": 0.002678386867046356, "kl": 0.0018450567149557173, "learning_rate": 9.708383510884669e-07, "loss": 0.0001, "num_tokens": 86673610.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3149, "step_time": 13.962938833981752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 133.0, "completions/mean_terminated_length": 133.0, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.27759916335344315, "epoch": 0.1459008800370542, "frac_reward_zero_std": 1.0, "grad_norm": 0.003311938839033246, "kl": 0.002035931043792516, "learning_rate": 9.70829087540528e-07, "loss": 0.0001, "num_tokens": 86701610.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3150, "step_time": 16.65987069159746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 151.5625, "completions/mean_terminated_length": 151.5625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.20273417234420776, "epoch": 0.1459471977767485, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019732709042727947, "kl": 0.0019016769365407526, "learning_rate": 9.70819823992589e-07, "loss": 0.0001, "num_tokens": 86722371.0, "reward": 0.22313016653060913, "reward_std": 0.0, "rewards/reward_func/mean": 0.22313016653060913, "rewards/reward_func/std": 0.0, "step": 3151, "step_time": 16.397979117929935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 174.9375, "completions/mean_terminated_length": 174.9375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.24122145399451256, "epoch": 0.1459935155164428, "frac_reward_zero_std": 1.0, "grad_norm": 0.008388607762753963, "kl": 0.00425613671541214, "learning_rate": 9.708105604446502e-07, "loss": 0.0002, "num_tokens": 86746674.0, "reward": 0.2822723090648651, "reward_std": 0.0, "rewards/reward_func/mean": 0.2822723090648651, "rewards/reward_func/std": 0.0, "step": 3152, "step_time": 18.112127546221018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 136.4375, "completions/mean_terminated_length": 136.4375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2859875038266182, "epoch": 0.1460398332561371, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026483035180717707, "kl": 0.0019898385216947645, "learning_rate": 9.708012968967114e-07, "loss": 0.0001, "num_tokens": 86770857.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3153, "step_time": 16.893994688987732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 199.0625, "completions/mean_terminated_length": 199.0625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.1705622784793377, "epoch": 0.14608615099583142, "frac_reward_zero_std": 0.0, "grad_norm": 0.212143212556839, "kl": 0.004216172615997493, "learning_rate": 9.707920333487725e-07, "loss": 0.0482, "num_tokens": 86792730.0, "reward": 0.8634105324745178, "reward_std": 0.09766335785388947, "rewards/reward_func/mean": 0.8634105324745178, "rewards/reward_func/std": 0.09766335785388947, "step": 3154, "step_time": 20.41399770975113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 165.0625, "completions/mean_terminated_length": 165.0625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.24174608290195465, "epoch": 0.1461324687355257, "frac_reward_zero_std": 0.0, "grad_norm": 0.09421799331903458, "kl": 0.003022257413249463, "learning_rate": 9.707827698008336e-07, "loss": 0.017, "num_tokens": 86817227.0, "reward": 0.4943650960922241, "reward_std": 0.1352176070213318, "rewards/reward_func/mean": 0.4943650960922241, "rewards/reward_func/std": 0.1352176070213318, "step": 3155, "step_time": 18.282536655664444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 144.875, "completions/mean_terminated_length": 144.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.32472915202379227, "epoch": 0.14617878647522, "frac_reward_zero_std": 1.0, "grad_norm": 0.005474370904266834, "kl": 0.002662398968823254, "learning_rate": 9.707735062528947e-07, "loss": 0.0001, "num_tokens": 86838217.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3156, "step_time": 15.807528786361217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 222.0, "completions/mean_terminated_length": 222.0, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.4381396025419235, "epoch": 0.1462251042149143, "frac_reward_zero_std": 0.0, "grad_norm": 0.08899547159671783, "kl": 0.004325619898736477, "learning_rate": 9.707642427049559e-07, "loss": -0.0287, "num_tokens": 86872345.0, "reward": 0.25, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.44721361994743347, "step": 3157, "step_time": 29.026765812188387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 169.6875, "completions/mean_terminated_length": 169.6875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.4100569635629654, "epoch": 0.14627142195460863, "frac_reward_zero_std": 1.0, "grad_norm": 0.005031133070588112, "kl": 0.0038968485314399004, "learning_rate": 9.707549791570172e-07, "loss": 0.0002, "num_tokens": 86898868.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3158, "step_time": 18.37262473627925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 164.9375, "completions/mean_terminated_length": 164.9375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.22437533736228943, "epoch": 0.14631773969430292, "frac_reward_zero_std": 0.0, "grad_norm": 0.13961029052734375, "kl": 0.007461741857696325, "learning_rate": 9.707457156090783e-07, "loss": -0.0655, "num_tokens": 86921171.0, "reward": 0.6732373237609863, "reward_std": 0.3405424654483795, "rewards/reward_func/mean": 0.6732373237609863, "rewards/reward_func/std": 0.3405424654483795, "step": 3159, "step_time": 18.213964194059372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 154.375, "completions/mean_terminated_length": 154.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3993169814348221, "epoch": 0.14636405743399722, "frac_reward_zero_std": 1.0, "grad_norm": 0.010213624686002731, "kl": 0.003526468004565686, "learning_rate": 9.707364520611394e-07, "loss": 0.0002, "num_tokens": 86951097.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3160, "step_time": 17.57730484753847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 193.0, "completions/mean_terminated_length": 193.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.22103362530469894, "epoch": 0.14641037517369151, "frac_reward_zero_std": 0.0, "grad_norm": 0.11382729560136795, "kl": 0.0024669055710546672, "learning_rate": 9.707271885132006e-07, "loss": -0.0636, "num_tokens": 86975513.0, "reward": 0.5914242267608643, "reward_std": 0.47124677896499634, "rewards/reward_func/mean": 0.5914242267608643, "rewards/reward_func/std": 0.4712468087673187, "step": 3161, "step_time": 19.94507908821106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 121.8125, "completions/mean_terminated_length": 121.8125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.35823170840740204, "epoch": 0.14645669291338584, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034570039715617895, "kl": 0.002837582549545914, "learning_rate": 9.707179249652617e-07, "loss": 0.0001, "num_tokens": 87003062.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3162, "step_time": 15.633826054632664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 138.9375, "completions/mean_terminated_length": 138.9375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3633083179593086, "epoch": 0.14650301065308013, "frac_reward_zero_std": 1.0, "grad_norm": 0.005952723324298859, "kl": 0.0032613181974738836, "learning_rate": 9.707086614173228e-07, "loss": 0.0002, "num_tokens": 87023733.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3163, "step_time": 16.495303072035313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 136.625, "completions/mean_terminated_length": 136.625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.26769497245550156, "epoch": 0.14654932839277443, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015609278343617916, "kl": 0.0012672932643909007, "learning_rate": 9.70699397869384e-07, "loss": 0.0001, "num_tokens": 87045103.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3164, "step_time": 14.988057252019644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1024.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 219.06668090820312, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.4312158152461052, "epoch": 0.14659564613246873, "frac_reward_zero_std": 0.0, "grad_norm": 0.07963003218173981, "kl": 0.008922230685129762, "learning_rate": 9.70690134321445e-07, "loss": -0.0593, "num_tokens": 87082725.0, "reward": 0.029000209644436836, "reward_std": 0.1158318817615509, "rewards/reward_func/mean": 0.029000209644436836, "rewards/reward_func/std": 0.1158318817615509, "step": 3165, "step_time": 82.63550824671984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 147.4375, "completions/mean_terminated_length": 147.4375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3034921735525131, "epoch": 0.14664196387216305, "frac_reward_zero_std": 1.0, "grad_norm": 0.006009840406477451, "kl": 0.0034268525196239352, "learning_rate": 9.706808707735062e-07, "loss": 0.0002, "num_tokens": 87103004.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3166, "step_time": 16.10325925424695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 211.75, "completions/mean_terminated_length": 211.75, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.263863917440176, "epoch": 0.14668828161185735, "frac_reward_zero_std": 0.0, "grad_norm": 0.12380976229906082, "kl": 0.00807945930864662, "learning_rate": 9.706716072255673e-07, "loss": -0.0214, "num_tokens": 87132760.0, "reward": 0.8609727621078491, "reward_std": 0.24149809777736664, "rewards/reward_func/mean": 0.8609727621078491, "rewards/reward_func/std": 0.24149809777736664, "step": 3167, "step_time": 22.885518915951252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3837863504886627, "epoch": 0.14673459935155164, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017830505967140198, "kl": 0.002071816212264821, "learning_rate": 9.706623436776284e-07, "loss": 0.0001, "num_tokens": 87173712.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3168, "step_time": 21.10164326801896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.1757211908698082, "epoch": 0.14678091709124594, "frac_reward_zero_std": 1.0, "grad_norm": 0.004815271124243736, "kl": 0.0019804288749583066, "learning_rate": 9.706530801296896e-07, "loss": 0.0001, "num_tokens": 87197368.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3169, "step_time": 18.87556202709675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 215.5, "completions/mean_terminated_length": 215.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.24164295196533203, "epoch": 0.14682723483094026, "frac_reward_zero_std": 0.0, "grad_norm": 0.1353035271167755, "kl": 0.00897871411871165, "learning_rate": 9.706438165817507e-07, "loss": -0.0041, "num_tokens": 87229216.0, "reward": 0.7134220600128174, "reward_std": 0.4254027307033539, "rewards/reward_func/mean": 0.7134220600128174, "rewards/reward_func/std": 0.4254027307033539, "step": 3170, "step_time": 23.80917016416788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 216.4375, "completions/mean_terminated_length": 216.4375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.2717517167329788, "epoch": 0.14687355257063456, "frac_reward_zero_std": 0.0, "grad_norm": 0.19454370439052582, "kl": 0.010912682628259063, "learning_rate": 9.70634553033812e-07, "loss": 0.0039, "num_tokens": 87260199.0, "reward": 0.8419197797775269, "reward_std": 0.3306751251220703, "rewards/reward_func/mean": 0.8419197797775269, "rewards/reward_func/std": 0.3306751251220703, "step": 3171, "step_time": 22.230905380100012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 200.0, "completions/mean_terminated_length": 200.0, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.41661446541547775, "epoch": 0.14691987031032885, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027504321187734604, "kl": 0.002257747430121526, "learning_rate": 9.706252894858731e-07, "loss": 0.0001, "num_tokens": 87317207.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3172, "step_time": 30.25461930781603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 183.3125, "completions/mean_terminated_length": 183.3125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.22347508743405342, "epoch": 0.14696618805002315, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032916993368417025, "kl": 0.002332076954189688, "learning_rate": 9.706160259379343e-07, "loss": 0.0001, "num_tokens": 87344204.0, "reward": 0.951229453086853, "reward_std": 0.0, "rewards/reward_func/mean": 0.951229453086853, "rewards/reward_func/std": 0.0, "step": 3173, "step_time": 19.114656172692776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 276.0625, "completions/mean_terminated_length": 276.0625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.25793934240937233, "epoch": 0.14701250578971747, "frac_reward_zero_std": 0.0, "grad_norm": 0.08309826999902725, "kl": 0.00859370466787368, "learning_rate": 9.706067623899952e-07, "loss": -0.1086, "num_tokens": 87377437.0, "reward": 0.7614637613296509, "reward_std": 0.38147062063217163, "rewards/reward_func/mean": 0.7614637613296509, "rewards/reward_func/std": 0.381470650434494, "step": 3174, "step_time": 28.57931701466441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 219.25, "completions/mean_terminated_length": 219.25, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.20336568355560303, "epoch": 0.14705882352941177, "frac_reward_zero_std": 1.0, "grad_norm": 0.005387068260461092, "kl": 0.0037300767726264894, "learning_rate": 9.705974988420565e-07, "loss": 0.0002, "num_tokens": 87415377.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3175, "step_time": 23.53774171322584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 149.1875, "completions/mean_terminated_length": 149.1875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.24261876940727234, "epoch": 0.14710514126910607, "frac_reward_zero_std": 1.0, "grad_norm": 0.006944534834474325, "kl": 0.002476296591339633, "learning_rate": 9.705882352941176e-07, "loss": 0.0001, "num_tokens": 87435172.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3176, "step_time": 15.371956024318933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 135.9375, "completions/mean_terminated_length": 135.9375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.24093306064605713, "epoch": 0.14715145900880036, "frac_reward_zero_std": 1.0, "grad_norm": 0.009514703415334225, "kl": 0.004567248863168061, "learning_rate": 9.705789717461788e-07, "loss": 0.0002, "num_tokens": 87455315.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3177, "step_time": 13.69770859926939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 175.3125, "completions/mean_terminated_length": 175.3125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.24081763625144958, "epoch": 0.14719777674849469, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016568658174946904, "kl": 0.0015491091180592775, "learning_rate": 9.7056970819824e-07, "loss": 0.0001, "num_tokens": 87478216.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3178, "step_time": 19.73670904710889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 122.9375, "completions/mean_terminated_length": 122.9375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2850130945444107, "epoch": 0.14724409448818898, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033600255846977234, "kl": 0.0024400824040640146, "learning_rate": 9.70560444650301e-07, "loss": 0.0001, "num_tokens": 87499943.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3179, "step_time": 13.772666417062283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 137.1875, "completions/mean_terminated_length": 137.1875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2709697335958481, "epoch": 0.14729041222788328, "frac_reward_zero_std": 1.0, "grad_norm": 0.004015101585537195, "kl": 0.002116362185915932, "learning_rate": 9.705511811023621e-07, "loss": 0.0001, "num_tokens": 87523626.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3180, "step_time": 15.20681268721819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.2954035848379135, "epoch": 0.14733672996757757, "frac_reward_zero_std": 1.0, "grad_norm": 0.003261976409703493, "kl": 0.0021558912412729114, "learning_rate": 9.705419175544233e-07, "loss": 0.0001, "num_tokens": 87550550.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3181, "step_time": 15.935935780405998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 183.6875, "completions/mean_terminated_length": 183.6875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.37066788971424103, "epoch": 0.1473830477072719, "frac_reward_zero_std": 1.0, "grad_norm": 0.011409871280193329, "kl": 0.005945907789282501, "learning_rate": 9.705326540064844e-07, "loss": 0.0003, "num_tokens": 87573057.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3182, "step_time": 21.450191903859377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 194.8125, "completions/mean_terminated_length": 194.8125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.31197257339954376, "epoch": 0.1474293654469662, "frac_reward_zero_std": 0.0, "grad_norm": 0.16130420565605164, "kl": 0.0033873734064400196, "learning_rate": 9.705233904585455e-07, "loss": -0.1348, "num_tokens": 87607534.0, "reward": 0.4347401559352875, "reward_std": 0.41662493348121643, "rewards/reward_func/mean": 0.4347401559352875, "rewards/reward_func/std": 0.41662493348121643, "step": 3183, "step_time": 24.65077030658722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 170.875, "completions/mean_terminated_length": 170.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.17241237312555313, "epoch": 0.1474756831866605, "frac_reward_zero_std": 1.0, "grad_norm": 0.002606244059279561, "kl": 0.001355106389382854, "learning_rate": 9.705141269106066e-07, "loss": 0.0001, "num_tokens": 87656268.0, "reward": 0.9428731203079224, "reward_std": 0.0, "rewards/reward_func/mean": 0.9428731203079224, "rewards/reward_func/std": 0.0, "step": 3184, "step_time": 24.78933433443308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 227.375, "completions/mean_terminated_length": 227.375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.3972332626581192, "epoch": 0.14752200092635478, "frac_reward_zero_std": 0.0, "grad_norm": 0.08933868259191513, "kl": 0.006041689426638186, "learning_rate": 9.70504863362668e-07, "loss": -0.3568, "num_tokens": 87683954.0, "reward": 0.1817312240600586, "reward_std": 0.39071038365364075, "rewards/reward_func/mean": 0.1817312240600586, "rewards/reward_func/std": 0.39071041345596313, "step": 3185, "step_time": 36.7055429071188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 180.875, "completions/mean_terminated_length": 180.875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.378615565598011, "epoch": 0.1475683186660491, "frac_reward_zero_std": 0.0, "grad_norm": 0.1564921885728836, "kl": 0.005293174646794796, "learning_rate": 9.70495599814729e-07, "loss": 0.0716, "num_tokens": 87712528.0, "reward": 0.3393140435218811, "reward_std": 0.4524186849594116, "rewards/reward_func/mean": 0.3393140435218811, "rewards/reward_func/std": 0.4524187445640564, "step": 3186, "step_time": 21.542234182357788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 184.75, "completions/mean_terminated_length": 184.75, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.37833455204963684, "epoch": 0.1476146364057434, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026527580339461565, "kl": 0.00210870907176286, "learning_rate": 9.7048633626679e-07, "loss": 0.0001, "num_tokens": 87760604.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3187, "step_time": 25.973256528377533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 203.125, "completions/mean_terminated_length": 203.125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.3513263314962387, "epoch": 0.1476609541454377, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036778149660676718, "kl": 0.0033898663241416216, "learning_rate": 9.704770727188514e-07, "loss": 0.0002, "num_tokens": 87794494.0, "reward": 0.169904425740242, "reward_std": 0.0, "rewards/reward_func/mean": 0.169904425740242, "rewards/reward_func/std": 0.0, "step": 3188, "step_time": 22.684977620840073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 203.9375, "completions/mean_terminated_length": 203.9375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.4454081431031227, "epoch": 0.147707271885132, "frac_reward_zero_std": 0.0, "grad_norm": 0.1215273067355156, "kl": 0.008781867567449808, "learning_rate": 9.704678091709125e-07, "loss": 0.0495, "num_tokens": 87818925.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 3189, "step_time": 21.12729962915182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 199.125, "completions/mean_terminated_length": 199.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.23385056480765343, "epoch": 0.14775358962482632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016233468195423484, "kl": 0.0012905322655569762, "learning_rate": 9.704585456229736e-07, "loss": 0.0001, "num_tokens": 87853327.0, "reward": 0.6147881746292114, "reward_std": 0.0, "rewards/reward_func/mean": 0.6147881746292114, "rewards/reward_func/std": 0.0, "step": 3190, "step_time": 22.21422252431512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 213.5, "completions/mean_terminated_length": 213.5, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.24486621469259262, "epoch": 0.14779990736452062, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017838386120274663, "kl": 0.0016516250325366855, "learning_rate": 9.704492820750347e-07, "loss": 0.0001, "num_tokens": 87882647.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3191, "step_time": 21.689008958637714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.39651864767074585, "epoch": 0.1478462251042149, "frac_reward_zero_std": 1.0, "grad_norm": 0.006718486547470093, "kl": 0.004706620529759675, "learning_rate": 9.704400185270959e-07, "loss": 0.0002, "num_tokens": 87920563.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3192, "step_time": 22.389150597155094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 172.5, "completions/mean_terminated_length": 172.5, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.210244283080101, "epoch": 0.1478925428439092, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016268540639430285, "kl": 0.001841002085711807, "learning_rate": 9.70430754979157e-07, "loss": 0.0001, "num_tokens": 87942907.0, "reward": 0.5471704602241516, "reward_std": 0.0, "rewards/reward_func/mean": 0.5471704602241516, "rewards/reward_func/std": 0.0, "step": 3193, "step_time": 17.487916626036167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 242.8125, "completions/mean_terminated_length": 242.8125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.17763400077819824, "epoch": 0.14793886058360353, "frac_reward_zero_std": 1.0, "grad_norm": 0.006192723289132118, "kl": 0.003738157683983445, "learning_rate": 9.704214914312181e-07, "loss": 0.0002, "num_tokens": 87967928.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3194, "step_time": 24.626737490296364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 209.125, "completions/mean_terminated_length": 209.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.27085962146520615, "epoch": 0.14798517832329783, "frac_reward_zero_std": 0.0, "grad_norm": 0.08795936405658722, "kl": 0.0032920980593189597, "learning_rate": 9.704122278832792e-07, "loss": 0.0384, "num_tokens": 88005498.0, "reward": 0.9402012825012207, "reward_std": 0.16340123116970062, "rewards/reward_func/mean": 0.9402012825012207, "rewards/reward_func/std": 0.16340124607086182, "step": 3195, "step_time": 24.832705087959766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 147.9375, "completions/mean_terminated_length": 147.9375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.34720899164676666, "epoch": 0.14803149606299212, "frac_reward_zero_std": 1.0, "grad_norm": 0.00362460152246058, "kl": 0.0026282008038833737, "learning_rate": 9.704029643353404e-07, "loss": 0.0001, "num_tokens": 88036553.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3196, "step_time": 17.945649698376656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 120.875, "completions/mean_terminated_length": 120.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3234320357441902, "epoch": 0.14807781380268642, "frac_reward_zero_std": 1.0, "grad_norm": 0.002180765150114894, "kl": 0.0016997777274809778, "learning_rate": 9.703937007874015e-07, "loss": 0.0001, "num_tokens": 88057847.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3197, "step_time": 13.488488294184208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 142.1875, "completions/mean_terminated_length": 142.1875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.403076708316803, "epoch": 0.14812413154238074, "frac_reward_zero_std": 1.0, "grad_norm": 0.007377514149993658, "kl": 0.003922757343389094, "learning_rate": 9.703844372394626e-07, "loss": 0.0002, "num_tokens": 88101066.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3198, "step_time": 20.92857074737549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 350.5, "completions/mean_terminated_length": 350.5, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "entropy": 0.3393295705318451, "epoch": 0.14817044928207504, "frac_reward_zero_std": 1.0, "grad_norm": 0.003656057408079505, "kl": 0.0035902425879612565, "learning_rate": 9.703751736915237e-07, "loss": 0.0002, "num_tokens": 88142146.0, "reward": 0.7663013935089111, "reward_std": 0.0, "rewards/reward_func/mean": 0.7663013935089111, "rewards/reward_func/std": 0.0, "step": 3199, "step_time": 37.4694101922214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 123.0625, "completions/mean_terminated_length": 123.0625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.30981073528528214, "epoch": 0.14821676702176934, "frac_reward_zero_std": 1.0, "grad_norm": 0.005902593489736319, "kl": 0.0032484132098034024, "learning_rate": 9.703659101435849e-07, "loss": 0.0002, "num_tokens": 88161651.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3200, "step_time": 13.90316016599536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 170.8125, "completions/mean_terminated_length": 170.8125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3382454290986061, "epoch": 0.14826308476146363, "frac_reward_zero_std": 1.0, "grad_norm": 0.009572918526828289, "kl": 0.005233524250797927, "learning_rate": 9.703566465956462e-07, "loss": 0.0003, "num_tokens": 88182048.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3201, "step_time": 17.268680974841118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 180.8125, "completions/mean_terminated_length": 180.8125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.22474030777812004, "epoch": 0.14830940250115796, "frac_reward_zero_std": 1.0, "grad_norm": 0.004835336469113827, "kl": 0.0031781523721292615, "learning_rate": 9.703473830477073e-07, "loss": 0.0002, "num_tokens": 88206765.0, "reward": 0.9534969329833984, "reward_std": 0.0, "rewards/reward_func/mean": 0.9534969329833984, "rewards/reward_func/std": 0.0, "step": 3202, "step_time": 19.9906326495111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 202.25, "completions/mean_terminated_length": 202.25, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.36052901297807693, "epoch": 0.14835572024085225, "frac_reward_zero_std": 0.0, "grad_norm": 0.09114061295986176, "kl": 0.007722419453784823, "learning_rate": 9.703381194997684e-07, "loss": -0.0326, "num_tokens": 88229025.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.3415650427341461, "step": 3203, "step_time": 20.64932259172201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 191.75, "completions/mean_terminated_length": 191.75, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.1975666582584381, "epoch": 0.14840203798054655, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023789252154529095, "kl": 0.0017853862955234945, "learning_rate": 9.703288559518296e-07, "loss": 0.0001, "num_tokens": 88252541.0, "reward": 0.32680743932724, "reward_std": 0.0, "rewards/reward_func/mean": 0.32680743932724, "rewards/reward_func/std": 0.0, "step": 3204, "step_time": 19.48708562925458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.3965426906943321, "epoch": 0.14844835572024084, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024332997854799032, "kl": 0.002204675634857267, "learning_rate": 9.703195924038907e-07, "loss": 0.0001, "num_tokens": 88298277.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3205, "step_time": 23.821564003825188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 265.875, "completions/mean_terminated_length": 265.875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.26234976947307587, "epoch": 0.14849467345993517, "frac_reward_zero_std": 0.0, "grad_norm": 0.08036817610263824, "kl": 0.007552730618044734, "learning_rate": 9.703103288559518e-07, "loss": 0.0607, "num_tokens": 88328707.0, "reward": 0.8278868198394775, "reward_std": 0.23553621768951416, "rewards/reward_func/mean": 0.8278868198394775, "rewards/reward_func/std": 0.23553623259067535, "step": 3206, "step_time": 28.886310018599033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 119.75, "completions/mean_terminated_length": 119.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.24381082132458687, "epoch": 0.14854099119962946, "frac_reward_zero_std": 1.0, "grad_norm": 0.006969882640987635, "kl": 0.002445285295834765, "learning_rate": 9.70301065308013e-07, "loss": 0.0001, "num_tokens": 88348367.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3207, "step_time": 14.086081974208355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 119.0, "completions/mean_terminated_length": 119.0, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.29726671427488327, "epoch": 0.14858730893932376, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015938072465360165, "kl": 0.0015336872020270675, "learning_rate": 9.70291801760074e-07, "loss": 0.0001, "num_tokens": 88371119.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3208, "step_time": 13.541106462478638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 164.25, "completions/mean_terminated_length": 164.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.116773946210742, "epoch": 0.14863362667901805, "frac_reward_zero_std": 0.0, "grad_norm": 0.1421230435371399, "kl": 0.0242515429854393, "learning_rate": 9.702825382121352e-07, "loss": -0.0416, "num_tokens": 88393571.0, "reward": 0.9391771554946899, "reward_std": 0.1661996841430664, "rewards/reward_func/mean": 0.9391771554946899, "rewards/reward_func/std": 0.1661996990442276, "step": 3209, "step_time": 16.628654144704342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 200.0, "completions/mean_terminated_length": 200.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.24974997341632843, "epoch": 0.14867994441871238, "frac_reward_zero_std": 1.0, "grad_norm": 0.004038608632981777, "kl": 0.003384222276508808, "learning_rate": 9.702732746641963e-07, "loss": 0.0002, "num_tokens": 88419571.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 3210, "step_time": 21.770447835326195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 213.625, "completions/mean_terminated_length": 213.625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.44399744272232056, "epoch": 0.14872626215840667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036541500594466925, "kl": 0.0028832972166128457, "learning_rate": 9.702640111162574e-07, "loss": 0.0001, "num_tokens": 88448765.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3211, "step_time": 23.496972754597664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 162.375, "completions/mean_terminated_length": 162.375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.14537383988499641, "epoch": 0.14877257989810097, "frac_reward_zero_std": 1.0, "grad_norm": 0.004367548506706953, "kl": 0.0027702118968591094, "learning_rate": 9.702547475683186e-07, "loss": 0.0001, "num_tokens": 88470947.0, "reward": 0.8250529766082764, "reward_std": 0.0, "rewards/reward_func/mean": 0.8250529766082764, "rewards/reward_func/std": 0.0, "step": 3212, "step_time": 16.668695371598005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 141.875, "completions/mean_terminated_length": 141.875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.21356702968478203, "epoch": 0.14881889763779527, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032490375451743603, "kl": 0.002257484768051654, "learning_rate": 9.702454840203797e-07, "loss": 0.0001, "num_tokens": 88490945.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3213, "step_time": 13.733813617378473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 148.6875, "completions/mean_terminated_length": 148.6875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3643890991806984, "epoch": 0.1488652153774896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014543255092576146, "kl": 0.0017717880546115339, "learning_rate": 9.702362204724408e-07, "loss": 0.0001, "num_tokens": 88543036.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3214, "step_time": 23.729951851069927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 180.9375, "completions/mean_terminated_length": 180.9375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.36227110773324966, "epoch": 0.1489115331171839, "frac_reward_zero_std": 1.0, "grad_norm": 0.00365792540833354, "kl": 0.0025077573372982442, "learning_rate": 9.702269569245022e-07, "loss": 0.0001, "num_tokens": 88570475.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3215, "step_time": 22.424935221672058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 194.375, "completions/mean_terminated_length": 194.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.37817592918872833, "epoch": 0.14895785085687818, "frac_reward_zero_std": 0.0, "grad_norm": 0.12057393044233322, "kl": 0.008120844489894807, "learning_rate": 9.702176933765633e-07, "loss": -0.0371, "num_tokens": 88592369.0, "reward": 0.6736177206039429, "reward_std": 0.4698963463306427, "rewards/reward_func/mean": 0.6736177206039429, "rewards/reward_func/std": 0.4698963761329651, "step": 3216, "step_time": 19.30144726112485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 139.1875, "completions/mean_terminated_length": 139.1875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.1959523782134056, "epoch": 0.14900416859657248, "frac_reward_zero_std": 0.0, "grad_norm": 0.128775954246521, "kl": 0.020842507015913725, "learning_rate": 9.702084298286242e-07, "loss": -0.105, "num_tokens": 88618676.0, "reward": 0.45511555671691895, "reward_std": 0.3485543727874756, "rewards/reward_func/mean": 0.45511555671691895, "rewards/reward_func/std": 0.3485543727874756, "step": 3217, "step_time": 16.61471777409315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 139.875, "completions/mean_terminated_length": 139.875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.31921062618494034, "epoch": 0.1490504863362668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025665706489235163, "kl": 0.0021532553946599364, "learning_rate": 9.701991662806855e-07, "loss": 0.0001, "num_tokens": 88640482.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3218, "step_time": 15.491533864289522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 204.6875, "completions/mean_terminated_length": 204.6875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.3016367256641388, "epoch": 0.1490968040759611, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037496606819331646, "kl": 0.0024888652842491865, "learning_rate": 9.701899027327467e-07, "loss": 0.0001, "num_tokens": 88666141.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3219, "step_time": 20.989007283002138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 194.0, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.18908924981951714, "epoch": 0.1491431218156554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033928302582353354, "kl": 0.0019500574562698603, "learning_rate": 9.701806391848078e-07, "loss": 0.0001, "num_tokens": 88697245.0, "reward": 0.9534969329833984, "reward_std": 0.0, "rewards/reward_func/mean": 0.9534969329833984, "rewards/reward_func/std": 0.0, "step": 3220, "step_time": 21.1616225913167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 148.3125, "completions/mean_terminated_length": 148.3125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.2388499341905117, "epoch": 0.1491894395553497, "frac_reward_zero_std": 0.0, "grad_norm": 0.10226103663444519, "kl": 0.0020513379713520408, "learning_rate": 9.70171375636869e-07, "loss": 0.008, "num_tokens": 88718754.0, "reward": 0.9940523505210876, "reward_std": 0.023790646344423294, "rewards/reward_func/mean": 0.9940523505210876, "rewards/reward_func/std": 0.023790642619132996, "step": 3221, "step_time": 15.831282813102007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 116.875, "completions/mean_terminated_length": 116.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2894498035311699, "epoch": 0.149235757295044, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027666990645229816, "kl": 0.0019624547276180238, "learning_rate": 9.7016211208893e-07, "loss": 0.0001, "num_tokens": 88739360.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3222, "step_time": 13.560561783611774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 277.4375, "completions/mean_terminated_length": 277.4375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.2797243595123291, "epoch": 0.1492820750347383, "frac_reward_zero_std": 1.0, "grad_norm": 0.012131128460168839, "kl": 0.007326827384531498, "learning_rate": 9.701528485409912e-07, "loss": 0.0004, "num_tokens": 88764935.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3223, "step_time": 26.064146503806114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 161.4375, "completions/mean_terminated_length": 161.4375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.33683259040117264, "epoch": 0.1493283927744326, "frac_reward_zero_std": 1.0, "grad_norm": 0.005655890330672264, "kl": 0.0031368256313726306, "learning_rate": 9.701435849930523e-07, "loss": 0.0002, "num_tokens": 88800158.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3224, "step_time": 20.38260020688176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 155.5, "completions/mean_terminated_length": 155.5, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.1556338369846344, "epoch": 0.1493747105141269, "frac_reward_zero_std": 0.0, "grad_norm": 0.18061292171478271, "kl": 0.0028197705396451056, "learning_rate": 9.701343214451134e-07, "loss": 0.0009, "num_tokens": 88822134.0, "reward": 0.9353712797164917, "reward_std": 0.01723429374396801, "rewards/reward_func/mean": 0.9353712797164917, "rewards/reward_func/std": 0.017234310507774353, "step": 3225, "step_time": 16.026508655399084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 110.0, "completions/mean_terminated_length": 110.0, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2682938277721405, "epoch": 0.14942102825382123, "frac_reward_zero_std": 1.0, "grad_norm": 0.004825829993933439, "kl": 0.0025337798288092017, "learning_rate": 9.701250578971745e-07, "loss": 0.0001, "num_tokens": 88842390.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3226, "step_time": 12.033143199980259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 208.8125, "completions/mean_terminated_length": 208.8125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.4151565060019493, "epoch": 0.14946734599351552, "frac_reward_zero_std": 0.0, "grad_norm": 0.11671051383018494, "kl": 0.009419246343895793, "learning_rate": 9.701157943492357e-07, "loss": -0.0443, "num_tokens": 88872707.0, "reward": 0.1004081666469574, "reward_std": 0.2836160361766815, "rewards/reward_func/mean": 0.1004081666469574, "rewards/reward_func/std": 0.2836160361766815, "step": 3227, "step_time": 23.99384493380785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 161.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.38457028567790985, "epoch": 0.14951366373320982, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025691171176731586, "kl": 0.0020561676064971834, "learning_rate": 9.70106530801297e-07, "loss": 0.0001, "num_tokens": 88911767.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3228, "step_time": 20.47853649035096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 161.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.1561616212129593, "epoch": 0.1495599814729041, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015544591005891562, "kl": 0.0013966645346954465, "learning_rate": 9.70097267253358e-07, "loss": 0.0001, "num_tokens": 88935419.0, "reward": 0.9428731203079224, "reward_std": 0.0, "rewards/reward_func/mean": 0.9428731203079224, "rewards/reward_func/std": 0.0, "step": 3229, "step_time": 17.219679478555918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 124.875, "completions/mean_terminated_length": 124.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.29462575912475586, "epoch": 0.14960629921259844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028986926190555096, "kl": 0.0019535927567631006, "learning_rate": 9.70088003705419e-07, "loss": 0.0001, "num_tokens": 88957065.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3230, "step_time": 13.661428939551115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 160.8125, "completions/mean_terminated_length": 160.8125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.1987394392490387, "epoch": 0.14965261695229273, "frac_reward_zero_std": 0.0, "grad_norm": 0.12018464505672455, "kl": 0.020558612421154976, "learning_rate": 9.700787401574804e-07, "loss": -0.0604, "num_tokens": 88994118.0, "reward": 0.7430945634841919, "reward_std": 0.20552437007427216, "rewards/reward_func/mean": 0.7430945634841919, "rewards/reward_func/std": 0.20552437007427216, "step": 3231, "step_time": 21.399724923074245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.24140437692403793, "epoch": 0.14969893469198703, "frac_reward_zero_std": 1.0, "grad_norm": 0.00472430232912302, "kl": 0.002421194250928238, "learning_rate": 9.700694766095415e-07, "loss": 0.0001, "num_tokens": 89013662.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3232, "step_time": 13.645048223435879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 177.1875, "completions/mean_terminated_length": 177.1875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.3778412416577339, "epoch": 0.14974525243168132, "frac_reward_zero_std": 1.0, "grad_norm": 0.00442263949662447, "kl": 0.003878927731420845, "learning_rate": 9.700602130616026e-07, "loss": 0.0002, "num_tokens": 89042033.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3233, "step_time": 21.010863408446312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 144.375, "completions/mean_terminated_length": 144.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.30326658487319946, "epoch": 0.14979157017137565, "frac_reward_zero_std": 1.0, "grad_norm": 0.00421948591247201, "kl": 0.0025798884744290262, "learning_rate": 9.700509495136637e-07, "loss": 0.0001, "num_tokens": 89064583.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3234, "step_time": 16.326236341148615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 135.625, "completions/mean_terminated_length": 135.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2790285795927048, "epoch": 0.14983788791106994, "frac_reward_zero_std": 1.0, "grad_norm": 0.002742520533502102, "kl": 0.0020693527185358107, "learning_rate": 9.700416859657249e-07, "loss": 0.0001, "num_tokens": 89087937.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3235, "step_time": 15.345882892608643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.2681538164615631, "epoch": 0.14988420565076424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016844841884449124, "kl": 0.0013607275031972677, "learning_rate": 9.70032422417786e-07, "loss": 0.0001, "num_tokens": 89108703.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3236, "step_time": 16.236670289188623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 176.375, "completions/mean_terminated_length": 176.375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.19332417845726013, "epoch": 0.14993052339045854, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010703495936468244, "kl": 0.0008461621328024194, "learning_rate": 9.700231588698471e-07, "loss": 0.0, "num_tokens": 89130357.0, "reward": 0.3011942207813263, "reward_std": 0.0, "rewards/reward_func/mean": 0.3011942207813263, "rewards/reward_func/std": 0.0, "step": 3237, "step_time": 18.122743774205446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 236.1875, "completions/mean_terminated_length": 236.1875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.21174217388033867, "epoch": 0.14997684113015286, "frac_reward_zero_std": 0.0, "grad_norm": 0.06723204255104065, "kl": 0.0013857937447028235, "learning_rate": 9.700138953219082e-07, "loss": -0.0366, "num_tokens": 89164504.0, "reward": 0.9305884838104248, "reward_std": 0.018509721383452415, "rewards/reward_func/mean": 0.9305884838104248, "rewards/reward_func/std": 0.01850973069667816, "step": 3238, "step_time": 27.427233666181564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 215.5625, "completions/mean_terminated_length": 215.5625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.46943120658397675, "epoch": 0.15002315886984716, "frac_reward_zero_std": 1.0, "grad_norm": 0.004929953720420599, "kl": 0.0032900521182455122, "learning_rate": 9.700046317739694e-07, "loss": 0.0002, "num_tokens": 89191793.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3239, "step_time": 29.97055809572339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 133.875, "completions/mean_terminated_length": 133.875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.26193320006132126, "epoch": 0.15006947660954145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027752660680562258, "kl": 0.0021281801746226847, "learning_rate": 9.699953682260305e-07, "loss": 0.0001, "num_tokens": 89211471.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3240, "step_time": 14.621359758079052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 157.1875, "completions/mean_terminated_length": 157.1875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.36254987865686417, "epoch": 0.15011579434923575, "frac_reward_zero_std": 1.0, "grad_norm": 0.005949254613369703, "kl": 0.0031668933806940913, "learning_rate": 9.699861046780916e-07, "loss": 0.0002, "num_tokens": 89237138.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3241, "step_time": 17.5031932964921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 121.25, "completions/mean_terminated_length": 121.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.29025503993034363, "epoch": 0.15016211208893007, "frac_reward_zero_std": 1.0, "grad_norm": 0.004383976571261883, "kl": 0.002748581231571734, "learning_rate": 9.699768411301527e-07, "loss": 0.0001, "num_tokens": 89256966.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3242, "step_time": 13.214446749538183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 209.625, "completions/mean_terminated_length": 209.625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.3311644718050957, "epoch": 0.15020842982862437, "frac_reward_zero_std": 0.0, "grad_norm": 0.10076361149549484, "kl": 0.008241066941991448, "learning_rate": 9.699675775822139e-07, "loss": 0.0323, "num_tokens": 89292432.0, "reward": 0.7135276794433594, "reward_std": 0.35400888323783875, "rewards/reward_func/mean": 0.7135276794433594, "rewards/reward_func/std": 0.35400888323783875, "step": 3243, "step_time": 26.671080119907856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.37825947999954224, "epoch": 0.15025474756831866, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032125997822731733, "kl": 0.0022422123001888394, "learning_rate": 9.69958314034275e-07, "loss": 0.0001, "num_tokens": 89349246.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3244, "step_time": 24.166430916637182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 138.8125, "completions/mean_terminated_length": 138.8125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.30234841257333755, "epoch": 0.15030106530801296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018751475727185607, "kl": 0.001480356848333031, "learning_rate": 9.699490504863363e-07, "loss": 0.0001, "num_tokens": 89375211.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3245, "step_time": 16.059069741517305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 211.875, "completions/mean_terminated_length": 211.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.3353811204433441, "epoch": 0.15034738304770728, "frac_reward_zero_std": 0.0, "grad_norm": 0.10169153660535812, "kl": 0.00472992321010679, "learning_rate": 9.699397869383974e-07, "loss": -0.0405, "num_tokens": 89405737.0, "reward": 0.3852382004261017, "reward_std": 0.4519440531730652, "rewards/reward_func/mean": 0.3852382004261017, "rewards/reward_func/std": 0.4519440829753876, "step": 3246, "step_time": 22.704330950975418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 129.625, "completions/mean_terminated_length": 129.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3572033792734146, "epoch": 0.15039370078740158, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028394977562129498, "kl": 0.0023871789453551173, "learning_rate": 9.699305233904586e-07, "loss": 0.0001, "num_tokens": 89440691.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3247, "step_time": 17.94020389392972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 152.125, "completions/mean_terminated_length": 152.125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.2701178193092346, "epoch": 0.15044001852709588, "frac_reward_zero_std": 1.0, "grad_norm": 0.001421017455868423, "kl": 0.0012700442457571626, "learning_rate": 9.699212598425197e-07, "loss": 0.0001, "num_tokens": 89461589.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3248, "step_time": 16.179645586758852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 137.375, "completions/mean_terminated_length": 137.375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.22720123454928398, "epoch": 0.15048633626679017, "frac_reward_zero_std": 1.0, "grad_norm": 0.002331890631467104, "kl": 0.0015017892292235047, "learning_rate": 9.699119962945808e-07, "loss": 0.0001, "num_tokens": 89481275.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3249, "step_time": 14.21345742419362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 178.9375, "completions/mean_terminated_length": 178.9375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.12625334411859512, "epoch": 0.1505326540064845, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016706647584214807, "kl": 0.001029629958793521, "learning_rate": 9.69902732746642e-07, "loss": 0.0001, "num_tokens": 89509418.0, "reward": 0.3425188660621643, "reward_std": 0.0, "rewards/reward_func/mean": 0.3425188660621643, "rewards/reward_func/std": 0.0, "step": 3250, "step_time": 18.146559350192547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 131.8125, "completions/mean_terminated_length": 131.8125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.1958547979593277, "epoch": 0.1505789717461788, "frac_reward_zero_std": 1.0, "grad_norm": 0.005485287867486477, "kl": 0.002891829004511237, "learning_rate": 9.69893469198703e-07, "loss": 0.0001, "num_tokens": 89533063.0, "reward": 0.09697196632623672, "reward_std": 0.0, "rewards/reward_func/mean": 0.09697196632623672, "rewards/reward_func/std": 0.0, "step": 3251, "step_time": 15.232710171490908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 143.125, "completions/mean_terminated_length": 143.125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.2940206602215767, "epoch": 0.1506252894858731, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020695466082543135, "kl": 0.0013882204075343907, "learning_rate": 9.698842056507642e-07, "loss": 0.0001, "num_tokens": 89556281.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3252, "step_time": 16.643860213458538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 194.9375, "completions/mean_terminated_length": 194.9375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.34039029479026794, "epoch": 0.15067160722556738, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015669554704800248, "kl": 0.0017213832470588386, "learning_rate": 9.698749421028253e-07, "loss": 0.0001, "num_tokens": 89585144.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3253, "step_time": 20.878741156309843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 129.0625, "completions/mean_terminated_length": 129.0625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.29765769094228745, "epoch": 0.1507179249652617, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023812558501958847, "kl": 0.0018762968538794667, "learning_rate": 9.698656785548864e-07, "loss": 0.0001, "num_tokens": 89613801.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3254, "step_time": 16.61073511093855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 156.9375, "completions/mean_terminated_length": 156.9375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.31266072392463684, "epoch": 0.150764242704956, "frac_reward_zero_std": 1.0, "grad_norm": 0.004924772307276726, "kl": 0.0032038383651524782, "learning_rate": 9.698564150069476e-07, "loss": 0.0002, "num_tokens": 89636600.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3255, "step_time": 18.050747897475958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 194.25, "completions/mean_terminated_length": 194.25, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.3691105544567108, "epoch": 0.1508105604446503, "frac_reward_zero_std": 0.0, "grad_norm": 0.11865503340959549, "kl": 0.011823933571577072, "learning_rate": 9.698471514590087e-07, "loss": -0.073, "num_tokens": 89660844.0, "reward": 0.1875, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.1875, "rewards/reward_func/std": 0.40311288833618164, "step": 3256, "step_time": 21.661863792687654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 144.3125, "completions/mean_terminated_length": 144.3125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.33116084337234497, "epoch": 0.1508568781843446, "frac_reward_zero_std": 1.0, "grad_norm": 0.002101058140397072, "kl": 0.0017146758327726275, "learning_rate": 9.698378879110698e-07, "loss": 0.0001, "num_tokens": 89683569.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3257, "step_time": 15.987027879804373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 164.1875, "completions/mean_terminated_length": 164.1875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3729643002152443, "epoch": 0.15090319592403892, "frac_reward_zero_std": 1.0, "grad_norm": 0.003417692380025983, "kl": 0.0027359239174984396, "learning_rate": 9.698286243631312e-07, "loss": 0.0001, "num_tokens": 89704676.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3258, "step_time": 18.430900812149048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 158.25, "completions/mean_terminated_length": 158.25, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.36683958023786545, "epoch": 0.15094951366373321, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017151250503957272, "kl": 0.0015917146811261773, "learning_rate": 9.698193608151923e-07, "loss": 0.0001, "num_tokens": 89739240.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3259, "step_time": 19.48437863588333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 127.625, "completions/mean_terminated_length": 127.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2848173752427101, "epoch": 0.1509958314034275, "frac_reward_zero_std": 1.0, "grad_norm": 0.004380743019282818, "kl": 0.0024106700730044395, "learning_rate": 9.698100972672532e-07, "loss": 0.0001, "num_tokens": 89761298.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3260, "step_time": 15.09330853447318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 125.125, "completions/mean_terminated_length": 125.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.26965858042240143, "epoch": 0.1510421491431218, "frac_reward_zero_std": 1.0, "grad_norm": 0.00806054100394249, "kl": 0.0024616795708425343, "learning_rate": 9.698008337193143e-07, "loss": 0.0001, "num_tokens": 89780644.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3261, "step_time": 13.78364922106266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 183.0625, "completions/mean_terminated_length": 183.0625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.34466879814863205, "epoch": 0.15108846688281613, "frac_reward_zero_std": 0.0, "grad_norm": 0.1152619943022728, "kl": 0.006931161391548812, "learning_rate": 9.697915701713757e-07, "loss": 0.0458, "num_tokens": 89802357.0, "reward": 0.5879805088043213, "reward_std": 0.47100555896759033, "rewards/reward_func/mean": 0.5879805088043213, "rewards/reward_func/std": 0.47100555896759033, "step": 3262, "step_time": 20.866017419844866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 277.875, "completions/mean_terminated_length": 277.875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 0.2506040036678314, "epoch": 0.15113478462251043, "frac_reward_zero_std": 1.0, "grad_norm": 0.002137587871402502, "kl": 0.0019003933412022889, "learning_rate": 9.697823066234368e-07, "loss": 0.0001, "num_tokens": 89827651.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3263, "step_time": 26.01487050577998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 157.5, "completions/mean_terminated_length": 157.5, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.36395641416311264, "epoch": 0.15118110236220472, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025934490840882063, "kl": 0.0018593795248307288, "learning_rate": 9.69773043075498e-07, "loss": 0.0001, "num_tokens": 89861851.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3264, "step_time": 19.28165887668729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 198.75, "completions/mean_terminated_length": 198.75, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.1678924560546875, "epoch": 0.15122742010189902, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014214112889021635, "kl": 0.0010792575048981234, "learning_rate": 9.69763779527559e-07, "loss": 0.0001, "num_tokens": 89884263.0, "reward": 0.22313016653060913, "reward_std": 0.0, "rewards/reward_func/mean": 0.22313016653060913, "rewards/reward_func/std": 0.0, "step": 3265, "step_time": 20.242999769747257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 220.0625, "completions/mean_terminated_length": 220.0625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.2644739933311939, "epoch": 0.15127373784159334, "frac_reward_zero_std": 0.0, "grad_norm": 0.10686413943767548, "kl": 0.005656387540511787, "learning_rate": 9.697545159796202e-07, "loss": 0.025, "num_tokens": 89909336.0, "reward": 0.8512634634971619, "reward_std": 0.09077872335910797, "rewards/reward_func/mean": 0.8512634634971619, "rewards/reward_func/std": 0.09077871590852737, "step": 3266, "step_time": 21.561144541949034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 130.75, "completions/mean_terminated_length": 130.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.23788397759199142, "epoch": 0.15132005558128764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024266827385872602, "kl": 0.001750052470015362, "learning_rate": 9.697452524316813e-07, "loss": 0.0001, "num_tokens": 89928964.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3267, "step_time": 13.8215871155262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 223.1875, "completions/mean_terminated_length": 223.1875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.15739285573363304, "epoch": 0.15136637332098193, "frac_reward_zero_std": 0.0, "grad_norm": 0.08341161161661148, "kl": 0.0024393564090132713, "learning_rate": 9.697359888837424e-07, "loss": -0.0684, "num_tokens": 89960231.0, "reward": 0.9670228958129883, "reward_std": 0.1319083571434021, "rewards/reward_func/mean": 0.9670228958129883, "rewards/reward_func/std": 0.1319083571434021, "step": 3268, "step_time": 24.692576456815004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 123.1875, "completions/mean_terminated_length": 123.1875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.30562247335910797, "epoch": 0.15141269106067623, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035384935326874256, "kl": 0.0020598951377905905, "learning_rate": 9.697267253358035e-07, "loss": 0.0001, "num_tokens": 89979770.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3269, "step_time": 15.709949240088463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 153.0, "completions/mean_terminated_length": 153.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.15506519004702568, "epoch": 0.15145900880037055, "frac_reward_zero_std": 0.0, "grad_norm": 0.2474597692489624, "kl": 0.03511208947747946, "learning_rate": 9.697174617878647e-07, "loss": -0.0631, "num_tokens": 90000378.0, "reward": 0.9016326665878296, "reward_std": 0.1759648323059082, "rewards/reward_func/mean": 0.9016326665878296, "rewards/reward_func/std": 0.1759648323059082, "step": 3270, "step_time": 15.967582158744335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 118.0625, "completions/mean_terminated_length": 118.0625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3097205013036728, "epoch": 0.15150532654006485, "frac_reward_zero_std": 1.0, "grad_norm": 0.005242596846073866, "kl": 0.002366037108004093, "learning_rate": 9.69708198239926e-07, "loss": 0.0001, "num_tokens": 90024683.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3271, "step_time": 13.729232251644135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 114.875, "completions/mean_terminated_length": 114.875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.30227434635162354, "epoch": 0.15155164427975915, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036587256472557783, "kl": 0.0023751078406348825, "learning_rate": 9.69698934691987e-07, "loss": 0.0001, "num_tokens": 90048393.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3272, "step_time": 13.636955201625824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 186.6875, "completions/mean_terminated_length": 186.6875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.39632876217365265, "epoch": 0.15159796201945344, "frac_reward_zero_std": 1.0, "grad_norm": 0.004219031892716885, "kl": 0.003482925007119775, "learning_rate": 9.69689671144048e-07, "loss": 0.0002, "num_tokens": 90077076.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3273, "step_time": 21.402074065059423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 179.25, "completions/mean_terminated_length": 179.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3429599553346634, "epoch": 0.15164427975914777, "frac_reward_zero_std": 0.0, "grad_norm": 0.10947351902723312, "kl": 0.0089666866697371, "learning_rate": 9.696804075961092e-07, "loss": 0.0115, "num_tokens": 90101240.0, "reward": 0.028005223721265793, "reward_std": 0.025508280843496323, "rewards/reward_func/mean": 0.028005223721265793, "rewards/reward_func/std": 0.025508280843496323, "step": 3274, "step_time": 19.090307485312223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 124.0, "completions/mean_terminated_length": 124.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.33106181770563126, "epoch": 0.15169059749884206, "frac_reward_zero_std": 1.0, "grad_norm": 0.00574125163257122, "kl": 0.0031234038760885596, "learning_rate": 9.696711440481705e-07, "loss": 0.0002, "num_tokens": 90123464.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3275, "step_time": 15.320256788283587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 210.0625, "completions/mean_terminated_length": 210.0625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.2225816547870636, "epoch": 0.15173691523853636, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031187862623482943, "kl": 0.002612276584841311, "learning_rate": 9.696618805002316e-07, "loss": 0.0001, "num_tokens": 90156361.0, "reward": 0.5961628556251526, "reward_std": 0.0, "rewards/reward_func/mean": 0.5961628556251526, "rewards/reward_func/std": 0.0, "step": 3276, "step_time": 22.935048822313547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 130.625, "completions/mean_terminated_length": 130.625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.20432621985673904, "epoch": 0.15178323297823065, "frac_reward_zero_std": 1.0, "grad_norm": 0.00388956256210804, "kl": 0.00211767345899716, "learning_rate": 9.696526169522927e-07, "loss": 0.0001, "num_tokens": 90178755.0, "reward": 0.3678794503211975, "reward_std": 0.0, "rewards/reward_func/mean": 0.3678794503211975, "rewards/reward_func/std": 0.0, "step": 3277, "step_time": 14.698649179190397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 151.125, "completions/mean_terminated_length": 151.125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.2996611073613167, "epoch": 0.15182955071792498, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018953023245558143, "kl": 0.0019482370116747916, "learning_rate": 9.696433534043539e-07, "loss": 0.0001, "num_tokens": 90199861.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3278, "step_time": 16.812793001532555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 260.9375, "completions/mean_terminated_length": 260.9375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.23603739216923714, "epoch": 0.15187586845761927, "frac_reward_zero_std": 0.0, "grad_norm": 0.09444378316402435, "kl": 0.011439332040026784, "learning_rate": 9.69634089856415e-07, "loss": -0.0012, "num_tokens": 90229716.0, "reward": 0.904184103012085, "reward_std": 0.0697307363152504, "rewards/reward_func/mean": 0.904184103012085, "rewards/reward_func/std": 0.0697307288646698, "step": 3279, "step_time": 27.659960947930813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.21466870605945587, "epoch": 0.15192218619731357, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035024394746869802, "kl": 0.001631989172892645, "learning_rate": 9.696248263084761e-07, "loss": 0.0001, "num_tokens": 90249664.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3280, "step_time": 15.02141348272562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 156.25, "completions/mean_terminated_length": 156.25, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.20678676292300224, "epoch": 0.15196850393700786, "frac_reward_zero_std": 1.0, "grad_norm": 0.002799753565341234, "kl": 0.0014430058363359421, "learning_rate": 9.696155627605372e-07, "loss": 0.0001, "num_tokens": 90280148.0, "reward": 0.16957512497901917, "reward_std": 0.0, "rewards/reward_func/mean": 0.16957512497901917, "rewards/reward_func/std": 0.0, "step": 3281, "step_time": 20.30757163465023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 155.1875, "completions/mean_terminated_length": 155.1875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.38377776741981506, "epoch": 0.1520148216767022, "frac_reward_zero_std": 1.0, "grad_norm": 0.0056123570539057255, "kl": 0.0035017322516068816, "learning_rate": 9.696062992125984e-07, "loss": 0.0002, "num_tokens": 90316999.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3282, "step_time": 22.22096023708582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 162.3125, "completions/mean_terminated_length": 162.3125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4039725139737129, "epoch": 0.15206113941639648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027582065667957067, "kl": 0.0021585662325378507, "learning_rate": 9.695970356646595e-07, "loss": 0.0001, "num_tokens": 90344092.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3283, "step_time": 17.36678833886981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 147.5625, "completions/mean_terminated_length": 147.5625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3137865886092186, "epoch": 0.15210745715609078, "frac_reward_zero_std": 1.0, "grad_norm": 0.0040509337559342384, "kl": 0.0022408179938793182, "learning_rate": 9.695877721167206e-07, "loss": 0.0001, "num_tokens": 90365445.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3284, "step_time": 15.23472861200571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.24157676845788956, "epoch": 0.15215377489578508, "frac_reward_zero_std": 0.0, "grad_norm": 0.09288583695888519, "kl": 0.0019232777995057404, "learning_rate": 9.695785085687817e-07, "loss": -0.0029, "num_tokens": 90387793.0, "reward": 0.9493370056152344, "reward_std": 0.013510131277143955, "rewards/reward_func/mean": 0.9493370056152344, "rewards/reward_func/std": 0.013510138727724552, "step": 3285, "step_time": 18.731104020029306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 189.9375, "completions/mean_terminated_length": 189.9375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.3699192479252815, "epoch": 0.1522000926354794, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018917883280664682, "kl": 0.0021559084416367114, "learning_rate": 9.695692450208429e-07, "loss": 0.0001, "num_tokens": 90418464.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3286, "step_time": 21.72200494259596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 156.25, "completions/mean_terminated_length": 156.25, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.17947082966566086, "epoch": 0.1522464103751737, "frac_reward_zero_std": 0.0, "grad_norm": 0.14477550983428955, "kl": 0.0419952217489481, "learning_rate": 9.69559981472904e-07, "loss": -0.0652, "num_tokens": 90440436.0, "reward": 0.8929705619812012, "reward_std": 0.19395191967487335, "rewards/reward_func/mean": 0.8929705619812012, "rewards/reward_func/std": 0.19395191967487335, "step": 3287, "step_time": 16.9317224919796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 152.3125, "completions/mean_terminated_length": 152.3125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.25229161977767944, "epoch": 0.152292728114868, "frac_reward_zero_std": 0.0, "grad_norm": 0.14486804604530334, "kl": 0.0030481035355478525, "learning_rate": 9.695507179249653e-07, "loss": 0.0193, "num_tokens": 90460969.0, "reward": 0.862541675567627, "reward_std": 0.23001109063625336, "rewards/reward_func/mean": 0.862541675567627, "rewards/reward_func/std": 0.23001112043857574, "step": 3288, "step_time": 16.284332536160946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 168.5, "completions/mean_terminated_length": 168.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.370268777012825, "epoch": 0.1523390458545623, "frac_reward_zero_std": 1.0, "grad_norm": 0.008434325456619263, "kl": 0.004034133744426072, "learning_rate": 9.695414543770265e-07, "loss": 0.0002, "num_tokens": 90485393.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3289, "step_time": 19.006528720259666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 175.3125, "completions/mean_terminated_length": 175.3125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.23728742450475693, "epoch": 0.1523853635942566, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019843794871121645, "kl": 0.0016110965516418219, "learning_rate": 9.695321908290876e-07, "loss": 0.0001, "num_tokens": 90507926.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3290, "step_time": 18.091393880546093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 205.875, "completions/mean_terminated_length": 205.875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.3134409636259079, "epoch": 0.1524316813339509, "frac_reward_zero_std": 0.0, "grad_norm": 0.12602339684963226, "kl": 0.007899310905486345, "learning_rate": 9.695229272811485e-07, "loss": 0.0273, "num_tokens": 90530164.0, "reward": 0.9767484664916992, "reward_std": 0.024014079943299294, "rewards/reward_func/mean": 0.9767484664916992, "rewards/reward_func/std": 0.024014081805944443, "step": 3291, "step_time": 21.685513395816088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.43275701254606247, "epoch": 0.1524779990736452, "frac_reward_zero_std": 1.0, "grad_norm": 0.006604908034205437, "kl": 0.003985709452535957, "learning_rate": 9.695136637332098e-07, "loss": 0.0002, "num_tokens": 90568556.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3292, "step_time": 24.167351059615612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 168.5, "completions/mean_terminated_length": 168.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.35880517959594727, "epoch": 0.1525243168133395, "frac_reward_zero_std": 1.0, "grad_norm": 0.0072023323737084866, "kl": 0.004372330091428012, "learning_rate": 9.69504400185271e-07, "loss": 0.0002, "num_tokens": 90618884.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3293, "step_time": 25.151807714253664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 148.5, "completions/mean_terminated_length": 148.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.15271759033203125, "epoch": 0.15257063455303382, "frac_reward_zero_std": 1.0, "grad_norm": 0.001755103818140924, "kl": 0.0013450997066684067, "learning_rate": 9.69495136637332e-07, "loss": 0.0001, "num_tokens": 90649212.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3294, "step_time": 17.78191427141428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 124.0625, "completions/mean_terminated_length": 124.0625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2932748645544052, "epoch": 0.15261695229272812, "frac_reward_zero_std": 1.0, "grad_norm": 0.003972919657826424, "kl": 0.0026993892388418317, "learning_rate": 9.694858730893932e-07, "loss": 0.0001, "num_tokens": 90669405.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3295, "step_time": 14.937408167868853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 171.8125, "completions/mean_terminated_length": 171.8125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.35521989315748215, "epoch": 0.15266327003242242, "frac_reward_zero_std": 1.0, "grad_norm": 0.015062614344060421, "kl": 0.01492274587508291, "learning_rate": 9.694766095414543e-07, "loss": 0.0008, "num_tokens": 90690890.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3296, "step_time": 22.135592482984066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 222.125, "completions/mean_terminated_length": 222.125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3670378103852272, "epoch": 0.1527095877721167, "frac_reward_zero_std": 0.0, "grad_norm": 0.11439114063978195, "kl": 0.0034319759579375386, "learning_rate": 9.694673459935155e-07, "loss": 0.0026, "num_tokens": 90728652.0, "reward": 0.05706879496574402, "reward_std": 0.22827517986297607, "rewards/reward_func/mean": 0.05706879496574402, "rewards/reward_func/std": 0.22827517986297607, "step": 3297, "step_time": 34.143172804266214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 154.0, "completions/mean_terminated_length": 154.0, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.32381830364465714, "epoch": 0.15275590551181104, "frac_reward_zero_std": 1.0, "grad_norm": 0.006391808856278658, "kl": 0.00352169742109254, "learning_rate": 9.694580824455766e-07, "loss": 0.0002, "num_tokens": 90749084.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3298, "step_time": 16.812274515628815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 135.625, "completions/mean_terminated_length": 135.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2904447764158249, "epoch": 0.15280222325150533, "frac_reward_zero_std": 1.0, "grad_norm": 0.005151164252310991, "kl": 0.0025714096846058965, "learning_rate": 9.694488188976377e-07, "loss": 0.0001, "num_tokens": 90771702.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3299, "step_time": 15.857111155986786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 200.625, "completions/mean_terminated_length": 200.625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.19473102688789368, "epoch": 0.15284854099119963, "frac_reward_zero_std": 0.0, "grad_norm": 0.1317998468875885, "kl": 0.00478421151638031, "learning_rate": 9.694395553496988e-07, "loss": -0.0416, "num_tokens": 90795616.0, "reward": 0.09923243522644043, "reward_std": 0.012309839949011803, "rewards/reward_func/mean": 0.09923243522644043, "rewards/reward_func/std": 0.012309840880334377, "step": 3300, "step_time": 20.982359372079372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 128.875, "completions/mean_terminated_length": 128.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.26331282407045364, "epoch": 0.15289485873089392, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022739721462130547, "kl": 0.001985971728572622, "learning_rate": 9.694302918017602e-07, "loss": 0.0001, "num_tokens": 90815454.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3301, "step_time": 14.306706611067057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 145.125, "completions/mean_terminated_length": 145.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.23532408848404884, "epoch": 0.15294117647058825, "frac_reward_zero_std": 1.0, "grad_norm": 0.006874697748571634, "kl": 0.0036359603982418776, "learning_rate": 9.694210282538213e-07, "loss": 0.0002, "num_tokens": 90838928.0, "reward": 0.8633400201797485, "reward_std": 0.0, "rewards/reward_func/mean": 0.8633400201797485, "rewards/reward_func/std": 0.0, "step": 3302, "step_time": 15.767543531954288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 174.875, "completions/mean_terminated_length": 174.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4035678803920746, "epoch": 0.15298749421028254, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030451833736151457, "kl": 0.003345791425090283, "learning_rate": 9.694117647058822e-07, "loss": 0.0002, "num_tokens": 90873550.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3303, "step_time": 20.6306994818151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 123.5, "completions/mean_terminated_length": 123.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.28306350857019424, "epoch": 0.15303381194997684, "frac_reward_zero_std": 1.0, "grad_norm": 0.00297046871855855, "kl": 0.002029210823820904, "learning_rate": 9.694025011579433e-07, "loss": 0.0001, "num_tokens": 90898550.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3304, "step_time": 14.078998416662216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 158.6875, "completions/mean_terminated_length": 158.6875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.377525232732296, "epoch": 0.15308012968967113, "frac_reward_zero_std": 1.0, "grad_norm": 0.00698324479162693, "kl": 0.002947824017610401, "learning_rate": 9.693932376100047e-07, "loss": 0.0001, "num_tokens": 90953521.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3305, "step_time": 24.537496775388718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 126.125, "completions/mean_terminated_length": 126.125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.30835408717393875, "epoch": 0.15312644742936546, "frac_reward_zero_std": 1.0, "grad_norm": 0.009671625681221485, "kl": 0.004571863682940602, "learning_rate": 9.693839740620658e-07, "loss": 0.0002, "num_tokens": 90973779.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3306, "step_time": 13.879542093724012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 124.375, "completions/mean_terminated_length": 124.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.301509328186512, "epoch": 0.15317276516905975, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017688415246084332, "kl": 0.001505364547483623, "learning_rate": 9.69374710514127e-07, "loss": 0.0001, "num_tokens": 91001609.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3307, "step_time": 15.010881319642067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 193.125, "completions/mean_terminated_length": 193.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.3417460024356842, "epoch": 0.15321908290875405, "frac_reward_zero_std": 1.0, "grad_norm": 0.011428617872297764, "kl": 0.006048428127542138, "learning_rate": 9.69365446966188e-07, "loss": 0.0003, "num_tokens": 91028219.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3308, "step_time": 21.835330188274384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 229.9375, "completions/mean_terminated_length": 229.9375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.21667520701885223, "epoch": 0.15326540064844835, "frac_reward_zero_std": 0.0, "grad_norm": 0.07177912443876266, "kl": 0.004559624299872667, "learning_rate": 9.693561834182492e-07, "loss": -0.0447, "num_tokens": 91052026.0, "reward": 0.013111528009176254, "reward_std": 0.004381328821182251, "rewards/reward_func/mean": 0.013111528009176254, "rewards/reward_func/std": 0.004381329286843538, "step": 3309, "step_time": 23.080633092671633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 150.0625, "completions/mean_terminated_length": 150.0625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.1849103607237339, "epoch": 0.15331171838814267, "frac_reward_zero_std": 1.0, "grad_norm": 0.003997376654297113, "kl": 0.0022924855584278703, "learning_rate": 9.693469198703103e-07, "loss": 0.0001, "num_tokens": 91072955.0, "reward": 0.07200431823730469, "reward_std": 0.0, "rewards/reward_func/mean": 0.07200431823730469, "rewards/reward_func/std": 0.0, "step": 3310, "step_time": 16.4729915112257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 209.4375, "completions/mean_terminated_length": 209.4375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.3776468113064766, "epoch": 0.15335803612783697, "frac_reward_zero_std": 0.0, "grad_norm": 0.1089252382516861, "kl": 0.009051568806171417, "learning_rate": 9.693376563223714e-07, "loss": -0.1148, "num_tokens": 91101362.0, "reward": 0.1875, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.1875, "rewards/reward_func/std": 0.40311288833618164, "step": 3311, "step_time": 24.408411759883165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 192.0625, "completions/mean_terminated_length": 192.0625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.40296033024787903, "epoch": 0.15340435386753126, "frac_reward_zero_std": 0.0, "grad_norm": 0.12542779743671417, "kl": 0.006457364303059876, "learning_rate": 9.693283927744325e-07, "loss": -0.0625, "num_tokens": 91122739.0, "reward": 0.2542293667793274, "reward_std": 0.3926730751991272, "rewards/reward_func/mean": 0.2542293667793274, "rewards/reward_func/std": 0.3926730453968048, "step": 3312, "step_time": 20.91408522054553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 183.5, "completions/mean_terminated_length": 183.5, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.11455479264259338, "epoch": 0.15345067160722556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020320117473602295, "kl": 0.0009717029897728935, "learning_rate": 9.693191292264937e-07, "loss": 0.0, "num_tokens": 91152939.0, "reward": 0.951229453086853, "reward_std": 0.0, "rewards/reward_func/mean": 0.951229453086853, "rewards/reward_func/std": 0.0, "step": 3313, "step_time": 19.73755782842636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 241.8125, "completions/mean_terminated_length": 241.8125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.30763477832078934, "epoch": 0.15349698934691988, "frac_reward_zero_std": 0.0, "grad_norm": 0.07847141474485397, "kl": 0.004074239986948669, "learning_rate": 9.693098656785548e-07, "loss": -0.0533, "num_tokens": 91191896.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 3314, "step_time": 28.239569298923016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 186.4375, "completions/mean_terminated_length": 186.4375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3589971140027046, "epoch": 0.15354330708661418, "frac_reward_zero_std": 0.0, "grad_norm": 0.10690788179636002, "kl": 0.00506272166967392, "learning_rate": 9.69300602130616e-07, "loss": -0.0265, "num_tokens": 91213151.0, "reward": 0.7459019422531128, "reward_std": 0.2996750473976135, "rewards/reward_func/mean": 0.7459019422531128, "rewards/reward_func/std": 0.2996750771999359, "step": 3315, "step_time": 21.345596093684435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 139.375, "completions/mean_terminated_length": 139.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.20879939943552017, "epoch": 0.15358962482630847, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035100355744361877, "kl": 0.0022340710274875164, "learning_rate": 9.69291338582677e-07, "loss": 0.0001, "num_tokens": 91232821.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3316, "step_time": 14.858672991394997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 121.9375, "completions/mean_terminated_length": 121.9375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.26160337403416634, "epoch": 0.15363594256600277, "frac_reward_zero_std": 1.0, "grad_norm": 0.002126466017216444, "kl": 0.0017897105135489255, "learning_rate": 9.692820750347382e-07, "loss": 0.0001, "num_tokens": 91256756.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3317, "step_time": 13.929788701236248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 208.125, "completions/mean_terminated_length": 208.125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.3657693639397621, "epoch": 0.1536822603056971, "frac_reward_zero_std": 1.0, "grad_norm": 0.008878033608198166, "kl": 0.0043170658173039556, "learning_rate": 9.692728114867995e-07, "loss": 0.0002, "num_tokens": 91289782.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3318, "step_time": 24.704879105091095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 122.0625, "completions/mean_terminated_length": 122.0625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2652072086930275, "epoch": 0.1537285780453914, "frac_reward_zero_std": 1.0, "grad_norm": 0.001954125240445137, "kl": 0.0016342448652721941, "learning_rate": 9.692635479388606e-07, "loss": 0.0001, "num_tokens": 91309847.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3319, "step_time": 13.324027381837368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 122.5625, "completions/mean_terminated_length": 122.5625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.24706030637025833, "epoch": 0.15377489578508569, "frac_reward_zero_std": 1.0, "grad_norm": 0.005342193879187107, "kl": 0.0023952096817083657, "learning_rate": 9.692542843909217e-07, "loss": 0.0001, "num_tokens": 91332304.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3320, "step_time": 14.820265386253595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 192.3125, "completions/mean_terminated_length": 192.3125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.4223659113049507, "epoch": 0.15382121352477998, "frac_reward_zero_std": 0.0, "grad_norm": 0.09434162825345993, "kl": 0.005028395331464708, "learning_rate": 9.692450208429827e-07, "loss": -0.0106, "num_tokens": 91358965.0, "reward": 0.11413758993148804, "reward_std": 0.31188327074050903, "rewards/reward_func/mean": 0.11413758993148804, "rewards/reward_func/std": 0.31188327074050903, "step": 3321, "step_time": 22.400970544666052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 168.25, "completions/mean_terminated_length": 168.25, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.3815314769744873, "epoch": 0.1538675312644743, "frac_reward_zero_std": 1.0, "grad_norm": 0.002653124975040555, "kl": 0.002448896935675293, "learning_rate": 9.69235757295044e-07, "loss": 0.0001, "num_tokens": 91393145.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3322, "step_time": 20.7161478176713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 164.4375, "completions/mean_terminated_length": 164.4375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.18568189442157745, "epoch": 0.1539138490041686, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013142157113179564, "kl": 0.0011784150556195527, "learning_rate": 9.692264937471051e-07, "loss": 0.0001, "num_tokens": 91419232.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 3323, "step_time": 17.862348187714815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3101409822702408, "epoch": 0.1539601667438629, "frac_reward_zero_std": 1.0, "grad_norm": 0.006444879807531834, "kl": 0.003332747903186828, "learning_rate": 9.692172301991662e-07, "loss": 0.0002, "num_tokens": 91455528.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3324, "step_time": 19.23028664290905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 163.25, "completions/mean_terminated_length": 163.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.36022551357746124, "epoch": 0.1540064844835572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030863964930176735, "kl": 0.002290126052685082, "learning_rate": 9.692079666512274e-07, "loss": 0.0001, "num_tokens": 91478396.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3325, "step_time": 17.516544554382563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 135.5, "completions/mean_terminated_length": 135.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3505780100822449, "epoch": 0.15405280222325152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032824836671352386, "kl": 0.002519895788282156, "learning_rate": 9.691987031032885e-07, "loss": 0.0001, "num_tokens": 91502740.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3326, "step_time": 15.37037943303585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 158.3125, "completions/mean_terminated_length": 158.3125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.20433121547102928, "epoch": 0.1540991199629458, "frac_reward_zero_std": 0.0, "grad_norm": 0.11700944602489471, "kl": 0.0021943735773675144, "learning_rate": 9.691894395553496e-07, "loss": -0.0051, "num_tokens": 91530889.0, "reward": 0.24393409490585327, "reward_std": 0.036680784076452255, "rewards/reward_func/mean": 0.24393409490585327, "rewards/reward_func/std": 0.036680784076452255, "step": 3327, "step_time": 18.378038570284843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 107.375, "completions/mean_terminated_length": 107.375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2943278029561043, "epoch": 0.1541454377026401, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014756934251636267, "kl": 0.0014031844912096858, "learning_rate": 9.691801760074107e-07, "loss": 0.0001, "num_tokens": 91551919.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3328, "step_time": 12.465799763798714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.15385310351848602, "epoch": 0.1541917554423344, "frac_reward_zero_std": 0.0, "grad_norm": 0.08098611235618591, "kl": 0.0012751940521411598, "learning_rate": 9.691709124594719e-07, "loss": -0.0005, "num_tokens": 91583403.0, "reward": 0.9881047010421753, "reward_std": 0.03250421583652496, "rewards/reward_func/mean": 0.9881047010421753, "rewards/reward_func/std": 0.03250420466065407, "step": 3329, "step_time": 19.58171332255006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 157.3125, "completions/mean_terminated_length": 157.3125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.32820216566324234, "epoch": 0.15423807318202873, "frac_reward_zero_std": 1.0, "grad_norm": 0.007466540671885014, "kl": 0.005166945746168494, "learning_rate": 9.69161648911533e-07, "loss": 0.0003, "num_tokens": 91606672.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3330, "step_time": 17.079447399824858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 184.625, "completions/mean_terminated_length": 184.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3910089060664177, "epoch": 0.15428439092172302, "frac_reward_zero_std": 0.0, "grad_norm": 0.16182798147201538, "kl": 0.010804095072671771, "learning_rate": 9.691523853635941e-07, "loss": -0.0352, "num_tokens": 91636666.0, "reward": 0.05592745915055275, "reward_std": 0.223709836602211, "rewards/reward_func/mean": 0.05592745915055275, "rewards/reward_func/std": 0.223709836602211, "step": 3331, "step_time": 22.886167015880346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 134.5625, "completions/mean_terminated_length": 134.5625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3120100200176239, "epoch": 0.15433070866141732, "frac_reward_zero_std": 1.0, "grad_norm": 0.002287524752318859, "kl": 0.002078368212096393, "learning_rate": 9.691431218156555e-07, "loss": 0.0001, "num_tokens": 91656643.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3332, "step_time": 14.359949983656406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 158.0625, "completions/mean_terminated_length": 158.0625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.257740281522274, "epoch": 0.15437702640111162, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015757882501929998, "kl": 0.001483880274463445, "learning_rate": 9.691338582677166e-07, "loss": 0.0001, "num_tokens": 91691124.0, "reward": 0.8824968934059143, "reward_std": 0.0, "rewards/reward_func/mean": 0.8824968934059143, "rewards/reward_func/std": 0.0, "step": 3333, "step_time": 21.722475323826075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 144.1875, "completions/mean_terminated_length": 144.1875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.24944373592734337, "epoch": 0.15442334414080594, "frac_reward_zero_std": 1.0, "grad_norm": 0.005230751354247332, "kl": 0.0025434315903112292, "learning_rate": 9.691245947197775e-07, "loss": 0.0001, "num_tokens": 91710839.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3334, "step_time": 18.03704598918557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 191.6875, "completions/mean_terminated_length": 191.6875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.21327754482626915, "epoch": 0.15446966188050024, "frac_reward_zero_std": 0.0, "grad_norm": 0.1796390861272812, "kl": 0.006803740747272968, "learning_rate": 9.691153311718388e-07, "loss": -0.0324, "num_tokens": 91735986.0, "reward": 0.03723296523094177, "reward_std": 0.00694130826741457, "rewards/reward_func/mean": 0.03723296523094177, "rewards/reward_func/std": 0.0069413078017532825, "step": 3335, "step_time": 19.53165503963828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 151.0, "completions/mean_terminated_length": 151.0, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.25464953109622, "epoch": 0.15451597962019453, "frac_reward_zero_std": 1.0, "grad_norm": 0.003500354941934347, "kl": 0.002710888395085931, "learning_rate": 9.691060676239e-07, "loss": 0.0001, "num_tokens": 91755970.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3336, "step_time": 15.854786850512028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 136.0625, "completions/mean_terminated_length": 136.0625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.31221120059490204, "epoch": 0.15456229735988883, "frac_reward_zero_std": 1.0, "grad_norm": 0.001925988937728107, "kl": 0.001990779914194718, "learning_rate": 9.69096804075961e-07, "loss": 0.0001, "num_tokens": 91780451.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3337, "step_time": 15.589721571654081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 152.1875, "completions/mean_terminated_length": 152.1875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.23425745218992233, "epoch": 0.15460861509958315, "frac_reward_zero_std": 0.0, "grad_norm": 0.172028586268425, "kl": 0.00620538042858243, "learning_rate": 9.690875405280222e-07, "loss": -0.0734, "num_tokens": 91806550.0, "reward": 0.8298039436340332, "reward_std": 0.24870947003364563, "rewards/reward_func/mean": 0.8298039436340332, "rewards/reward_func/std": 0.24870947003364563, "step": 3338, "step_time": 17.276704136282206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 177.875, "completions/mean_terminated_length": 177.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.2940092608332634, "epoch": 0.15465493283927745, "frac_reward_zero_std": 0.0, "grad_norm": 0.11028830707073212, "kl": 0.008754837443120778, "learning_rate": 9.690782769800833e-07, "loss": -0.0168, "num_tokens": 91828004.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 3339, "step_time": 23.81776424124837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 154.75, "completions/mean_terminated_length": 154.75, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3867867439985275, "epoch": 0.15470125057897174, "frac_reward_zero_std": 1.0, "grad_norm": 0.004322863183915615, "kl": 0.0028193810721859336, "learning_rate": 9.690690134321445e-07, "loss": 0.0001, "num_tokens": 91876096.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3340, "step_time": 22.542425610125065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 168.0, "completions/mean_terminated_length": 168.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.17907460778951645, "epoch": 0.15474756831866604, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033703304361552, "kl": 0.0020329627150204033, "learning_rate": 9.690597498842056e-07, "loss": 0.0001, "num_tokens": 91899808.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3341, "step_time": 16.980496268719435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 160.8125, "completions/mean_terminated_length": 160.8125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.36107001453638077, "epoch": 0.15479388605836036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028498759493231773, "kl": 0.0025219633826054633, "learning_rate": 9.690504863362667e-07, "loss": 0.0001, "num_tokens": 91947789.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3342, "step_time": 23.340466152876616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 193.0625, "completions/mean_terminated_length": 193.0625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.35160113871097565, "epoch": 0.15484020379805466, "frac_reward_zero_std": 0.0, "grad_norm": 0.1251196712255478, "kl": 0.0075583066791296005, "learning_rate": 9.690412227883278e-07, "loss": 0.0265, "num_tokens": 91980734.0, "reward": 0.6875, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.6875, "rewards/reward_func/std": 0.4787135720252991, "step": 3343, "step_time": 23.79976823925972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 187.5625, "completions/mean_terminated_length": 187.5625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.14454735442996025, "epoch": 0.15488652153774896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022063462529331446, "kl": 0.001651369733735919, "learning_rate": 9.69031959240389e-07, "loss": 0.0001, "num_tokens": 92018247.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3344, "step_time": 21.65063364431262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 137.25, "completions/mean_terminated_length": 137.25, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2657321095466614, "epoch": 0.15493283927744325, "frac_reward_zero_std": 1.0, "grad_norm": 0.007140035275369883, "kl": 0.003754403966013342, "learning_rate": 9.690226956924503e-07, "loss": 0.0002, "num_tokens": 92038075.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3345, "step_time": 14.567701142281294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 143.125, "completions/mean_terminated_length": 143.125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.1490999199450016, "epoch": 0.15497915701713758, "frac_reward_zero_std": 0.0, "grad_norm": 0.1289130598306656, "kl": 0.0032111339969560504, "learning_rate": 9.690134321445112e-07, "loss": -0.0107, "num_tokens": 92064477.0, "reward": 0.007286164443939924, "reward_std": 0.005187314469367266, "rewards/reward_func/mean": 0.007286164443939924, "rewards/reward_func/std": 0.005187314003705978, "step": 3346, "step_time": 19.316949263215065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 145.9375, "completions/mean_terminated_length": 145.9375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.31513993442058563, "epoch": 0.15502547475683187, "frac_reward_zero_std": 1.0, "grad_norm": 0.01598193123936653, "kl": 0.005524818319827318, "learning_rate": 9.690041685965723e-07, "loss": 0.0003, "num_tokens": 92087068.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3347, "step_time": 16.297446753829718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 191.3125, "completions/mean_terminated_length": 191.3125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.14111372828483582, "epoch": 0.15507179249652617, "frac_reward_zero_std": 1.0, "grad_norm": 0.004341846331954002, "kl": 0.0023714601702522486, "learning_rate": 9.689949050486337e-07, "loss": 0.0001, "num_tokens": 92108785.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3348, "step_time": 18.504158172756433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 140.375, "completions/mean_terminated_length": 140.375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3126991540193558, "epoch": 0.15511811023622046, "frac_reward_zero_std": 1.0, "grad_norm": 0.004649960435926914, "kl": 0.0031018926529213786, "learning_rate": 9.689856415006948e-07, "loss": 0.0002, "num_tokens": 92133815.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3349, "step_time": 16.528464261442423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 216.6875, "completions/mean_terminated_length": 216.6875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.3769342973828316, "epoch": 0.1551644279759148, "frac_reward_zero_std": 0.0, "grad_norm": 0.11284460872411728, "kl": 0.013480915687978268, "learning_rate": 9.68976377952756e-07, "loss": -0.0919, "num_tokens": 92164290.0, "reward": 0.0625, "reward_std": 0.25, "rewards/reward_func/mean": 0.0625, "rewards/reward_func/std": 0.25, "step": 3350, "step_time": 26.06608099862933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 113.4375, "completions/mean_terminated_length": 113.4375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.26655734330415726, "epoch": 0.15521074571560908, "frac_reward_zero_std": 1.0, "grad_norm": 0.002856267848983407, "kl": 0.0017867960850708187, "learning_rate": 9.68967114404817e-07, "loss": 0.0001, "num_tokens": 92183737.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3351, "step_time": 12.166086815297604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 218.125, "completions/mean_terminated_length": 218.125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3527267277240753, "epoch": 0.15525706345530338, "frac_reward_zero_std": 0.0, "grad_norm": 0.1178298145532608, "kl": 0.021648069377988577, "learning_rate": 9.689578508568782e-07, "loss": -0.1421, "num_tokens": 92208651.0, "reward": 0.3111305236816406, "reward_std": 0.3643587827682495, "rewards/reward_func/mean": 0.3111305236816406, "rewards/reward_func/std": 0.3643587529659271, "step": 3352, "step_time": 25.933844342827797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 199.8125, "completions/mean_terminated_length": 199.8125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.3849535584449768, "epoch": 0.15530338119499768, "frac_reward_zero_std": 0.0, "grad_norm": 0.10025057196617126, "kl": 0.004247734148520976, "learning_rate": 9.689485873089393e-07, "loss": -0.0031, "num_tokens": 92236280.0, "reward": 0.10031722486019135, "reward_std": 0.06985194981098175, "rewards/reward_func/mean": 0.10031722486019135, "rewards/reward_func/std": 0.06985194981098175, "step": 3353, "step_time": 22.336097571998835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 190.0625, "completions/mean_terminated_length": 190.0625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.21971788629889488, "epoch": 0.155349698934692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032453506719321012, "kl": 0.0023439573124051094, "learning_rate": 9.689393237610004e-07, "loss": 0.0001, "num_tokens": 92273417.0, "reward": 0.9067110419273376, "reward_std": 0.0, "rewards/reward_func/mean": 0.9067110419273376, "rewards/reward_func/std": 0.0, "step": 3354, "step_time": 21.85452165454626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 172.4375, "completions/mean_terminated_length": 172.4375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.23878859728574753, "epoch": 0.1553960166743863, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037063290365040302, "kl": 0.003366717486642301, "learning_rate": 9.689300602130615e-07, "loss": 0.0002, "num_tokens": 92298768.0, "reward": 0.7187313437461853, "reward_std": 0.0, "rewards/reward_func/mean": 0.7187313437461853, "rewards/reward_func/std": 0.0, "step": 3355, "step_time": 20.31452167034149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 153.0, "completions/mean_terminated_length": 153.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.24774175509810448, "epoch": 0.1554423344140806, "frac_reward_zero_std": 1.0, "grad_norm": 0.015400012023746967, "kl": 0.019052762538194656, "learning_rate": 9.689207966651227e-07, "loss": 0.0009, "num_tokens": 92319456.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3356, "step_time": 16.624549858272076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 170.6875, "completions/mean_terminated_length": 170.6875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.22818206995725632, "epoch": 0.1554886521537749, "frac_reward_zero_std": 0.0, "grad_norm": 0.18956658244132996, "kl": 0.011238167993724346, "learning_rate": 9.689115331171838e-07, "loss": 0.1641, "num_tokens": 92341003.0, "reward": 0.8600144386291504, "reward_std": 0.1000090166926384, "rewards/reward_func/mean": 0.8600144386291504, "rewards/reward_func/std": 0.100009024143219, "step": 3357, "step_time": 22.292827654629946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 120.875, "completions/mean_terminated_length": 120.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.26849503442645073, "epoch": 0.1555349698934692, "frac_reward_zero_std": 1.0, "grad_norm": 0.002744281431660056, "kl": 0.002132963534677401, "learning_rate": 9.68902269569245e-07, "loss": 0.0001, "num_tokens": 92362841.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3358, "step_time": 13.914867967367172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 119.75, "completions/mean_terminated_length": 119.75, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.227818064391613, "epoch": 0.1555812876331635, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036486228927969933, "kl": 0.002492701372830197, "learning_rate": 9.68893006021306e-07, "loss": 0.0001, "num_tokens": 92382037.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3359, "step_time": 13.963378340005875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.3525514081120491, "epoch": 0.1556276053728578, "frac_reward_zero_std": 0.0, "grad_norm": 0.09065493196249008, "kl": 0.00845847011078149, "learning_rate": 9.688837424733672e-07, "loss": 0.0064, "num_tokens": 92406925.0, "reward": 0.13488443195819855, "reward_std": 0.10790754109621048, "rewards/reward_func/mean": 0.13488443195819855, "rewards/reward_func/std": 0.10790754854679108, "step": 3360, "step_time": 21.681909650564194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 208.625, "completions/mean_terminated_length": 208.625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.2752373516559601, "epoch": 0.1556739231125521, "frac_reward_zero_std": 0.0, "grad_norm": 0.11485565453767776, "kl": 0.01365563995204866, "learning_rate": 9.688744789254283e-07, "loss": -0.082, "num_tokens": 92429991.0, "reward": 0.3644893765449524, "reward_std": 0.2041807323694229, "rewards/reward_func/mean": 0.3644893765449524, "rewards/reward_func/std": 0.2041807323694229, "step": 3361, "step_time": 25.30739837139845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2848980873823166, "epoch": 0.15572024085224642, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031788686756044626, "kl": 0.0022560322540812194, "learning_rate": 9.688652153774896e-07, "loss": 0.0001, "num_tokens": 92453291.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3362, "step_time": 14.625582505017519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 124.125, "completions/mean_terminated_length": 124.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.279193215072155, "epoch": 0.15576655859194072, "frac_reward_zero_std": 1.0, "grad_norm": 0.003246487583965063, "kl": 0.002011729695368558, "learning_rate": 9.688559518295508e-07, "loss": 0.0001, "num_tokens": 92472845.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3363, "step_time": 13.783062070608139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 142.125, "completions/mean_terminated_length": 142.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.17071790620684624, "epoch": 0.15581287633163501, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028183585964143276, "kl": 0.0019790363148786128, "learning_rate": 9.688466882816119e-07, "loss": 0.0001, "num_tokens": 92501391.0, "reward": 0.9459594488143921, "reward_std": 0.0, "rewards/reward_func/mean": 0.9459594488143921, "rewards/reward_func/std": 0.0, "step": 3364, "step_time": 17.18678993731737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 136.625, "completions/mean_terminated_length": 136.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.29390769824385643, "epoch": 0.1558591940713293, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022444038186222315, "kl": 0.0019568908028304577, "learning_rate": 9.68837424733673e-07, "loss": 0.0001, "num_tokens": 92523801.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3365, "step_time": 16.590839076787233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 199.1875, "completions/mean_terminated_length": 199.1875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.3531302735209465, "epoch": 0.15590551181102363, "frac_reward_zero_std": 1.0, "grad_norm": 0.007046143990010023, "kl": 0.005330355372279882, "learning_rate": 9.688281611857341e-07, "loss": 0.0003, "num_tokens": 92547644.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3366, "step_time": 20.67684406414628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 207.8125, "completions/mean_terminated_length": 207.8125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.17195185646414757, "epoch": 0.15595182955071793, "frac_reward_zero_std": 0.0, "grad_norm": 0.11639894545078278, "kl": 0.0011754246370401233, "learning_rate": 9.688188976377953e-07, "loss": 0.0158, "num_tokens": 92592265.0, "reward": 0.8732266426086426, "reward_std": 0.0017810834106057882, "rewards/reward_func/mean": 0.8732266426086426, "rewards/reward_func/std": 0.0017810744466260076, "step": 3367, "step_time": 25.887058943510056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 161.5625, "completions/mean_terminated_length": 161.5625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.1910524144768715, "epoch": 0.15599814729041223, "frac_reward_zero_std": 1.0, "grad_norm": 0.002463321899995208, "kl": 0.0017776959284674376, "learning_rate": 9.688096340898564e-07, "loss": 0.0001, "num_tokens": 92614322.0, "reward": 0.951229453086853, "reward_std": 0.0, "rewards/reward_func/mean": 0.951229453086853, "rewards/reward_func/std": 0.0, "step": 3368, "step_time": 16.187005519866943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 140.3125, "completions/mean_terminated_length": 140.3125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.16739752888679504, "epoch": 0.15604446503010652, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015062983147799969, "kl": 0.0015503333561355248, "learning_rate": 9.688003705419175e-07, "loss": 0.0001, "num_tokens": 92635127.0, "reward": 0.894839346408844, "reward_std": 0.0, "rewards/reward_func/mean": 0.894839346408844, "rewards/reward_func/std": 0.0, "step": 3369, "step_time": 15.248216327279806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 114.25, "completions/mean_terminated_length": 114.25, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2350492924451828, "epoch": 0.15609078276980085, "frac_reward_zero_std": 1.0, "grad_norm": 0.003463833360001445, "kl": 0.0016782145539764315, "learning_rate": 9.687911069939786e-07, "loss": 0.0001, "num_tokens": 92654411.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3370, "step_time": 12.283231306821108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 121.125, "completions/mean_terminated_length": 121.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.22284315899014473, "epoch": 0.15613710050949514, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016951520228758454, "kl": 0.0011517916864249855, "learning_rate": 9.687818434460398e-07, "loss": 0.0001, "num_tokens": 92675917.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3371, "step_time": 13.068330138921738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 135.6875, "completions/mean_terminated_length": 135.6875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4059174954891205, "epoch": 0.15618341824918944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024354124907404184, "kl": 0.0021466552861966193, "learning_rate": 9.687725798981009e-07, "loss": 0.0001, "num_tokens": 92718568.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3372, "step_time": 20.215550310909748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 157.625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.2768791392445564, "epoch": 0.15622973598888373, "frac_reward_zero_std": 1.0, "grad_norm": 0.00909844134002924, "kl": 0.010500472038984299, "learning_rate": 9.68763316350162e-07, "loss": 0.0005, "num_tokens": 92744754.0, "reward": 0.11603700369596481, "reward_std": 0.0, "rewards/reward_func/mean": 0.11603700369596481, "rewards/reward_func/std": 0.0, "step": 3373, "step_time": 18.666076958179474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 169.1875, "completions/mean_terminated_length": 169.1875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.3718565031886101, "epoch": 0.15627605372857806, "frac_reward_zero_std": 1.0, "grad_norm": 0.00507784727960825, "kl": 0.0032935781637206674, "learning_rate": 9.687540528022231e-07, "loss": 0.0002, "num_tokens": 92772725.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3374, "step_time": 18.568583589047194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 126.375, "completions/mean_terminated_length": 126.375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.29444124549627304, "epoch": 0.15632237146827235, "frac_reward_zero_std": 1.0, "grad_norm": 0.004532559309154749, "kl": 0.00210694590350613, "learning_rate": 9.687447892542845e-07, "loss": 0.0001, "num_tokens": 92796075.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3375, "step_time": 14.34216309338808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 222.875, "completions/mean_terminated_length": 222.875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.3261827826499939, "epoch": 0.15636868920796665, "frac_reward_zero_std": 0.0, "grad_norm": 0.07907669246196747, "kl": 0.006121080252341926, "learning_rate": 9.687355257063456e-07, "loss": 0.0348, "num_tokens": 92824473.0, "reward": 0.7797904014587402, "reward_std": 0.23554649949073792, "rewards/reward_func/mean": 0.7797904014587402, "rewards/reward_func/std": 0.23554649949073792, "step": 3376, "step_time": 24.10123337060213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2709270790219307, "epoch": 0.15641500694766095, "frac_reward_zero_std": 1.0, "grad_norm": 0.011515576392412186, "kl": 0.00539400038542226, "learning_rate": 9.687262621584065e-07, "loss": 0.0003, "num_tokens": 92844013.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3377, "step_time": 14.51206111907959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 189.5, "completions/mean_terminated_length": 189.5, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.1876688115298748, "epoch": 0.15646132468735527, "frac_reward_zero_std": 1.0, "grad_norm": 0.001437133178114891, "kl": 0.0011671070824377239, "learning_rate": 9.687169986104678e-07, "loss": 0.0001, "num_tokens": 92876693.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3378, "step_time": 22.443644117563963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 172.875, "completions/mean_terminated_length": 172.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.36684370785951614, "epoch": 0.15650764242704956, "frac_reward_zero_std": 0.0, "grad_norm": 0.14057622849941254, "kl": 0.010422457475215197, "learning_rate": 9.68707735062529e-07, "loss": 0.0244, "num_tokens": 92900035.0, "reward": 0.8276056051254272, "reward_std": 0.32421788573265076, "rewards/reward_func/mean": 0.8276056051254272, "rewards/reward_func/std": 0.32421791553497314, "step": 3379, "step_time": 20.422037471085787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 207.8125, "completions/mean_terminated_length": 207.8125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.3677055314183235, "epoch": 0.15655396016674386, "frac_reward_zero_std": 0.0, "grad_norm": 0.10390904545783997, "kl": 0.0069690506206825376, "learning_rate": 9.6869847151459e-07, "loss": -0.0244, "num_tokens": 92937488.0, "reward": 0.7645001411437988, "reward_std": 0.3923640847206116, "rewards/reward_func/mean": 0.7645001411437988, "rewards/reward_func/std": 0.39236411452293396, "step": 3380, "step_time": 24.310496348887682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 175.4375, "completions/mean_terminated_length": 175.4375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.37994876503944397, "epoch": 0.15660027790643816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014101024717092514, "kl": 0.0016320306458510458, "learning_rate": 9.686892079666512e-07, "loss": 0.0001, "num_tokens": 92968039.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3381, "step_time": 20.314433723688126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 149.1875, "completions/mean_terminated_length": 149.1875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.42657849192619324, "epoch": 0.15664659564613248, "frac_reward_zero_std": 1.0, "grad_norm": 0.002053258242085576, "kl": 0.002325157751329243, "learning_rate": 9.686799444187123e-07, "loss": 0.0001, "num_tokens": 93022938.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3382, "step_time": 25.133915796875954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 164.9375, "completions/mean_terminated_length": 164.9375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.23933277279138565, "epoch": 0.15669291338582678, "frac_reward_zero_std": 0.0, "grad_norm": 0.10873263329267502, "kl": 0.005963696981780231, "learning_rate": 9.686706808707735e-07, "loss": 0.0141, "num_tokens": 93055401.0, "reward": 0.8300195336341858, "reward_std": 0.0028690295293927193, "rewards/reward_func/mean": 0.8300195336341858, "rewards/reward_func/std": 0.0028690400067716837, "step": 3383, "step_time": 19.60477663949132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 151.0, "completions/mean_terminated_length": 151.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3595099225640297, "epoch": 0.15673923112552107, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030787335708737373, "kl": 0.0030947758932597935, "learning_rate": 9.686614173228346e-07, "loss": 0.0002, "num_tokens": 93104105.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3384, "step_time": 23.159548055380583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 178.875, "completions/mean_terminated_length": 178.875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.4038897082209587, "epoch": 0.15678554886521537, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031293348874896765, "kl": 0.002823414048179984, "learning_rate": 9.686521537748957e-07, "loss": 0.0001, "num_tokens": 93156199.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3385, "step_time": 27.05347828567028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 204.0, "completions/mean_terminated_length": 204.0, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.44841163605451584, "epoch": 0.1568318666049097, "frac_reward_zero_std": 1.0, "grad_norm": 0.004759833682328463, "kl": 0.004427422536537051, "learning_rate": 9.686428902269568e-07, "loss": 0.0002, "num_tokens": 93190359.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3386, "step_time": 23.632544446736574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 152.125, "completions/mean_terminated_length": 152.125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3723982274532318, "epoch": 0.156878184344604, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029600670095533133, "kl": 0.0033330745063722134, "learning_rate": 9.68633626679018e-07, "loss": 0.0002, "num_tokens": 93245225.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3387, "step_time": 25.62436442449689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 214.1875, "completions/mean_terminated_length": 214.1875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.4489590525627136, "epoch": 0.15692450208429828, "frac_reward_zero_std": 0.0, "grad_norm": 0.11331822723150253, "kl": 0.00677445693872869, "learning_rate": 9.686243631310793e-07, "loss": 0.0583, "num_tokens": 93267500.0, "reward": 0.6875, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.6875, "rewards/reward_func/std": 0.4787135720252991, "step": 3388, "step_time": 22.353625752031803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 158.75, "completions/mean_terminated_length": 158.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3952978104352951, "epoch": 0.15697081982399258, "frac_reward_zero_std": 1.0, "grad_norm": 0.005898735020309687, "kl": 0.00412388431141153, "learning_rate": 9.686150995831402e-07, "loss": 0.0002, "num_tokens": 93321672.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3389, "step_time": 24.38019158691168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 140.375, "completions/mean_terminated_length": 140.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.22559742256999016, "epoch": 0.1570171375636869, "frac_reward_zero_std": 1.0, "grad_norm": 0.005241250153630972, "kl": 0.0028257886588107795, "learning_rate": 9.686058360352013e-07, "loss": 0.0001, "num_tokens": 93341342.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3390, "step_time": 14.31430471688509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 131.5, "completions/mean_terminated_length": 131.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.3970692455768585, "epoch": 0.1570634553033812, "frac_reward_zero_std": 1.0, "grad_norm": 0.001841334393247962, "kl": 0.0016890050610527396, "learning_rate": 9.685965724872625e-07, "loss": 0.0001, "num_tokens": 93371558.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3391, "step_time": 16.866381518542767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 175.3125, "completions/mean_terminated_length": 175.3125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.18258456885814667, "epoch": 0.1571097730430755, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019344153115525842, "kl": 0.0013503170921467245, "learning_rate": 9.685873089393238e-07, "loss": 0.0001, "num_tokens": 93398379.0, "reward": 0.3545035123825073, "reward_std": 0.0, "rewards/reward_func/mean": 0.3545035123825073, "rewards/reward_func/std": 0.0, "step": 3392, "step_time": 18.63642694428563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 230.1875, "completions/mean_terminated_length": 230.1875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.4675423130393028, "epoch": 0.1571560907827698, "frac_reward_zero_std": 1.0, "grad_norm": 0.004736447241157293, "kl": 0.004607197945006192, "learning_rate": 9.68578045391385e-07, "loss": 0.0002, "num_tokens": 93429726.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3393, "step_time": 27.96315587684512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 189.9375, "completions/mean_terminated_length": 189.9375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.3540070205926895, "epoch": 0.15720240852246412, "frac_reward_zero_std": 0.0, "grad_norm": 0.12353695929050446, "kl": 0.009493994759395719, "learning_rate": 9.68568781843446e-07, "loss": -0.142, "num_tokens": 93455453.0, "reward": 0.028723783791065216, "reward_std": 0.0784883201122284, "rewards/reward_func/mean": 0.028723783791065216, "rewards/reward_func/std": 0.0784883201122284, "step": 3394, "step_time": 26.026387214660645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 132.5, "completions/mean_terminated_length": 132.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3240819647908211, "epoch": 0.1572487262621584, "frac_reward_zero_std": 1.0, "grad_norm": 0.002312229946255684, "kl": 0.0019831361423712224, "learning_rate": 9.685595182955072e-07, "loss": 0.0001, "num_tokens": 93476197.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3395, "step_time": 15.30780778080225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 220.0, "completions/mean_terminated_length": 220.0, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.20126070827245712, "epoch": 0.1572950440018527, "frac_reward_zero_std": 0.0, "grad_norm": 0.08587631583213806, "kl": 0.004368889727629721, "learning_rate": 9.685502547475683e-07, "loss": 0.0104, "num_tokens": 93515397.0, "reward": 0.996159553527832, "reward_std": 0.015361929312348366, "rewards/reward_func/mean": 0.996159553527832, "rewards/reward_func/std": 0.015361934900283813, "step": 3396, "step_time": 29.787671122699976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 179.5625, "completions/mean_terminated_length": 179.5625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.26427941769361496, "epoch": 0.157341361741547, "frac_reward_zero_std": 1.0, "grad_norm": 0.009840679354965687, "kl": 0.004796617780812085, "learning_rate": 9.685409911996294e-07, "loss": 0.0002, "num_tokens": 93543454.0, "reward": 0.7598356604576111, "reward_std": 0.0, "rewards/reward_func/mean": 0.7598356604576111, "rewards/reward_func/std": 0.0, "step": 3397, "step_time": 21.67714450880885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 129.1875, "completions/mean_terminated_length": 129.1875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.28759458661079407, "epoch": 0.15738767948124133, "frac_reward_zero_std": 1.0, "grad_norm": 0.002306500216946006, "kl": 0.0020671751408372074, "learning_rate": 9.685317276516905e-07, "loss": 0.0001, "num_tokens": 93564065.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3398, "step_time": 14.972420245409012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 214.625, "completions/mean_terminated_length": 214.625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.24402768164873123, "epoch": 0.15743399722093562, "frac_reward_zero_std": 0.0, "grad_norm": 0.09991750866174698, "kl": 0.01453024661168456, "learning_rate": 9.685224641037517e-07, "loss": -0.2018, "num_tokens": 93587323.0, "reward": 0.5235850811004639, "reward_std": 0.4175490736961365, "rewards/reward_func/mean": 0.5235850811004639, "rewards/reward_func/std": 0.41754910349845886, "step": 3399, "step_time": 23.29434657841921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 191.5, "completions/mean_terminated_length": 191.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.2214699350297451, "epoch": 0.15748031496062992, "frac_reward_zero_std": 1.0, "grad_norm": 0.007350526284426451, "kl": 0.021009589545428753, "learning_rate": 9.685132005558128e-07, "loss": 0.001, "num_tokens": 93614323.0, "reward": 0.8071032166481018, "reward_std": 0.0, "rewards/reward_func/mean": 0.8071032166481018, "rewards/reward_func/std": 0.0, "step": 3400, "step_time": 19.339279111474752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 172.625, "completions/mean_terminated_length": 172.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.20853421837091446, "epoch": 0.15752663270032422, "frac_reward_zero_std": 1.0, "grad_norm": 0.007097158581018448, "kl": 0.006123285507783294, "learning_rate": 9.68503937007874e-07, "loss": 0.0003, "num_tokens": 93649309.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3401, "step_time": 21.030549950897694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 118.6875, "completions/mean_terminated_length": 118.6875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.24639840051531792, "epoch": 0.15757295044001854, "frac_reward_zero_std": 1.0, "grad_norm": 0.006378231104463339, "kl": 0.003113148733973503, "learning_rate": 9.68494673459935e-07, "loss": 0.0002, "num_tokens": 93668776.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3402, "step_time": 13.50205684453249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 183.8125, "completions/mean_terminated_length": 183.8125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.393598273396492, "epoch": 0.15761926817971283, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025114002637565136, "kl": 0.0025301979621872306, "learning_rate": 9.684854099119962e-07, "loss": 0.0001, "num_tokens": 93696293.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3403, "step_time": 20.72625645622611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 222.5625, "completions/mean_terminated_length": 222.5625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.356412336230278, "epoch": 0.15766558591940713, "frac_reward_zero_std": 0.0, "grad_norm": 0.12194860726594925, "kl": 0.012284463155083358, "learning_rate": 9.684761463640573e-07, "loss": -0.1091, "num_tokens": 93734302.0, "reward": 0.25, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.44721361994743347, "step": 3404, "step_time": 28.05081032589078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 188.5625, "completions/mean_terminated_length": 188.5625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3936387225985527, "epoch": 0.15771190365910143, "frac_reward_zero_std": 0.0, "grad_norm": 0.1348794847726822, "kl": 0.007617619470693171, "learning_rate": 9.684668828161186e-07, "loss": 0.0477, "num_tokens": 93758359.0, "reward": 0.34241390228271484, "reward_std": 0.4565494656562805, "rewards/reward_func/mean": 0.34241390228271484, "rewards/reward_func/std": 0.4565494656562805, "step": 3405, "step_time": 22.871876165270805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 146.9375, "completions/mean_terminated_length": 146.9375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.18180694431066513, "epoch": 0.15775822139879575, "frac_reward_zero_std": 1.0, "grad_norm": 0.004647223278880119, "kl": 0.00461869384162128, "learning_rate": 9.684576192681798e-07, "loss": 0.0002, "num_tokens": 93780614.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 3406, "step_time": 17.274998400360346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 133.75, "completions/mean_terminated_length": 133.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.29551438987255096, "epoch": 0.15780453913849005, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020865807309746742, "kl": 0.001857791270595044, "learning_rate": 9.684483557202409e-07, "loss": 0.0001, "num_tokens": 93809538.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3407, "step_time": 16.46083966270089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 217.6875, "completions/mean_terminated_length": 217.6875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.15124435350298882, "epoch": 0.15785085687818434, "frac_reward_zero_std": 1.0, "grad_norm": 0.00468370970338583, "kl": 0.0036985211190767586, "learning_rate": 9.68439092172302e-07, "loss": 0.0002, "num_tokens": 93835053.0, "reward": 0.8970033526420593, "reward_std": 0.0, "rewards/reward_func/mean": 0.8970033526420593, "rewards/reward_func/std": 0.0, "step": 3408, "step_time": 21.989070676267147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 196.0, "completions/mean_terminated_length": 196.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.19563382118940353, "epoch": 0.15789717461787864, "frac_reward_zero_std": 0.0, "grad_norm": 0.14159588515758514, "kl": 0.008461256831651554, "learning_rate": 9.684298286243631e-07, "loss": -0.0181, "num_tokens": 93856813.0, "reward": 0.5811158418655396, "reward_std": 0.2563861012458801, "rewards/reward_func/mean": 0.5811158418655396, "rewards/reward_func/std": 0.2563861012458801, "step": 3409, "step_time": 18.760935347527266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 177.75, "completions/mean_terminated_length": 177.75, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.22291510179638863, "epoch": 0.15794349235757296, "frac_reward_zero_std": 1.0, "grad_norm": 0.003395486855879426, "kl": 0.0027317614876665175, "learning_rate": 9.684205650764243e-07, "loss": 0.0001, "num_tokens": 93890249.0, "reward": 0.8970773816108704, "reward_std": 0.0, "rewards/reward_func/mean": 0.8970773816108704, "rewards/reward_func/std": 0.0, "step": 3410, "step_time": 20.72183408588171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 215.0625, "completions/mean_terminated_length": 215.0625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.32530076056718826, "epoch": 0.15798981009726726, "frac_reward_zero_std": 0.0, "grad_norm": 0.08521021157503128, "kl": 0.008857256500050426, "learning_rate": 9.684113015284854e-07, "loss": 0.0267, "num_tokens": 93918442.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 3411, "step_time": 24.42685490101576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 127.9375, "completions/mean_terminated_length": 127.9375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.27352775633335114, "epoch": 0.15803612783696155, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023881150409579277, "kl": 0.0020404542156029493, "learning_rate": 9.684020379805465e-07, "loss": 0.0001, "num_tokens": 93940073.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3412, "step_time": 13.613922379910946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 143.75, "completions/mean_terminated_length": 143.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3109496012330055, "epoch": 0.15808244557665585, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030934831593185663, "kl": 0.0021100371377542615, "learning_rate": 9.683927744326076e-07, "loss": 0.0001, "num_tokens": 93963029.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3413, "step_time": 15.253663532435894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 199.125, "completions/mean_terminated_length": 199.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.38036343455314636, "epoch": 0.15812876331635017, "frac_reward_zero_std": 0.0, "grad_norm": 0.09645237028598785, "kl": 0.006299379514530301, "learning_rate": 9.683835108846688e-07, "loss": 0.0145, "num_tokens": 93984919.0, "reward": 0.3125, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.3125, "rewards/reward_func/std": 0.4787135720252991, "step": 3414, "step_time": 20.447531413286924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 124.125, "completions/mean_terminated_length": 124.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.30600807815790176, "epoch": 0.15817508105604447, "frac_reward_zero_std": 1.0, "grad_norm": 0.006566811352968216, "kl": 0.0029020439833402634, "learning_rate": 9.683742473367299e-07, "loss": 0.0001, "num_tokens": 94006569.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3415, "step_time": 14.760872717946768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 306.8125, "completions/mean_terminated_length": 306.8125, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.3685658276081085, "epoch": 0.15822139879573877, "frac_reward_zero_std": 0.0, "grad_norm": 0.06619012355804443, "kl": 0.0029626351897604764, "learning_rate": 9.68364983788791e-07, "loss": 0.079, "num_tokens": 94043254.0, "reward": 0.25, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.44721361994743347, "step": 3416, "step_time": 33.7754629291594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 164.75, "completions/mean_terminated_length": 164.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.37912197411060333, "epoch": 0.15826771653543306, "frac_reward_zero_std": 0.0, "grad_norm": 0.1322353631258011, "kl": 0.008519368944689631, "learning_rate": 9.683557202408521e-07, "loss": -0.0144, "num_tokens": 94065906.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 3417, "step_time": 20.565210532397032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 132.75, "completions/mean_terminated_length": 132.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.32533005625009537, "epoch": 0.15831403427512739, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031542836222797632, "kl": 0.002352894749492407, "learning_rate": 9.683464566929135e-07, "loss": 0.0001, "num_tokens": 94101870.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3418, "step_time": 17.876115828752518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 214.0, "completions/mean_terminated_length": 214.0, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.25735868141055107, "epoch": 0.15836035201482168, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012005128664895892, "kl": 0.0012013968371320516, "learning_rate": 9.683371931449746e-07, "loss": 0.0001, "num_tokens": 94140622.0, "reward": 0.5623413324356079, "reward_std": 0.0, "rewards/reward_func/mean": 0.5623413324356079, "rewards/reward_func/std": 0.0, "step": 3419, "step_time": 25.61725616827607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 178.0625, "completions/mean_terminated_length": 178.0625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3643127679824829, "epoch": 0.15840666975451598, "frac_reward_zero_std": 1.0, "grad_norm": 0.003521176055073738, "kl": 0.0029144013533368707, "learning_rate": 9.683279295970355e-07, "loss": 0.0001, "num_tokens": 94162527.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3420, "step_time": 20.675210751593113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 267.9375, "completions/mean_terminated_length": 267.9375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "entropy": 0.23120523989200592, "epoch": 0.15845298749421027, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038808188401162624, "kl": 0.002891894255299121, "learning_rate": 9.683186660490966e-07, "loss": 0.0001, "num_tokens": 94195630.0, "reward": 0.7480222582817078, "reward_std": 0.0, "rewards/reward_func/mean": 0.7480222582817078, "rewards/reward_func/std": 0.0, "step": 3421, "step_time": 27.111752171069384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4212314486503601, "epoch": 0.1584993052339046, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036072733346372843, "kl": 0.0031521882046945393, "learning_rate": 9.68309402501158e-07, "loss": 0.0002, "num_tokens": 94216844.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3422, "step_time": 18.432662308216095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 244.5625, "completions/mean_terminated_length": 244.5625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.24329178407788277, "epoch": 0.1585456229735989, "frac_reward_zero_std": 1.0, "grad_norm": 0.008524253964424133, "kl": 0.00909702992066741, "learning_rate": 9.68300138953219e-07, "loss": 0.0005, "num_tokens": 94254085.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3423, "step_time": 25.48354296386242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 209.0625, "completions/mean_terminated_length": 209.0625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.2374827116727829, "epoch": 0.1585919407132932, "frac_reward_zero_std": 1.0, "grad_norm": 0.008499586954712868, "kl": 0.0075718306470662355, "learning_rate": 9.682908754052802e-07, "loss": 0.0004, "num_tokens": 94283974.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3424, "step_time": 22.72313467785716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 133.4375, "completions/mean_terminated_length": 133.4375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2880973070859909, "epoch": 0.15863825845298749, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020167441107332706, "kl": 0.0015973700792528689, "learning_rate": 9.682816118573413e-07, "loss": 0.0001, "num_tokens": 94311341.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3425, "step_time": 15.675929341465235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 227.0, "completions/mean_terminated_length": 227.0, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.2634507790207863, "epoch": 0.1586845761926818, "frac_reward_zero_std": 1.0, "grad_norm": 0.007204752415418625, "kl": 0.006101812236011028, "learning_rate": 9.682723483094025e-07, "loss": 0.0003, "num_tokens": 94336733.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3426, "step_time": 23.877611380070448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 165.8125, "completions/mean_terminated_length": 165.8125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.16471055895090103, "epoch": 0.1587308939323761, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032392528373748064, "kl": 0.0022068023099564016, "learning_rate": 9.682630847614636e-07, "loss": 0.0001, "num_tokens": 94370186.0, "reward": 0.8781879544258118, "reward_std": 0.0, "rewards/reward_func/mean": 0.8781879544258118, "rewards/reward_func/std": 0.0, "step": 3427, "step_time": 19.545306116342545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 207.625, "completions/mean_terminated_length": 207.625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.40462759882211685, "epoch": 0.1587772116720704, "frac_reward_zero_std": 0.0, "grad_norm": 0.10641691088676453, "kl": 0.013477440923452377, "learning_rate": 9.682538212135247e-07, "loss": -0.1892, "num_tokens": 94394916.0, "reward": 0.23951296508312225, "reward_std": 0.4300934672355652, "rewards/reward_func/mean": 0.23951296508312225, "rewards/reward_func/std": 0.4300934970378876, "step": 3428, "step_time": 26.73448269441724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 126.5625, "completions/mean_terminated_length": 126.5625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2731931433081627, "epoch": 0.1588235294117647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023880538064986467, "kl": 0.0017022228857968003, "learning_rate": 9.682445576655858e-07, "loss": 0.0001, "num_tokens": 94414557.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3429, "step_time": 13.645759027451277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 174.5625, "completions/mean_terminated_length": 174.5625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.33578938245773315, "epoch": 0.15886984715145902, "frac_reward_zero_std": 1.0, "grad_norm": 0.002920533297583461, "kl": 0.001993319718167186, "learning_rate": 9.68235294117647e-07, "loss": 0.0001, "num_tokens": 94445430.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3430, "step_time": 21.873732414096594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 176.5625, "completions/mean_terminated_length": 176.5625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.3507569134235382, "epoch": 0.15891616489115332, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032085312996059656, "kl": 0.0025718085817061365, "learning_rate": 9.68226030569708e-07, "loss": 0.0001, "num_tokens": 94466815.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3431, "step_time": 19.05837071686983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 161.375, "completions/mean_terminated_length": 161.375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.30730706453323364, "epoch": 0.1589624826308476, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038303835317492485, "kl": 0.0025260396650992334, "learning_rate": 9.682167670217692e-07, "loss": 0.0001, "num_tokens": 94486949.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3432, "step_time": 16.480333171784878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 133.6875, "completions/mean_terminated_length": 133.6875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.22230102494359016, "epoch": 0.1590088003705419, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017625931650400162, "kl": 0.0012499088479671627, "learning_rate": 9.682075034738303e-07, "loss": 0.0001, "num_tokens": 94506496.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3433, "step_time": 14.038519285619259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 205.25, "completions/mean_terminated_length": 205.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.4865777716040611, "epoch": 0.15905511811023623, "frac_reward_zero_std": 1.0, "grad_norm": 0.003165813395753503, "kl": 0.0032938916701823473, "learning_rate": 9.681982399258915e-07, "loss": 0.0002, "num_tokens": 94531844.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3434, "step_time": 24.92224333807826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 169.9375, "completions/mean_terminated_length": 169.9375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.4473801478743553, "epoch": 0.15910143584993053, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019321624422445893, "kl": 0.002116112707881257, "learning_rate": 9.681889763779528e-07, "loss": 0.0001, "num_tokens": 94560931.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3435, "step_time": 21.872719943523407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 132.75, "completions/mean_terminated_length": 132.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.24387311190366745, "epoch": 0.15914775358962482, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038844021037220955, "kl": 0.002167000056942925, "learning_rate": 9.68179712830014e-07, "loss": 0.0001, "num_tokens": 94580335.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3436, "step_time": 14.557889740914106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 163.875, "completions/mean_terminated_length": 163.875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.33780235052108765, "epoch": 0.15919407132931912, "frac_reward_zero_std": 1.0, "grad_norm": 0.008975336328148842, "kl": 0.007652397267520428, "learning_rate": 9.68170449282075e-07, "loss": 0.0004, "num_tokens": 94603469.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3437, "step_time": 18.062211614102125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 175.0, "completions/mean_terminated_length": 175.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.26001911610364914, "epoch": 0.15924038906901344, "frac_reward_zero_std": 0.0, "grad_norm": 0.1583942174911499, "kl": 0.00648200698196888, "learning_rate": 9.681611857341362e-07, "loss": 0.0605, "num_tokens": 94624125.0, "reward": 0.7865698337554932, "reward_std": 0.2204297035932541, "rewards/reward_func/mean": 0.7865698337554932, "rewards/reward_func/std": 0.2204297035932541, "step": 3438, "step_time": 22.236902624368668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 170.8125, "completions/mean_terminated_length": 170.8125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.23996953666210175, "epoch": 0.15928670680870774, "frac_reward_zero_std": 0.0, "grad_norm": 0.1449446976184845, "kl": 0.010345470858737826, "learning_rate": 9.681519221861973e-07, "loss": -0.0544, "num_tokens": 94661098.0, "reward": 0.609000027179718, "reward_std": 0.4912189245223999, "rewards/reward_func/mean": 0.609000027179718, "rewards/reward_func/std": 0.4912189245223999, "step": 3439, "step_time": 21.725098561495543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 122.6875, "completions/mean_terminated_length": 122.6875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.23615119233727455, "epoch": 0.15933302454840204, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022867852821946144, "kl": 0.0015611594135407358, "learning_rate": 9.681426586382584e-07, "loss": 0.0001, "num_tokens": 94684325.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3440, "step_time": 13.927519869059324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 168.5, "completions/mean_terminated_length": 168.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3941641077399254, "epoch": 0.15937934228809633, "frac_reward_zero_std": 1.0, "grad_norm": 0.005300555378198624, "kl": 0.004834897932596505, "learning_rate": 9.681333950903196e-07, "loss": 0.0002, "num_tokens": 94709981.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3441, "step_time": 18.493968956172466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 257.375, "completions/mean_terminated_length": 257.375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.23197712376713753, "epoch": 0.15942566002779066, "frac_reward_zero_std": 0.0, "grad_norm": 0.0854479968547821, "kl": 0.011729596881195903, "learning_rate": 9.681241315423807e-07, "loss": -0.0347, "num_tokens": 94745571.0, "reward": 0.5779284238815308, "reward_std": 0.14207902550697327, "rewards/reward_func/mean": 0.5779284238815308, "rewards/reward_func/std": 0.14207902550697327, "step": 3442, "step_time": 31.97803706303239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 152.375, "completions/mean_terminated_length": 152.375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.20799680799245834, "epoch": 0.15947197776748495, "frac_reward_zero_std": 1.0, "grad_norm": 0.002170357620343566, "kl": 0.0017289408133365214, "learning_rate": 9.681148679944418e-07, "loss": 0.0001, "num_tokens": 94780617.0, "reward": 0.9394130706787109, "reward_std": 0.0, "rewards/reward_func/mean": 0.9394130706787109, "rewards/reward_func/std": 0.0, "step": 3443, "step_time": 20.633345916867256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 184.0625, "completions/mean_terminated_length": 184.0625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.40174292773008347, "epoch": 0.15951829550717925, "frac_reward_zero_std": 1.0, "grad_norm": 0.006572690326720476, "kl": 0.003529248700942844, "learning_rate": 9.68105604446503e-07, "loss": 0.0002, "num_tokens": 94810666.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3444, "step_time": 21.925837852060795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 212.375, "completions/mean_terminated_length": 212.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.413627065718174, "epoch": 0.15956461324687354, "frac_reward_zero_std": 0.0, "grad_norm": 0.005545541178435087, "kl": 0.006141959456726909, "learning_rate": 9.68096340898564e-07, "loss": -0.0017, "num_tokens": 94848768.0, "reward": 5.986431119708868e-07, "reward_std": 1.6358043239961262e-06, "rewards/reward_func/mean": 5.986431119708868e-07, "rewards/reward_func/std": 1.635804437682964e-06, "step": 3445, "step_time": 28.424156863242388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 167.25, "completions/mean_terminated_length": 167.25, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.38636625558137894, "epoch": 0.15961093098656787, "frac_reward_zero_std": 1.0, "grad_norm": 0.005689219571650028, "kl": 0.004969300061929971, "learning_rate": 9.680870773506252e-07, "loss": 0.0003, "num_tokens": 94870532.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3446, "step_time": 18.343412913382053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 142.9375, "completions/mean_terminated_length": 142.9375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.20394393801689148, "epoch": 0.15965724872626216, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015257818158715963, "kl": 0.001183865446364507, "learning_rate": 9.680778138026863e-07, "loss": 0.0001, "num_tokens": 94890515.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3447, "step_time": 14.80742610245943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 194.5625, "completions/mean_terminated_length": 194.5625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.34553437680006027, "epoch": 0.15970356646595646, "frac_reward_zero_std": 0.0, "grad_norm": 0.0065522403456270695, "kl": 0.005666043201927096, "learning_rate": 9.680685502547476e-07, "loss": 0.0004, "num_tokens": 94917564.0, "reward": 1.0834193062692066e-06, "reward_std": 5.212616542848991e-07, "rewards/reward_func/mean": 1.0834193062692066e-06, "rewards/reward_func/std": 5.212616542848991e-07, "step": 3448, "step_time": 20.05315352603793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 163.9375, "completions/mean_terminated_length": 163.9375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.362993448972702, "epoch": 0.15974988420565076, "frac_reward_zero_std": 1.0, "grad_norm": 0.005834056995809078, "kl": 0.004242094000801444, "learning_rate": 9.680592867068088e-07, "loss": 0.0002, "num_tokens": 94939003.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3449, "step_time": 18.971878744661808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 206.3125, "completions/mean_terminated_length": 206.3125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.24320535734295845, "epoch": 0.15979620194534508, "frac_reward_zero_std": 0.0, "grad_norm": 0.10007061809301376, "kl": 0.005496683996170759, "learning_rate": 9.680500231588699e-07, "loss": 0.0052, "num_tokens": 94964368.0, "reward": 0.9650192260742188, "reward_std": 0.020858503878116608, "rewards/reward_func/mean": 0.9650192260742188, "rewards/reward_func/std": 0.020858513191342354, "step": 3450, "step_time": 25.901068847626448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 132.75, "completions/mean_terminated_length": 132.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.23371541500091553, "epoch": 0.15984251968503937, "frac_reward_zero_std": 1.0, "grad_norm": 0.002019039588049054, "kl": 0.0014723509084433317, "learning_rate": 9.680407596109308e-07, "loss": 0.0001, "num_tokens": 94984268.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3451, "step_time": 14.849624052643776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 172.1875, "completions/mean_terminated_length": 172.1875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.4102185070514679, "epoch": 0.15988883742473367, "frac_reward_zero_std": 1.0, "grad_norm": 0.003499179147183895, "kl": 0.002860469976440072, "learning_rate": 9.680314960629921e-07, "loss": 0.0001, "num_tokens": 95010095.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3452, "step_time": 18.349357716739178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 174.125, "completions/mean_terminated_length": 174.125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.34826090186834335, "epoch": 0.15993515516442797, "frac_reward_zero_std": 1.0, "grad_norm": 0.002932294737547636, "kl": 0.002209923230111599, "learning_rate": 9.680222325150533e-07, "loss": 0.0001, "num_tokens": 95041121.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3453, "step_time": 20.477946385741234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 179.0, "completions/mean_terminated_length": 179.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.4032389596104622, "epoch": 0.1599814729041223, "frac_reward_zero_std": 1.0, "grad_norm": 0.004911939613521099, "kl": 0.004299458349123597, "learning_rate": 9.680129689671144e-07, "loss": 0.0002, "num_tokens": 95069233.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3454, "step_time": 20.019405771046877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 147.8125, "completions/mean_terminated_length": 147.8125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2919314131140709, "epoch": 0.1600277906438166, "frac_reward_zero_std": 1.0, "grad_norm": 0.002293745754286647, "kl": 0.0022720624110661447, "learning_rate": 9.680037054191755e-07, "loss": 0.0001, "num_tokens": 95090942.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3455, "step_time": 17.30674758180976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 143.5625, "completions/mean_terminated_length": 143.5625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.23198441788554192, "epoch": 0.16007410838351088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027736607007682323, "kl": 0.0017384123057126999, "learning_rate": 9.679944418712366e-07, "loss": 0.0001, "num_tokens": 95111351.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3456, "step_time": 16.20187959820032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 271.0625, "completions/mean_terminated_length": 271.0625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.27159250155091286, "epoch": 0.16012042612320518, "frac_reward_zero_std": 0.0, "grad_norm": 0.09969770163297653, "kl": 0.011295750504359603, "learning_rate": 9.679851783232978e-07, "loss": -0.3148, "num_tokens": 95139416.0, "reward": 0.39337778091430664, "reward_std": 0.4508124887943268, "rewards/reward_func/mean": 0.39337778091430664, "rewards/reward_func/std": 0.45081251859664917, "step": 3457, "step_time": 33.91886493563652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 140.125, "completions/mean_terminated_length": 140.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.1908768080174923, "epoch": 0.1601667438628995, "frac_reward_zero_std": 1.0, "grad_norm": 0.002794044790789485, "kl": 0.002192254352848977, "learning_rate": 9.679759147753589e-07, "loss": 0.0001, "num_tokens": 95161722.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3458, "step_time": 16.333599999547005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 143.9375, "completions/mean_terminated_length": 143.9375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.29709840565919876, "epoch": 0.1602130616025938, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025707187596708536, "kl": 0.001648180972551927, "learning_rate": 9.6796665122742e-07, "loss": 0.0001, "num_tokens": 95184073.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3459, "step_time": 15.60405432805419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 131.4375, "completions/mean_terminated_length": 131.4375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2899218425154686, "epoch": 0.1602593793422881, "frac_reward_zero_std": 1.0, "grad_norm": 0.004910370334982872, "kl": 0.002971170237287879, "learning_rate": 9.679573876794811e-07, "loss": 0.0001, "num_tokens": 95207440.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3460, "step_time": 15.209514487534761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 134.25, "completions/mean_terminated_length": 134.25, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.30879829823970795, "epoch": 0.1603056970819824, "frac_reward_zero_std": 1.0, "grad_norm": 0.002629196736961603, "kl": 0.0021158503368496895, "learning_rate": 9.679481241315423e-07, "loss": 0.0001, "num_tokens": 95233124.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3461, "step_time": 16.275724075734615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 174.6875, "completions/mean_terminated_length": 174.6875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.1905820220708847, "epoch": 0.1603520148216767, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032742188777774572, "kl": 0.0061743218684569, "learning_rate": 9.679388605836036e-07, "loss": 0.0003, "num_tokens": 95254527.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3462, "step_time": 17.634504687041044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 192.8125, "completions/mean_terminated_length": 192.8125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.2610615938901901, "epoch": 0.160398332561371, "frac_reward_zero_std": 0.0, "grad_norm": 0.22092193365097046, "kl": 0.006980406120419502, "learning_rate": 9.679295970356645e-07, "loss": -0.07, "num_tokens": 95303628.0, "reward": 0.0852164477109909, "reward_std": 0.05081327259540558, "rewards/reward_func/mean": 0.0852164477109909, "rewards/reward_func/std": 0.05081327632069588, "step": 3463, "step_time": 27.251349058002234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 151.0, "completions/mean_terminated_length": 151.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.19659963622689247, "epoch": 0.1604446503010653, "frac_reward_zero_std": 0.0, "grad_norm": 0.15603183209896088, "kl": 0.010294223902747035, "learning_rate": 9.679203334877256e-07, "loss": -0.0192, "num_tokens": 95324828.0, "reward": 0.9464435577392578, "reward_std": 0.014281725510954857, "rewards/reward_func/mean": 0.9464435577392578, "rewards/reward_func/std": 0.014281720854341984, "step": 3464, "step_time": 16.60369373112917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 273.3125, "completions/mean_terminated_length": 273.3125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.2528546415269375, "epoch": 0.1604909680407596, "frac_reward_zero_std": 0.0, "grad_norm": 0.09785427898168564, "kl": 0.003852492547594011, "learning_rate": 9.67911069939787e-07, "loss": 0.0099, "num_tokens": 95360001.0, "reward": 0.1862429976463318, "reward_std": 0.049664802849292755, "rewards/reward_func/mean": 0.1862429976463318, "rewards/reward_func/std": 0.04966479912400246, "step": 3465, "step_time": 29.23867255076766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 199.0625, "completions/mean_terminated_length": 199.0625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.3757583796977997, "epoch": 0.16053728578045393, "frac_reward_zero_std": 0.0, "grad_norm": 0.13698962330818176, "kl": 0.015001054853200912, "learning_rate": 9.67901806391848e-07, "loss": -0.0277, "num_tokens": 95392098.0, "reward": 0.2977655529975891, "reward_std": 0.40756502747535706, "rewards/reward_func/mean": 0.2977655529975891, "rewards/reward_func/std": 0.40756499767303467, "step": 3466, "step_time": 21.701540529727936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 178.9375, "completions/mean_terminated_length": 178.9375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.38655295222997665, "epoch": 0.16058360352014822, "frac_reward_zero_std": 1.0, "grad_norm": 0.00909447856247425, "kl": 0.0066490627359598875, "learning_rate": 9.678925428439092e-07, "loss": 0.0003, "num_tokens": 95413793.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3467, "step_time": 19.39699961617589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 132.4375, "completions/mean_terminated_length": 132.4375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.29878050088882446, "epoch": 0.16062992125984252, "frac_reward_zero_std": 1.0, "grad_norm": 0.004993073642253876, "kl": 0.00306686176918447, "learning_rate": 9.678832792959704e-07, "loss": 0.0002, "num_tokens": 95436760.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3468, "step_time": 15.505547307431698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 127.8125, "completions/mean_terminated_length": 127.8125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3205405920743942, "epoch": 0.1606762389995368, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023301353212445974, "kl": 0.001960799883818254, "learning_rate": 9.678740157480315e-07, "loss": 0.0001, "num_tokens": 95462165.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3469, "step_time": 15.969914954155684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 165.3125, "completions/mean_terminated_length": 165.3125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.4200323596596718, "epoch": 0.16072255673923114, "frac_reward_zero_std": 1.0, "grad_norm": 0.003014979185536504, "kl": 0.0024219228071160614, "learning_rate": 9.678647522000926e-07, "loss": 0.0001, "num_tokens": 95507978.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3470, "step_time": 22.58441649377346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 111.375, "completions/mean_terminated_length": 111.375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.2844608575105667, "epoch": 0.16076887447892543, "frac_reward_zero_std": 1.0, "grad_norm": 0.003143490757793188, "kl": 0.0021750289015471935, "learning_rate": 9.678554886521537e-07, "loss": 0.0001, "num_tokens": 95527488.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3471, "step_time": 12.80592230707407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 174.375, "completions/mean_terminated_length": 174.375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.2442374974489212, "epoch": 0.16081519221861973, "frac_reward_zero_std": 1.0, "grad_norm": 0.021279064938426018, "kl": 0.012040883302688599, "learning_rate": 9.678462251042148e-07, "loss": 0.0006, "num_tokens": 95548198.0, "reward": 0.5986744165420532, "reward_std": 0.0, "rewards/reward_func/mean": 0.5986744165420532, "rewards/reward_func/std": 0.0, "step": 3472, "step_time": 18.40150808915496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 163.25, "completions/mean_terminated_length": 163.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.412839911878109, "epoch": 0.16086150995831403, "frac_reward_zero_std": 1.0, "grad_norm": 0.007059311494231224, "kl": 0.004065240442287177, "learning_rate": 9.67836961556276e-07, "loss": 0.0002, "num_tokens": 95569018.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3473, "step_time": 17.311843056231737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 132.5625, "completions/mean_terminated_length": 132.5625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.27657292783260345, "epoch": 0.16090782769800835, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030245569068938494, "kl": 0.0019606098940130323, "learning_rate": 9.67827698008337e-07, "loss": 0.0001, "num_tokens": 95588675.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3474, "step_time": 14.221028413623571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 202.8125, "completions/mean_terminated_length": 202.8125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.3112871088087559, "epoch": 0.16095414543770264, "frac_reward_zero_std": 0.0, "grad_norm": 0.10125451534986496, "kl": 0.017683086451143026, "learning_rate": 9.678184344603982e-07, "loss": -0.0175, "num_tokens": 95617088.0, "reward": 0.5819142460823059, "reward_std": 0.4655313789844513, "rewards/reward_func/mean": 0.5819142460823059, "rewards/reward_func/std": 0.4655313789844513, "step": 3475, "step_time": 21.86097837984562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 128.9375, "completions/mean_terminated_length": 128.9375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2914179116487503, "epoch": 0.16100046317739694, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024098509456962347, "kl": 0.001557468785904348, "learning_rate": 9.678091709124593e-07, "loss": 0.0001, "num_tokens": 95638015.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3476, "step_time": 13.901871923357248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 154.125, "completions/mean_terminated_length": 154.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.4389551281929016, "epoch": 0.16104678091709124, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016640285030007362, "kl": 0.0019833649857901037, "learning_rate": 9.677999073645205e-07, "loss": 0.0001, "num_tokens": 95681729.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3477, "step_time": 21.65598686784506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 165.8125, "completions/mean_terminated_length": 165.8125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.23619088158011436, "epoch": 0.16109309865678556, "frac_reward_zero_std": 1.0, "grad_norm": 0.006988085340708494, "kl": 0.004499021626543254, "learning_rate": 9.677906438165818e-07, "loss": 0.0002, "num_tokens": 95702462.0, "reward": 0.29761362075805664, "reward_std": 0.0, "rewards/reward_func/mean": 0.29761362075805664, "rewards/reward_func/std": 0.0, "step": 3478, "step_time": 17.2142390049994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 130.25, "completions/mean_terminated_length": 130.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.29452070593833923, "epoch": 0.16113941639647986, "frac_reward_zero_std": 1.0, "grad_norm": 0.002671242458745837, "kl": 0.0019010700343642384, "learning_rate": 9.67781380268643e-07, "loss": 0.0001, "num_tokens": 95722978.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3479, "step_time": 15.805702719837427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 175.875, "completions/mean_terminated_length": 175.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.33927278220653534, "epoch": 0.16118573413617415, "frac_reward_zero_std": 0.0, "grad_norm": 0.12174668908119202, "kl": 0.011155621381476521, "learning_rate": 9.67772116720704e-07, "loss": -0.0707, "num_tokens": 95746160.0, "reward": 0.04252343252301216, "reward_std": 0.17009373009204865, "rewards/reward_func/mean": 0.04252343252301216, "rewards/reward_func/std": 0.17009373009204865, "step": 3480, "step_time": 20.51972856372595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 197.0, "completions/mean_terminated_length": 197.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.23060177639126778, "epoch": 0.16123205187586845, "frac_reward_zero_std": 1.0, "grad_norm": 0.014210606925189495, "kl": 0.0080897071165964, "learning_rate": 9.67762853172765e-07, "loss": 0.0004, "num_tokens": 95768144.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3481, "step_time": 18.474943548440933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 192.3125, "completions/mean_terminated_length": 192.3125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.2537849582731724, "epoch": 0.16127836961556277, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038933195173740387, "kl": 0.003093552717473358, "learning_rate": 9.677535896248263e-07, "loss": 0.0002, "num_tokens": 95805557.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3482, "step_time": 23.168149556964636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 134.8125, "completions/mean_terminated_length": 134.8125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.38977406173944473, "epoch": 0.16132468735525707, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028483986388891935, "kl": 0.002649834379553795, "learning_rate": 9.677443260768874e-07, "loss": 0.0001, "num_tokens": 95847650.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3483, "step_time": 19.173815231770277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 339.1875, "completions/mean_terminated_length": 339.1875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "entropy": 0.21618995442986488, "epoch": 0.16137100509495136, "frac_reward_zero_std": 0.0, "grad_norm": 0.0660412535071373, "kl": 0.0036207985249347985, "learning_rate": 9.677350625289486e-07, "loss": 0.0025, "num_tokens": 95875797.0, "reward": 0.977266788482666, "reward_std": 0.014211704954504967, "rewards/reward_func/mean": 0.977266788482666, "rewards/reward_func/std": 0.014211706817150116, "step": 3484, "step_time": 30.194304917007685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 178.125, "completions/mean_terminated_length": 178.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.36701615154743195, "epoch": 0.16141732283464566, "frac_reward_zero_std": 1.0, "grad_norm": 0.004110343288630247, "kl": 0.0034016179852187634, "learning_rate": 9.677257989810097e-07, "loss": 0.0002, "num_tokens": 95897751.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3485, "step_time": 20.234230373054743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 172.75, "completions/mean_terminated_length": 172.75, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.20842647179961205, "epoch": 0.16146364057433998, "frac_reward_zero_std": 1.0, "grad_norm": 0.00140270940028131, "kl": 0.0012515323469415307, "learning_rate": 9.677165354330708e-07, "loss": 0.0001, "num_tokens": 95924563.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 3486, "step_time": 18.932017970830202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 213.0625, "completions/mean_terminated_length": 213.0625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.36111921072006226, "epoch": 0.16150995831403428, "frac_reward_zero_std": 0.0, "grad_norm": 0.10579774528741837, "kl": 0.014420822961255908, "learning_rate": 9.67707271885132e-07, "loss": -0.217, "num_tokens": 95947828.0, "reward": 0.22921621799468994, "reward_std": 0.2373117208480835, "rewards/reward_func/mean": 0.22921621799468994, "rewards/reward_func/std": 0.2373117208480835, "step": 3487, "step_time": 26.59403732419014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 194.4375, "completions/mean_terminated_length": 194.4375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.19422125443816185, "epoch": 0.16155627605372858, "frac_reward_zero_std": 0.0, "grad_norm": 0.08066341280937195, "kl": 0.007530417904490605, "learning_rate": 9.67698008337193e-07, "loss": -0.0174, "num_tokens": 95984987.0, "reward": 0.8204387426376343, "reward_std": 0.050927866250276566, "rewards/reward_func/mean": 0.8204387426376343, "rewards/reward_func/std": 0.05092788115143776, "step": 3488, "step_time": 22.20290519297123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 166.375, "completions/mean_terminated_length": 166.375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.2949298843741417, "epoch": 0.16160259379342287, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029069860465824604, "kl": 0.002154695277567953, "learning_rate": 9.676887447892542e-07, "loss": 0.0001, "num_tokens": 96011553.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3489, "step_time": 17.839666597545147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 182.5, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.3527740314602852, "epoch": 0.1616489115331172, "frac_reward_zero_std": 0.0, "grad_norm": 0.11044298112392426, "kl": 0.007320728502236307, "learning_rate": 9.676794812413153e-07, "loss": -0.0366, "num_tokens": 96044457.0, "reward": 0.06291896849870682, "reward_std": 0.13527171313762665, "rewards/reward_func/mean": 0.06291896849870682, "rewards/reward_func/std": 0.13527172803878784, "step": 3490, "step_time": 22.78624314442277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 161.9375, "completions/mean_terminated_length": 161.9375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.2925775647163391, "epoch": 0.1616952292728115, "frac_reward_zero_std": 1.0, "grad_norm": 0.006018325686454773, "kl": 0.004330728843342513, "learning_rate": 9.676702176933764e-07, "loss": 0.0002, "num_tokens": 96070088.0, "reward": 0.3678794503211975, "reward_std": 0.0, "rewards/reward_func/mean": 0.3678794503211975, "rewards/reward_func/std": 0.0, "step": 3491, "step_time": 17.549418538808823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 207.3125, "completions/mean_terminated_length": 207.3125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.2782725617289543, "epoch": 0.1617415470125058, "frac_reward_zero_std": 0.0, "grad_norm": 0.11863549798727036, "kl": 0.02717333915643394, "learning_rate": 9.676609541454378e-07, "loss": -0.0041, "num_tokens": 96091853.0, "reward": 0.9162025451660156, "reward_std": 0.2586844563484192, "rewards/reward_func/mean": 0.9162025451660156, "rewards/reward_func/std": 0.2586844563484192, "step": 3492, "step_time": 20.521996207535267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 189.5, "completions/mean_terminated_length": 189.5, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.39548636227846146, "epoch": 0.16178786475220008, "frac_reward_zero_std": 0.0, "grad_norm": 0.10352324694395065, "kl": 0.007150716031901538, "learning_rate": 9.67651690597499e-07, "loss": -0.0549, "num_tokens": 96114565.0, "reward": 0.25, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.44721361994743347, "step": 3493, "step_time": 20.147456903010607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 217.3125, "completions/mean_terminated_length": 217.3125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.36438772827386856, "epoch": 0.1618341824918944, "frac_reward_zero_std": 0.0, "grad_norm": 0.11570117622613907, "kl": 0.004850264871492982, "learning_rate": 9.676424270495598e-07, "loss": -0.0339, "num_tokens": 96144538.0, "reward": 0.6824508905410767, "reward_std": 0.4069547653198242, "rewards/reward_func/mean": 0.6824508905410767, "rewards/reward_func/std": 0.4069547653198242, "step": 3494, "step_time": 23.432373207062483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 169.375, "completions/mean_terminated_length": 169.375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.406574584543705, "epoch": 0.1618805002315887, "frac_reward_zero_std": 1.0, "grad_norm": 0.002736002206802368, "kl": 0.002772693696897477, "learning_rate": 9.676331635016211e-07, "loss": 0.0001, "num_tokens": 96183808.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3495, "step_time": 20.829304948449135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 266.9375, "completions/mean_terminated_length": 266.9375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.4659477397799492, "epoch": 0.161926817971283, "frac_reward_zero_std": 0.0, "grad_norm": 0.11198234558105469, "kl": 0.006621643784455955, "learning_rate": 9.676238999536823e-07, "loss": 0.1292, "num_tokens": 96213423.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.44721361994743347, "step": 3496, "step_time": 31.571284301579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 126.4375, "completions/mean_terminated_length": 126.4375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.282311387360096, "epoch": 0.1619731357109773, "frac_reward_zero_std": 1.0, "grad_norm": 0.004010648932307959, "kl": 0.0024612965062260628, "learning_rate": 9.676146364057434e-07, "loss": 0.0001, "num_tokens": 96234886.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3497, "step_time": 16.80099131911993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 136.6875, "completions/mean_terminated_length": 136.6875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2940609082579613, "epoch": 0.16201945345067162, "frac_reward_zero_std": 1.0, "grad_norm": 0.00230865809135139, "kl": 0.0018131097604054958, "learning_rate": 9.676053728578045e-07, "loss": 0.0001, "num_tokens": 96262481.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3498, "step_time": 16.95644073560834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 186.4375, "completions/mean_terminated_length": 186.4375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.2812589779496193, "epoch": 0.16206577119036591, "frac_reward_zero_std": 0.0, "grad_norm": 0.13529819250106812, "kl": 0.026054322253912687, "learning_rate": 9.675961093098656e-07, "loss": 0.1005, "num_tokens": 96308840.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 3499, "step_time": 27.80487647652626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 123.9375, "completions/mean_terminated_length": 123.9375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2315989024937153, "epoch": 0.1621120889300602, "frac_reward_zero_std": 1.0, "grad_norm": 0.006247741635888815, "kl": 0.003260422556195408, "learning_rate": 9.675868457619268e-07, "loss": 0.0002, "num_tokens": 96328231.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3500, "step_time": 13.85839718952775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 175.9375, "completions/mean_terminated_length": 175.9375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.22639838978648186, "epoch": 0.1621584066697545, "frac_reward_zero_std": 0.0, "grad_norm": 0.1781572848558426, "kl": 0.011106195393949747, "learning_rate": 9.67577582213988e-07, "loss": -0.0405, "num_tokens": 96350086.0, "reward": 0.3874533176422119, "reward_std": 0.26677781343460083, "rewards/reward_func/mean": 0.3874533176422119, "rewards/reward_func/std": 0.26677781343460083, "step": 3501, "step_time": 18.180234760046005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 139.9375, "completions/mean_terminated_length": 139.9375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.30354005843400955, "epoch": 0.16220472440944883, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018522125901654363, "kl": 0.0015996306610759348, "learning_rate": 9.67568318666049e-07, "loss": 0.0001, "num_tokens": 96386197.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3502, "step_time": 18.0606238655746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 254.625, "completions/mean_terminated_length": 254.625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.22315406426787376, "epoch": 0.16225104214914313, "frac_reward_zero_std": 0.0, "grad_norm": 0.08620049804449081, "kl": 0.007499936851672828, "learning_rate": 9.675590551181101e-07, "loss": -0.0355, "num_tokens": 96418431.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 3503, "step_time": 25.348066557198763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 138.5, "completions/mean_terminated_length": 138.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3328055292367935, "epoch": 0.16229735988883742, "frac_reward_zero_std": 1.0, "grad_norm": 0.003742165630683303, "kl": 0.0026473395992070436, "learning_rate": 9.675497915701713e-07, "loss": 0.0001, "num_tokens": 96441463.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3504, "step_time": 15.532160520553589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 145.1875, "completions/mean_terminated_length": 145.1875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.2995671108365059, "epoch": 0.16234367762853172, "frac_reward_zero_std": 1.0, "grad_norm": 0.004644697532057762, "kl": 0.002737248345511034, "learning_rate": 9.675405280222326e-07, "loss": 0.0001, "num_tokens": 96464122.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3505, "step_time": 15.495228987187147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 196.1875, "completions/mean_terminated_length": 196.1875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.44053924083709717, "epoch": 0.16238999536822604, "frac_reward_zero_std": 0.0, "grad_norm": 0.14676623046398163, "kl": 0.007510208641178906, "learning_rate": 9.675312644742935e-07, "loss": 0.125, "num_tokens": 96488045.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 3506, "step_time": 24.72891664132476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 125.625, "completions/mean_terminated_length": 125.625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.31276872754096985, "epoch": 0.16243631310792034, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032454750034958124, "kl": 0.002006040042033419, "learning_rate": 9.675220009263546e-07, "loss": 0.0001, "num_tokens": 96516039.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3507, "step_time": 15.469757694751024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 139.1875, "completions/mean_terminated_length": 139.1875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.2615719810128212, "epoch": 0.16248263084761463, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014279045863077044, "kl": 0.0014216804702300578, "learning_rate": 9.67512737378416e-07, "loss": 0.0001, "num_tokens": 96538010.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3508, "step_time": 14.813536275178194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 130.9375, "completions/mean_terminated_length": 130.9375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2843427509069443, "epoch": 0.16252894858730893, "frac_reward_zero_std": 1.0, "grad_norm": 0.002361882012337446, "kl": 0.002055309771094471, "learning_rate": 9.67503473830477e-07, "loss": 0.0001, "num_tokens": 96560105.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3509, "step_time": 14.728366624563932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 200.25, "completions/mean_terminated_length": 200.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.22428186237812042, "epoch": 0.16257526632700325, "frac_reward_zero_std": 0.0, "grad_norm": 0.13002124428749084, "kl": 0.005687119672074914, "learning_rate": 9.674942102825382e-07, "loss": 0.1285, "num_tokens": 96582269.0, "reward": 0.481328547000885, "reward_std": 0.1283542811870575, "rewards/reward_func/mean": 0.481328547000885, "rewards/reward_func/std": 0.1283542811870575, "step": 3510, "step_time": 25.801049027591944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 153.0, "completions/mean_terminated_length": 153.0, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.352471724152565, "epoch": 0.16262158406669755, "frac_reward_zero_std": 1.0, "grad_norm": 0.004096616059541702, "kl": 0.0030700730276294053, "learning_rate": 9.674849467345994e-07, "loss": 0.0002, "num_tokens": 96604445.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3511, "step_time": 16.6450727134943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 138.4375, "completions/mean_terminated_length": 138.4375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.23251745104789734, "epoch": 0.16266790180639185, "frac_reward_zero_std": 1.0, "grad_norm": 0.00755663076415658, "kl": 0.005337439710274339, "learning_rate": 9.674756831866605e-07, "loss": 0.0003, "num_tokens": 96627044.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3512, "step_time": 16.149804331362247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 168.625, "completions/mean_terminated_length": 168.625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.38153017312288284, "epoch": 0.16271421954608614, "frac_reward_zero_std": 1.0, "grad_norm": 0.007727692369371653, "kl": 0.006425941362977028, "learning_rate": 9.674664196387216e-07, "loss": 0.0003, "num_tokens": 96649710.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3513, "step_time": 19.649431314319372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 186.0, "completions/mean_terminated_length": 186.0, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.38822100311517715, "epoch": 0.16276053728578047, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037169994320720434, "kl": 0.002981725672725588, "learning_rate": 9.674571560907827e-07, "loss": 0.0002, "num_tokens": 96676782.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3514, "step_time": 21.991681169718504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 125.375, "completions/mean_terminated_length": 125.375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.33966681361198425, "epoch": 0.16280685502547476, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031428884249180555, "kl": 0.0028423998737707734, "learning_rate": 9.674478925428439e-07, "loss": 0.0001, "num_tokens": 96699524.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3515, "step_time": 14.928597826510668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 198.375, "completions/mean_terminated_length": 198.375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.2440076507627964, "epoch": 0.16285317276516906, "frac_reward_zero_std": 1.0, "grad_norm": 0.011665847152471542, "kl": 0.006670278729870915, "learning_rate": 9.67438628994905e-07, "loss": 0.0003, "num_tokens": 96723242.0, "reward": 0.5795782804489136, "reward_std": 0.0, "rewards/reward_func/mean": 0.5795782804489136, "rewards/reward_func/std": 0.0, "step": 3516, "step_time": 23.105410888791084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 160.25, "completions/mean_terminated_length": 160.25, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.1922718994319439, "epoch": 0.16289949050486335, "frac_reward_zero_std": 1.0, "grad_norm": 0.006182128097862005, "kl": 0.0038107127184048295, "learning_rate": 9.67429365446966e-07, "loss": 0.0002, "num_tokens": 96751166.0, "reward": 0.9459594488143921, "reward_std": 0.0, "rewards/reward_func/mean": 0.9459594488143921, "rewards/reward_func/std": 0.0, "step": 3517, "step_time": 19.49392169341445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 139.9375, "completions/mean_terminated_length": 139.9375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.35049815475940704, "epoch": 0.16294580824455768, "frac_reward_zero_std": 1.0, "grad_norm": 0.002779029542580247, "kl": 0.002253408427350223, "learning_rate": 9.674201018990272e-07, "loss": 0.0001, "num_tokens": 96777677.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3518, "step_time": 17.656462874263525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 170.0625, "completions/mean_terminated_length": 170.0625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.2003810554742813, "epoch": 0.16299212598425197, "frac_reward_zero_std": 1.0, "grad_norm": 0.014294126071035862, "kl": 0.06613391451537609, "learning_rate": 9.674108383510884e-07, "loss": 0.0033, "num_tokens": 96807390.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3519, "step_time": 19.170617293566465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 127.3125, "completions/mean_terminated_length": 127.3125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.29370229691267014, "epoch": 0.16303844372394627, "frac_reward_zero_std": 1.0, "grad_norm": 0.004364653490483761, "kl": 0.002967514330521226, "learning_rate": 9.674015748031495e-07, "loss": 0.0001, "num_tokens": 96830963.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3520, "step_time": 14.375239446759224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 151.4375, "completions/mean_terminated_length": 151.4375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.3580208495259285, "epoch": 0.16308476146364057, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033027513418346643, "kl": 0.002833028440363705, "learning_rate": 9.673923112552106e-07, "loss": 0.0001, "num_tokens": 96852346.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3521, "step_time": 16.44275674968958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 155.125, "completions/mean_terminated_length": 155.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.18305179476737976, "epoch": 0.1631310792033349, "frac_reward_zero_std": 1.0, "grad_norm": 0.005787827540189028, "kl": 0.0037083630450069904, "learning_rate": 9.67383047707272e-07, "loss": 0.0002, "num_tokens": 96873340.0, "reward": 0.9394130706787109, "reward_std": 0.0, "rewards/reward_func/mean": 0.9394130706787109, "rewards/reward_func/std": 0.0, "step": 3522, "step_time": 16.114144783467054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 172.8125, "completions/mean_terminated_length": 172.8125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.2033519260585308, "epoch": 0.16317739694302918, "frac_reward_zero_std": 1.0, "grad_norm": 0.004432092420756817, "kl": 0.0017462515970692039, "learning_rate": 9.67373784159333e-07, "loss": 0.0001, "num_tokens": 96908665.0, "reward": 0.11362193524837494, "reward_std": 0.0, "rewards/reward_func/mean": 0.11362193524837494, "rewards/reward_func/std": 0.0, "step": 3523, "step_time": 20.865708526223898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 119.6875, "completions/mean_terminated_length": 119.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.27791858464479446, "epoch": 0.16322371468272348, "frac_reward_zero_std": 1.0, "grad_norm": 0.004077862948179245, "kl": 0.002710001775994897, "learning_rate": 9.673645206113942e-07, "loss": 0.0001, "num_tokens": 96932772.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3524, "step_time": 14.771175995469093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 131.625, "completions/mean_terminated_length": 131.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3206168934702873, "epoch": 0.16327003242241778, "frac_reward_zero_std": 1.0, "grad_norm": 0.002391499001532793, "kl": 0.0020895492634736, "learning_rate": 9.673552570634553e-07, "loss": 0.0001, "num_tokens": 96963678.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3525, "step_time": 16.159237887710333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 206.25, "completions/mean_terminated_length": 206.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.36402788758277893, "epoch": 0.1633163501621121, "frac_reward_zero_std": 0.0, "grad_norm": 0.09697293490171432, "kl": 0.008576249238103628, "learning_rate": 9.673459935155164e-07, "loss": -0.0083, "num_tokens": 96988690.0, "reward": 0.0159545186907053, "reward_std": 0.033227622509002686, "rewards/reward_func/mean": 0.0159545186907053, "rewards/reward_func/std": 0.033227622509002686, "step": 3526, "step_time": 22.216883279383183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 154.4375, "completions/mean_terminated_length": 154.4375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.29022638499736786, "epoch": 0.1633626679018064, "frac_reward_zero_std": 1.0, "grad_norm": 0.007445821538567543, "kl": 0.0036948389606550336, "learning_rate": 9.673367299675776e-07, "loss": 0.0002, "num_tokens": 97011161.0, "reward": 0.8824968934059143, "reward_std": 0.0, "rewards/reward_func/mean": 0.8824968934059143, "rewards/reward_func/std": 0.0, "step": 3527, "step_time": 18.00516940653324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 135.875, "completions/mean_terminated_length": 135.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.25457216054201126, "epoch": 0.1634089856415007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023652641102671623, "kl": 0.0014288907987065613, "learning_rate": 9.673274664196387e-07, "loss": 0.0001, "num_tokens": 97036583.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3528, "step_time": 15.217930767685175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 181.3125, "completions/mean_terminated_length": 181.3125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.289127416908741, "epoch": 0.163455303381195, "frac_reward_zero_std": 0.0, "grad_norm": 0.1538974940776825, "kl": 0.013005726970732212, "learning_rate": 9.673182028716998e-07, "loss": -0.1096, "num_tokens": 97058172.0, "reward": 0.17225128412246704, "reward_std": 0.3080783188343048, "rewards/reward_func/mean": 0.17225128412246704, "rewards/reward_func/std": 0.3080783188343048, "step": 3529, "step_time": 20.44297206401825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 248.875, "completions/mean_terminated_length": 248.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.34234559535980225, "epoch": 0.1635016211208893, "frac_reward_zero_std": 0.0, "grad_norm": 0.07779324054718018, "kl": 0.007530900416895747, "learning_rate": 9.67308939323761e-07, "loss": -0.1397, "num_tokens": 97085626.0, "reward": 0.37527552247047424, "reward_std": 0.4395437240600586, "rewards/reward_func/mean": 0.37527552247047424, "rewards/reward_func/std": 0.4395437240600586, "step": 3530, "step_time": 28.89034976810217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 129.5625, "completions/mean_terminated_length": 129.5625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2742532603442669, "epoch": 0.1635479388605836, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032103965058922768, "kl": 0.0024091703817248344, "learning_rate": 9.67299675775822e-07, "loss": 0.0001, "num_tokens": 97106915.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3531, "step_time": 14.497124005109072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 130.5625, "completions/mean_terminated_length": 130.5625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.35865744948387146, "epoch": 0.1635942566002779, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035574487410485744, "kl": 0.0029727817163802683, "learning_rate": 9.672904122278832e-07, "loss": 0.0001, "num_tokens": 97128940.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3532, "step_time": 14.25592328235507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 239.375, "completions/mean_terminated_length": 239.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.31367600709199905, "epoch": 0.1636405743399722, "frac_reward_zero_std": 0.0, "grad_norm": 0.1216372549533844, "kl": 0.021039447281509638, "learning_rate": 9.672811486799443e-07, "loss": -0.1128, "num_tokens": 97154658.0, "reward": 0.4968356192111969, "reward_std": 0.5131920576095581, "rewards/reward_func/mean": 0.4968356192111969, "rewards/reward_func/std": 0.5131920576095581, "step": 3533, "step_time": 26.200247816741467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 124.5625, "completions/mean_terminated_length": 124.5625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2965153381228447, "epoch": 0.16368689207966652, "frac_reward_zero_std": 1.0, "grad_norm": 0.004787066485732794, "kl": 0.002312621392775327, "learning_rate": 9.672718851320054e-07, "loss": 0.0001, "num_tokens": 97176571.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3534, "step_time": 14.440358653664589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 128.5625, "completions/mean_terminated_length": 128.5625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2939995676279068, "epoch": 0.16373320981936082, "frac_reward_zero_std": 1.0, "grad_norm": 0.003910813480615616, "kl": 0.0028675563517026603, "learning_rate": 9.672626215840668e-07, "loss": 0.0001, "num_tokens": 97196468.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3535, "step_time": 14.77380719780922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 125.625, "completions/mean_terminated_length": 125.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.26630087196826935, "epoch": 0.16377952755905512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031438334845006466, "kl": 0.0021875373204238713, "learning_rate": 9.67253358036128e-07, "loss": 0.0001, "num_tokens": 97217566.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3536, "step_time": 13.541229378432035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 120.8125, "completions/mean_terminated_length": 120.8125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.28888891637325287, "epoch": 0.1638258452987494, "frac_reward_zero_std": 1.0, "grad_norm": 0.005562159698456526, "kl": 0.0032723327167332172, "learning_rate": 9.672440944881888e-07, "loss": 0.0002, "num_tokens": 97238091.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3537, "step_time": 13.543254546821117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 190.8125, "completions/mean_terminated_length": 190.8125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.272205114364624, "epoch": 0.16387216303844374, "frac_reward_zero_std": 1.0, "grad_norm": 0.03555941954255104, "kl": 0.014776590745896101, "learning_rate": 9.672348309402502e-07, "loss": 0.0007, "num_tokens": 97259752.0, "reward": 0.4191337525844574, "reward_std": 0.0, "rewards/reward_func/mean": 0.4191337525844574, "rewards/reward_func/std": 0.0, "step": 3538, "step_time": 19.38827906176448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 243.9375, "completions/mean_terminated_length": 243.9375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.2799694687128067, "epoch": 0.16391848077813803, "frac_reward_zero_std": 1.0, "grad_norm": 0.004785658325999975, "kl": 0.0038703237078152597, "learning_rate": 9.672255673923113e-07, "loss": 0.0002, "num_tokens": 97293255.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3539, "step_time": 25.595364157110453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 227.4375, "completions/mean_terminated_length": 227.4375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.2612280547618866, "epoch": 0.16396479851783233, "frac_reward_zero_std": 0.0, "grad_norm": 0.07925572246313095, "kl": 0.015011367620900273, "learning_rate": 9.672163038443724e-07, "loss": -0.1021, "num_tokens": 97318830.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 3540, "step_time": 23.21995610371232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 230.875, "completions/mean_terminated_length": 230.875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.29261116683483124, "epoch": 0.16401111625752662, "frac_reward_zero_std": 1.0, "grad_norm": 0.0067114802077412605, "kl": 0.006157987168990076, "learning_rate": 9.672070402964335e-07, "loss": 0.0003, "num_tokens": 97357020.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3541, "step_time": 25.231066454201937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 166.8125, "completions/mean_terminated_length": 166.8125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.46383872628211975, "epoch": 0.16405743399722095, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026960340328514576, "kl": 0.0028817428392358124, "learning_rate": 9.671977767484947e-07, "loss": 0.0001, "num_tokens": 97403401.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3542, "step_time": 23.84604797139764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 163.0625, "completions/mean_terminated_length": 163.0625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.2743274196982384, "epoch": 0.16410375173691524, "frac_reward_zero_std": 1.0, "grad_norm": 0.010367260314524174, "kl": 0.00846461532637477, "learning_rate": 9.671885132005558e-07, "loss": 0.0004, "num_tokens": 97427178.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3543, "step_time": 18.02756090834737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 167.375, "completions/mean_terminated_length": 167.375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.3865014314651489, "epoch": 0.16415006947660954, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017552862409502268, "kl": 0.0021251430734992027, "learning_rate": 9.67179249652617e-07, "loss": 0.0001, "num_tokens": 97477280.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3544, "step_time": 25.08999341726303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 131.3125, "completions/mean_terminated_length": 131.3125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.292327456176281, "epoch": 0.16419638721630384, "frac_reward_zero_std": 1.0, "grad_norm": 0.005096784792840481, "kl": 0.002694442169740796, "learning_rate": 9.67169986104678e-07, "loss": 0.0001, "num_tokens": 97499125.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3545, "step_time": 14.179843433201313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 118.5, "completions/mean_terminated_length": 118.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.24239921942353249, "epoch": 0.16424270495599816, "frac_reward_zero_std": 1.0, "grad_norm": 0.002874167403206229, "kl": 0.001770373055478558, "learning_rate": 9.671607225567392e-07, "loss": 0.0001, "num_tokens": 97519053.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3546, "step_time": 13.279848337173462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2715304493904114, "epoch": 0.16428902269569245, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024509415961802006, "kl": 0.002041867992375046, "learning_rate": 9.671514590088003e-07, "loss": 0.0001, "num_tokens": 97544321.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3547, "step_time": 16.602627348154783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 141.375, "completions/mean_terminated_length": 141.375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.4028412252664566, "epoch": 0.16433534043538675, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033263410441577435, "kl": 0.002346788882277906, "learning_rate": 9.671421954608616e-07, "loss": 0.0001, "num_tokens": 97565479.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3548, "step_time": 16.204454492777586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 167.6875, "completions/mean_terminated_length": 167.6875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.24278324842453003, "epoch": 0.16438165817508105, "frac_reward_zero_std": 1.0, "grad_norm": 0.0047670938074588776, "kl": 0.0036830284516327083, "learning_rate": 9.671329319129225e-07, "loss": 0.0002, "num_tokens": 97586722.0, "reward": 0.522855818271637, "reward_std": 0.0, "rewards/reward_func/mean": 0.522855818271637, "rewards/reward_func/std": 0.0, "step": 3549, "step_time": 16.94256315380335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 237.9375, "completions/mean_terminated_length": 237.9375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.28508298844099045, "epoch": 0.16442797591477537, "frac_reward_zero_std": 0.0, "grad_norm": 0.09614391624927521, "kl": 0.005944852251559496, "learning_rate": 9.671236683649836e-07, "loss": -0.0285, "num_tokens": 97610209.0, "reward": 0.6619340181350708, "reward_std": 0.21096941828727722, "rewards/reward_func/mean": 0.6619340181350708, "rewards/reward_func/std": 0.21096940338611603, "step": 3550, "step_time": 23.33081215620041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 176.8125, "completions/mean_terminated_length": 176.8125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.385079562664032, "epoch": 0.16447429365446967, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036837179213762283, "kl": 0.003025463200174272, "learning_rate": 9.671144048170448e-07, "loss": 0.0002, "num_tokens": 97638302.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3551, "step_time": 19.04620984196663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 136.5, "completions/mean_terminated_length": 136.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.23403598368167877, "epoch": 0.16452061139416396, "frac_reward_zero_std": 1.0, "grad_norm": 0.0040371473878622055, "kl": 0.0025438999000471085, "learning_rate": 9.671051412691061e-07, "loss": 0.0001, "num_tokens": 97658854.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3552, "step_time": 15.845664210617542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 194.9375, "completions/mean_terminated_length": 194.9375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.3761293590068817, "epoch": 0.16456692913385826, "frac_reward_zero_std": 0.0, "grad_norm": 0.07410187274217606, "kl": 0.006458802963607013, "learning_rate": 9.670958777211672e-07, "loss": 0.0233, "num_tokens": 97689733.0, "reward": 0.05592745915055275, "reward_std": 0.223709836602211, "rewards/reward_func/mean": 0.05592745915055275, "rewards/reward_func/std": 0.223709836602211, "step": 3553, "step_time": 21.835889488458633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 122.5625, "completions/mean_terminated_length": 122.5625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.20000215247273445, "epoch": 0.16461324687355258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025598048232495785, "kl": 0.001941290043760091, "learning_rate": 9.670866141732284e-07, "loss": 0.0001, "num_tokens": 97709070.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3554, "step_time": 13.468267250806093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.2784581035375595, "epoch": 0.16465956461324688, "frac_reward_zero_std": 0.0, "grad_norm": 0.07169782370328903, "kl": 0.01579317357391119, "learning_rate": 9.670773506252895e-07, "loss": -0.1591, "num_tokens": 97732908.0, "reward": 0.6524466276168823, "reward_std": 0.33146098256111145, "rewards/reward_func/mean": 0.6524466276168823, "rewards/reward_func/std": 0.33146098256111145, "step": 3555, "step_time": 26.99260038509965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 130.6875, "completions/mean_terminated_length": 130.6875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.26143621653318405, "epoch": 0.16470588235294117, "frac_reward_zero_std": 1.0, "grad_norm": 0.0056390464305877686, "kl": 0.002976380579639226, "learning_rate": 9.670680870773506e-07, "loss": 0.0001, "num_tokens": 97754487.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3556, "step_time": 13.53805648908019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 177.0, "completions/mean_terminated_length": 177.0, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.36464864015579224, "epoch": 0.16475220009263547, "frac_reward_zero_std": 1.0, "grad_norm": 0.003303313162177801, "kl": 0.002488663129042834, "learning_rate": 9.670588235294117e-07, "loss": 0.0001, "num_tokens": 97787639.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3557, "step_time": 20.25088758021593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 187.25, "completions/mean_terminated_length": 187.25, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.20431096479296684, "epoch": 0.1647985178323298, "frac_reward_zero_std": 1.0, "grad_norm": 0.001888621598482132, "kl": 0.0036209102836437523, "learning_rate": 9.670495599814729e-07, "loss": 0.0002, "num_tokens": 97815531.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 3558, "step_time": 21.710391961038113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 120.0625, "completions/mean_terminated_length": 120.0625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2611607015132904, "epoch": 0.1648448355720241, "frac_reward_zero_std": 1.0, "grad_norm": 0.003172141732648015, "kl": 0.0021691049623768777, "learning_rate": 9.67040296433534e-07, "loss": 0.0001, "num_tokens": 97835020.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3559, "step_time": 12.920670833438635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 137.1875, "completions/mean_terminated_length": 137.1875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.20644544437527657, "epoch": 0.16489115331171839, "frac_reward_zero_std": 1.0, "grad_norm": 0.003373936051502824, "kl": 0.0022112150327302516, "learning_rate": 9.670310328855951e-07, "loss": 0.0001, "num_tokens": 97855583.0, "reward": 1.6640468774513188e-14, "reward_std": 0.0, "rewards/reward_func/mean": 1.6640468774513188e-14, "rewards/reward_func/std": 0.0, "step": 3560, "step_time": 16.511867452412844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 200.0625, "completions/mean_terminated_length": 200.0625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.38920268416404724, "epoch": 0.16493747105141268, "frac_reward_zero_std": 0.0, "grad_norm": 0.1139177531003952, "kl": 0.01539679802954197, "learning_rate": 9.670217693376562e-07, "loss": 0.0144, "num_tokens": 97886784.0, "reward": 0.5, "reward_std": 0.5163977742195129, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5163977742195129, "step": 3561, "step_time": 24.593369621783495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 147.8125, "completions/mean_terminated_length": 147.8125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3605232983827591, "epoch": 0.164983788791107, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035274229012429714, "kl": 0.002665524953044951, "learning_rate": 9.670125057897174e-07, "loss": 0.0001, "num_tokens": 97924061.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3562, "step_time": 20.690412435680628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 141.5625, "completions/mean_terminated_length": 141.5625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3203904703259468, "epoch": 0.1650301065308013, "frac_reward_zero_std": 1.0, "grad_norm": 0.00446776719763875, "kl": 0.003133551625069231, "learning_rate": 9.670032422417785e-07, "loss": 0.0002, "num_tokens": 97945654.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3563, "step_time": 14.90017794445157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 120.1875, "completions/mean_terminated_length": 120.1875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3613254576921463, "epoch": 0.1650764242704956, "frac_reward_zero_std": 1.0, "grad_norm": 0.003320826217532158, "kl": 0.002470236911904067, "learning_rate": 9.669939786938396e-07, "loss": 0.0001, "num_tokens": 97972617.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3564, "step_time": 15.292906329035759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 168.0625, "completions/mean_terminated_length": 168.0625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.28290047869086266, "epoch": 0.1651227420101899, "frac_reward_zero_std": 0.0, "grad_norm": 0.36806610226631165, "kl": 0.01397179055493325, "learning_rate": 9.66984715145901e-07, "loss": 0.0664, "num_tokens": 97997306.0, "reward": 0.5625, "reward_std": 0.5123475193977356, "rewards/reward_func/mean": 0.5625, "rewards/reward_func/std": 0.5123475790023804, "step": 3565, "step_time": 19.496180344372988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 155.9375, "completions/mean_terminated_length": 155.9375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3890189155936241, "epoch": 0.16516905974988422, "frac_reward_zero_std": 1.0, "grad_norm": 0.02676943875849247, "kl": 0.014399828622117639, "learning_rate": 9.66975451597962e-07, "loss": 0.0007, "num_tokens": 98032745.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3566, "step_time": 22.030289605259895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 180.4375, "completions/mean_terminated_length": 180.4375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.1966230422258377, "epoch": 0.1652153774895785, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018266900442540646, "kl": 0.0015785284340381622, "learning_rate": 9.669661880500232e-07, "loss": 0.0001, "num_tokens": 98057072.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 3567, "step_time": 19.08217379450798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 114.5, "completions/mean_terminated_length": 114.5, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2490151971578598, "epoch": 0.1652616952292728, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033037809189409018, "kl": 0.0022920635528862476, "learning_rate": 9.669569245020841e-07, "loss": 0.0001, "num_tokens": 98076936.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3568, "step_time": 13.264443390071392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 148.5625, "completions/mean_terminated_length": 148.5625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.3315586596727371, "epoch": 0.1653080129689671, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023900519590824842, "kl": 0.002021096966927871, "learning_rate": 9.669476609541454e-07, "loss": 0.0001, "num_tokens": 98113073.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3569, "step_time": 19.52743361517787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 111.4375, "completions/mean_terminated_length": 111.4375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.27076223492622375, "epoch": 0.16535433070866143, "frac_reward_zero_std": 1.0, "grad_norm": 0.003763409797102213, "kl": 0.002294118225108832, "learning_rate": 9.669383974062066e-07, "loss": 0.0001, "num_tokens": 98132472.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3570, "step_time": 12.283430144190788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 210.9375, "completions/mean_terminated_length": 210.9375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.25906703248620033, "epoch": 0.16540064844835572, "frac_reward_zero_std": 0.0, "grad_norm": 0.0987786054611206, "kl": 0.008897863561287522, "learning_rate": 9.669291338582677e-07, "loss": -0.0449, "num_tokens": 98157399.0, "reward": 0.32327309250831604, "reward_std": 0.15240254998207092, "rewards/reward_func/mean": 0.32327309250831604, "rewards/reward_func/std": 0.15240256488323212, "step": 3571, "step_time": 22.225793097168207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 191.5, "completions/mean_terminated_length": 191.5, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3547776862978935, "epoch": 0.16544696618805002, "frac_reward_zero_std": 0.0, "grad_norm": 0.0881710946559906, "kl": 0.011886299354955554, "learning_rate": 9.669198703103288e-07, "loss": -0.0213, "num_tokens": 98181199.0, "reward": 0.11778545379638672, "reward_std": 0.22982218861579895, "rewards/reward_func/mean": 0.11778545379638672, "rewards/reward_func/std": 0.22982218861579895, "step": 3572, "step_time": 21.435869842767715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 156.625, "completions/mean_terminated_length": 156.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.435652494430542, "epoch": 0.16549328392774432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016045479569584131, "kl": 0.0018821700941771269, "learning_rate": 9.6691060676239e-07, "loss": 0.0001, "num_tokens": 98231321.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3573, "step_time": 23.15938265994191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 192.5625, "completions/mean_terminated_length": 192.5625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.33999859541654587, "epoch": 0.16553960166743864, "frac_reward_zero_std": 0.0, "grad_norm": 0.1332770437002182, "kl": 0.007052879314869642, "learning_rate": 9.66901343214451e-07, "loss": -0.0199, "num_tokens": 98265714.0, "reward": 0.6569495797157288, "reward_std": 0.45744097232818604, "rewards/reward_func/mean": 0.6569495797157288, "rewards/reward_func/std": 0.45744097232818604, "step": 3574, "step_time": 26.043886370956898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 228.1875, "completions/mean_terminated_length": 228.1875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.4236639142036438, "epoch": 0.16558591940713294, "frac_reward_zero_std": 0.0, "grad_norm": 0.09896840155124664, "kl": 0.00724818604066968, "learning_rate": 9.668920796665122e-07, "loss": 0.1189, "num_tokens": 98290341.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 3575, "step_time": 29.304279312491417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 199.1875, "completions/mean_terminated_length": 199.1875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.28325093537569046, "epoch": 0.16563223714682723, "frac_reward_zero_std": 1.0, "grad_norm": 0.005324595142155886, "kl": 0.003494455828331411, "learning_rate": 9.668828161185733e-07, "loss": 0.0002, "num_tokens": 98316520.0, "reward": 0.084062859416008, "reward_std": 0.0, "rewards/reward_func/mean": 0.084062859416008, "rewards/reward_func/std": 0.0, "step": 3576, "step_time": 20.90087179839611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 134.625, "completions/mean_terminated_length": 134.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3959891349077225, "epoch": 0.16567855488652153, "frac_reward_zero_std": 1.0, "grad_norm": 0.003521413542330265, "kl": 0.002813965082168579, "learning_rate": 9.668735525706344e-07, "loss": 0.0001, "num_tokens": 98340066.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3577, "step_time": 15.995197925716639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 181.125, "completions/mean_terminated_length": 181.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.39261355996131897, "epoch": 0.16572487262621585, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020177988335490227, "kl": 0.002152281114831567, "learning_rate": 9.668642890226958e-07, "loss": 0.0001, "num_tokens": 98380084.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3578, "step_time": 23.109325744211674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 183.1875, "completions/mean_terminated_length": 183.1875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.28571048378944397, "epoch": 0.16577119036591015, "frac_reward_zero_std": 0.0, "grad_norm": 0.09259343892335892, "kl": 0.004429163585882634, "learning_rate": 9.66855025474757e-07, "loss": 0.0136, "num_tokens": 98407431.0, "reward": 0.8894338607788086, "reward_std": 0.2371823638677597, "rewards/reward_func/mean": 0.8894338607788086, "rewards/reward_func/std": 0.2371823638677597, "step": 3579, "step_time": 18.640504773706198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 203.375, "completions/mean_terminated_length": 203.375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.1659403070807457, "epoch": 0.16581750810560444, "frac_reward_zero_std": 1.0, "grad_norm": 0.002949152607470751, "kl": 0.002286954957526177, "learning_rate": 9.668457619268178e-07, "loss": 0.0001, "num_tokens": 98433053.0, "reward": 0.9607894420623779, "reward_std": 0.0, "rewards/reward_func/mean": 0.9607894420623779, "rewards/reward_func/std": 0.0, "step": 3580, "step_time": 20.531835954636335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 132.625, "completions/mean_terminated_length": 132.625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.2799430415034294, "epoch": 0.16586382584529874, "frac_reward_zero_std": 1.0, "grad_norm": 0.002769128419458866, "kl": 0.0019599811639636755, "learning_rate": 9.66836498378879e-07, "loss": 0.0001, "num_tokens": 98452663.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3581, "step_time": 14.744931012392044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 136.5, "completions/mean_terminated_length": 136.5, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2276948243379593, "epoch": 0.16591014358499306, "frac_reward_zero_std": 1.0, "grad_norm": 0.004453351255506277, "kl": 0.002728738298173994, "learning_rate": 9.668272348309403e-07, "loss": 0.0001, "num_tokens": 98472351.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3582, "step_time": 13.851874709129333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 172.5625, "completions/mean_terminated_length": 172.5625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.3971063941717148, "epoch": 0.16595646132468736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023131745401769876, "kl": 0.002333566138986498, "learning_rate": 9.668179712830014e-07, "loss": 0.0001, "num_tokens": 98518312.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3583, "step_time": 25.018486488610506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 157.125, "completions/mean_terminated_length": 157.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.36475537717342377, "epoch": 0.16600277906438166, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022115667816251516, "kl": 0.0019968247215729207, "learning_rate": 9.668087077350625e-07, "loss": 0.0001, "num_tokens": 98546938.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3584, "step_time": 17.789002742618322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 123.625, "completions/mean_terminated_length": 123.625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.28959520161151886, "epoch": 0.16604909680407595, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018079724395647645, "kl": 0.001498726400313899, "learning_rate": 9.667994441871237e-07, "loss": 0.0001, "num_tokens": 98569844.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3585, "step_time": 14.291997365653515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 215.625, "completions/mean_terminated_length": 215.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.2067979909479618, "epoch": 0.16609541454377028, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038332007825374603, "kl": 0.006445974169764668, "learning_rate": 9.667901806391848e-07, "loss": 0.0003, "num_tokens": 98602286.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3586, "step_time": 25.763326067477465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 191.8125, "completions/mean_terminated_length": 191.8125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.19510289654135704, "epoch": 0.16614173228346457, "frac_reward_zero_std": 0.0, "grad_norm": 0.1861986517906189, "kl": 0.007212674827314913, "learning_rate": 9.66780917091246e-07, "loss": -0.1037, "num_tokens": 98630843.0, "reward": 0.7634987831115723, "reward_std": 0.2769618332386017, "rewards/reward_func/mean": 0.7634987831115723, "rewards/reward_func/std": 0.2769618630409241, "step": 3587, "step_time": 23.240470733493567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 148.5625, "completions/mean_terminated_length": 148.5625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.35953081399202347, "epoch": 0.16618805002315887, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029343648348003626, "kl": 0.001841022924054414, "learning_rate": 9.66771653543307e-07, "loss": 0.0001, "num_tokens": 98666084.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3588, "step_time": 19.61584033817053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 148.6875, "completions/mean_terminated_length": 148.6875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3365582451224327, "epoch": 0.16623436776285316, "frac_reward_zero_std": 1.0, "grad_norm": 0.0048422785475850105, "kl": 0.002994633687194437, "learning_rate": 9.667623899953682e-07, "loss": 0.0002, "num_tokens": 98687599.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3589, "step_time": 17.233600221574306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 218.9375, "completions/mean_terminated_length": 218.9375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.20218537375330925, "epoch": 0.1662806855025475, "frac_reward_zero_std": 0.0, "grad_norm": 0.10181840509176254, "kl": 0.00199402665020898, "learning_rate": 9.667531264474293e-07, "loss": -0.0012, "num_tokens": 98728046.0, "reward": 0.9775209426879883, "reward_std": 0.04832850396633148, "rewards/reward_func/mean": 0.9775209426879883, "rewards/reward_func/std": 0.048328500241041183, "step": 3590, "step_time": 25.279921278357506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 197.8125, "completions/mean_terminated_length": 197.8125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.22607450187206268, "epoch": 0.16632700324224178, "frac_reward_zero_std": 1.0, "grad_norm": 0.002626505447551608, "kl": 0.002227245655376464, "learning_rate": 9.667438628994904e-07, "loss": 0.0001, "num_tokens": 98753771.0, "reward": 0.19180183112621307, "reward_std": 0.0, "rewards/reward_func/mean": 0.19180183112621307, "rewards/reward_func/std": 0.0, "step": 3591, "step_time": 21.07438062131405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 142.1875, "completions/mean_terminated_length": 142.1875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.30981430411338806, "epoch": 0.16637332098193608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032275815028697252, "kl": 0.0024592981790192425, "learning_rate": 9.667345993515515e-07, "loss": 0.0001, "num_tokens": 98789742.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3592, "step_time": 19.41781486943364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.20867987722158432, "epoch": 0.16641963872163038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029367273673415184, "kl": 0.0023950578179210424, "learning_rate": 9.667253358036127e-07, "loss": 0.0001, "num_tokens": 98827558.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3593, "step_time": 24.983286380767822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 199.0625, "completions/mean_terminated_length": 199.0625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.2755452021956444, "epoch": 0.1664659564613247, "frac_reward_zero_std": 0.0, "grad_norm": 0.1034320741891861, "kl": 0.017279275692999363, "learning_rate": 9.667160722556738e-07, "loss": 0.0045, "num_tokens": 98848839.0, "reward": 0.9962133169174194, "reward_std": 0.015146732330322266, "rewards/reward_func/mean": 0.9962133169174194, "rewards/reward_func/std": 0.015146732330322266, "step": 3594, "step_time": 22.22673163190484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 239.875, "completions/mean_terminated_length": 239.875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.40091605484485626, "epoch": 0.166512274201019, "frac_reward_zero_std": 0.0, "grad_norm": 0.0701683908700943, "kl": 0.011251306626945734, "learning_rate": 9.667068087077351e-07, "loss": -0.1091, "num_tokens": 98879765.0, "reward": 0.1425301730632782, "reward_std": 0.21833959221839905, "rewards/reward_func/mean": 0.1425301730632782, "rewards/reward_func/std": 0.21833959221839905, "step": 3595, "step_time": 27.728194940835238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 219.6875, "completions/mean_terminated_length": 219.6875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.19141894206404686, "epoch": 0.1665585919407133, "frac_reward_zero_std": 1.0, "grad_norm": 0.003598688868805766, "kl": 0.0027538148569874465, "learning_rate": 9.666975451597962e-07, "loss": 0.0001, "num_tokens": 98911760.0, "reward": 0.9560167789459229, "reward_std": 0.0, "rewards/reward_func/mean": 0.9560167789459229, "rewards/reward_func/std": 0.0, "step": 3596, "step_time": 23.942966651171446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 186.5, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.33885566890239716, "epoch": 0.1666049096804076, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030243494547903538, "kl": 0.002889836789108813, "learning_rate": 9.666882816118574e-07, "loss": 0.0001, "num_tokens": 98948744.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3597, "step_time": 22.082825370132923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 149.125, "completions/mean_terminated_length": 149.125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.37782856822013855, "epoch": 0.1666512274201019, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038001537322998047, "kl": 0.002931522554717958, "learning_rate": 9.666790180639183e-07, "loss": 0.0001, "num_tokens": 98985018.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3598, "step_time": 19.916902281343937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 132.9375, "completions/mean_terminated_length": 132.9375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.30282729864120483, "epoch": 0.1666975451597962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020080318208783865, "kl": 0.0018979228334501386, "learning_rate": 9.666697545159796e-07, "loss": 0.0001, "num_tokens": 99021065.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3599, "step_time": 18.39753397554159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 195.6875, "completions/mean_terminated_length": 195.6875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.4366292282938957, "epoch": 0.1667438628994905, "frac_reward_zero_std": 1.0, "grad_norm": 0.00261685810983181, "kl": 0.0029804747900925577, "learning_rate": 9.666604909680407e-07, "loss": 0.0002, "num_tokens": 99046276.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3600, "step_time": 20.289067335426807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 186.4375, "completions/mean_terminated_length": 186.4375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.33161909133195877, "epoch": 0.1667901806391848, "frac_reward_zero_std": 1.0, "grad_norm": 0.005478667560964823, "kl": 0.004064594686497003, "learning_rate": 9.666512274201019e-07, "loss": 0.0002, "num_tokens": 99073227.0, "reward": 0.9131007194519043, "reward_std": 0.0, "rewards/reward_func/mean": 0.9131007194519043, "rewards/reward_func/std": 0.0, "step": 3601, "step_time": 21.90181877836585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 187.0625, "completions/mean_terminated_length": 187.0625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.27226074039936066, "epoch": 0.16683649837887912, "frac_reward_zero_std": 0.0, "grad_norm": 0.15579216182231903, "kl": 0.00792406452819705, "learning_rate": 9.66641963872163e-07, "loss": 0.0364, "num_tokens": 99094508.0, "reward": 0.6438685059547424, "reward_std": 0.17169827222824097, "rewards/reward_func/mean": 0.6438685059547424, "rewards/reward_func/std": 0.17169827222824097, "step": 3602, "step_time": 20.344610940665007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 171.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.2098471112549305, "epoch": 0.16688281611857342, "frac_reward_zero_std": 1.0, "grad_norm": 0.002205336233600974, "kl": 0.001868272986030206, "learning_rate": 9.666327003242241e-07, "loss": 0.0001, "num_tokens": 99119488.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3603, "step_time": 18.055586989969015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 224.5, "completions/mean_terminated_length": 224.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.3071671575307846, "epoch": 0.16692913385826771, "frac_reward_zero_std": 0.0, "grad_norm": 0.13708633184432983, "kl": 0.009593281196430326, "learning_rate": 9.666234367762852e-07, "loss": -0.0212, "num_tokens": 99157480.0, "reward": 0.7879804968833923, "reward_std": 0.013710908591747284, "rewards/reward_func/mean": 0.7879804968833923, "rewards/reward_func/std": 0.013710916973650455, "step": 3604, "step_time": 25.048242699354887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 143.75, "completions/mean_terminated_length": 143.75, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.1587459072470665, "epoch": 0.166975451597962, "frac_reward_zero_std": 1.0, "grad_norm": 0.0048674545250833035, "kl": 0.002053667150903493, "learning_rate": 9.666141732283464e-07, "loss": 0.0001, "num_tokens": 99178468.0, "reward": 0.8446319699287415, "reward_std": 0.0, "rewards/reward_func/mean": 0.8446319699287415, "rewards/reward_func/std": 0.0, "step": 3605, "step_time": 15.05143042653799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2497384026646614, "epoch": 0.16702176933765633, "frac_reward_zero_std": 1.0, "grad_norm": 0.003642859635874629, "kl": 0.0023387824185192585, "learning_rate": 9.666049096804075e-07, "loss": 0.0001, "num_tokens": 99197972.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3606, "step_time": 13.351442039012909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 200.125, "completions/mean_terminated_length": 200.125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.4811240881681442, "epoch": 0.16706808707735063, "frac_reward_zero_std": 1.0, "grad_norm": 0.01033422164618969, "kl": 0.007307854946702719, "learning_rate": 9.665956461324686e-07, "loss": 0.0004, "num_tokens": 99222966.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3607, "step_time": 21.15530388429761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 231.5, "completions/mean_terminated_length": 231.5, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.4534684345126152, "epoch": 0.16711440481704493, "frac_reward_zero_std": 0.0, "grad_norm": 0.12001563608646393, "kl": 0.009250278701074421, "learning_rate": 9.6658638258453e-07, "loss": -0.1372, "num_tokens": 99261582.0, "reward": 0.00035374873550608754, "reward_std": 0.0014149949420243502, "rewards/reward_func/mean": 0.00035374873550608754, "rewards/reward_func/std": 0.0014149949420243502, "step": 3608, "step_time": 32.87519274279475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 182.25, "completions/mean_terminated_length": 182.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.36298853158950806, "epoch": 0.16716072255673922, "frac_reward_zero_std": 0.0, "grad_norm": 0.11058811843395233, "kl": 0.03417673148214817, "learning_rate": 9.66577119036591e-07, "loss": -0.122, "num_tokens": 99282530.0, "reward": 0.17713744938373566, "reward_std": 0.2361832857131958, "rewards/reward_func/mean": 0.17713744938373566, "rewards/reward_func/std": 0.2361832708120346, "step": 3609, "step_time": 22.58516302704811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 167.75, "completions/mean_terminated_length": 167.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3905542939901352, "epoch": 0.16720704029643355, "frac_reward_zero_std": 1.0, "grad_norm": 0.0064782253466546535, "kl": 0.002873776655178517, "learning_rate": 9.665678554886522e-07, "loss": 0.0001, "num_tokens": 99318494.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3610, "step_time": 23.65369301661849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 170.375, "completions/mean_terminated_length": 170.375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.17546479031443596, "epoch": 0.16725335803612784, "frac_reward_zero_std": 0.0, "grad_norm": 0.09214676171541214, "kl": 0.005759982392191887, "learning_rate": 9.665585919407131e-07, "loss": -0.0209, "num_tokens": 99343796.0, "reward": 0.8912093043327332, "reward_std": 0.0704302117228508, "rewards/reward_func/mean": 0.8912093043327332, "rewards/reward_func/std": 0.0704302042722702, "step": 3611, "step_time": 17.869517344981432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 194.9375, "completions/mean_terminated_length": 194.9375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4239168018102646, "epoch": 0.16729967577582214, "frac_reward_zero_std": 1.0, "grad_norm": 0.007603461388498545, "kl": 0.006265804171562195, "learning_rate": 9.665493283927745e-07, "loss": 0.0003, "num_tokens": 99374947.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3612, "step_time": 21.957412358373404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 167.25, "completions/mean_terminated_length": 167.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.4286089539527893, "epoch": 0.16734599351551643, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020595502573996782, "kl": 0.002293061406817287, "learning_rate": 9.665400648448356e-07, "loss": 0.0001, "num_tokens": 99428343.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3613, "step_time": 26.00006529316306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 195.75, "completions/mean_terminated_length": 195.75, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.4116162583231926, "epoch": 0.16739231125521076, "frac_reward_zero_std": 1.0, "grad_norm": 0.008351681753993034, "kl": 0.006845270982012153, "learning_rate": 9.665308012968967e-07, "loss": 0.0003, "num_tokens": 99460259.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3614, "step_time": 22.543399397283792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 127.5, "completions/mean_terminated_length": 127.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.24369941279292107, "epoch": 0.16743862899490505, "frac_reward_zero_std": 1.0, "grad_norm": 0.002607213333249092, "kl": 0.0017242001194972545, "learning_rate": 9.665215377489578e-07, "loss": 0.0001, "num_tokens": 99479803.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3615, "step_time": 13.694619506597519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 157.5625, "completions/mean_terminated_length": 157.5625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.21114365756511688, "epoch": 0.16748494673459935, "frac_reward_zero_std": 0.0, "grad_norm": 0.09268580377101898, "kl": 0.0046520818723365664, "learning_rate": 9.66512274201019e-07, "loss": 0.0244, "num_tokens": 99501812.0, "reward": 0.9900055527687073, "reward_std": 0.027310028672218323, "rewards/reward_func/mean": 0.9900055527687073, "rewards/reward_func/std": 0.027310030534863472, "step": 3616, "step_time": 17.534027237445116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 152.0625, "completions/mean_terminated_length": 152.0625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.2408118136227131, "epoch": 0.16753126447429365, "frac_reward_zero_std": 0.0, "grad_norm": 0.08825484663248062, "kl": 0.003742107772268355, "learning_rate": 9.6650301065308e-07, "loss": -0.021, "num_tokens": 99522821.0, "reward": 0.9185318946838379, "reward_std": 0.021724820137023926, "rewards/reward_func/mean": 0.9185318946838379, "rewards/reward_func/std": 0.021724820137023926, "step": 3617, "step_time": 15.844024267047644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 146.0625, "completions/mean_terminated_length": 146.0625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.4209437370300293, "epoch": 0.16757758221398797, "frac_reward_zero_std": 1.0, "grad_norm": 0.002499717753380537, "kl": 0.0021618661703541875, "learning_rate": 9.664937471051412e-07, "loss": 0.0001, "num_tokens": 99545862.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3618, "step_time": 16.462530065327883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 215.1875, "completions/mean_terminated_length": 215.1875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.34370262175798416, "epoch": 0.16762389995368226, "frac_reward_zero_std": 0.0, "grad_norm": 0.1858927607536316, "kl": 0.010673115029931068, "learning_rate": 9.664844835572023e-07, "loss": 0.1613, "num_tokens": 99569609.0, "reward": 0.7000278234481812, "reward_std": 0.41826578974723816, "rewards/reward_func/mean": 0.7000278234481812, "rewards/reward_func/std": 0.41826578974723816, "step": 3619, "step_time": 28.33480976894498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 137.9375, "completions/mean_terminated_length": 137.9375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.17957430332899094, "epoch": 0.16767021769337656, "frac_reward_zero_std": 0.0, "grad_norm": 0.36428120732307434, "kl": 0.010026685078628361, "learning_rate": 9.664752200092635e-07, "loss": 0.0142, "num_tokens": 99594056.0, "reward": 0.8488635420799255, "reward_std": 0.09884501993656158, "rewards/reward_func/mean": 0.8488635420799255, "rewards/reward_func/std": 0.09884503483772278, "step": 3620, "step_time": 15.420955941081047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 204.0625, "completions/mean_terminated_length": 204.0625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.33192652463912964, "epoch": 0.16771653543307086, "frac_reward_zero_std": 0.0, "grad_norm": 0.1426052302122116, "kl": 0.019870974589139223, "learning_rate": 9.664659564613246e-07, "loss": -0.0734, "num_tokens": 99616201.0, "reward": 0.10064637660980225, "reward_std": 0.10394712537527084, "rewards/reward_func/mean": 0.10064637660980225, "rewards/reward_func/std": 0.10394713282585144, "step": 3621, "step_time": 21.123180232942104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 186.25, "completions/mean_terminated_length": 186.25, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.3310723751783371, "epoch": 0.16776285317276518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038498728536069393, "kl": 0.002794923959299922, "learning_rate": 9.66456692913386e-07, "loss": 0.0001, "num_tokens": 99643709.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3622, "step_time": 19.18281963467598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 194.1875, "completions/mean_terminated_length": 194.1875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.186534583568573, "epoch": 0.16780917091245948, "frac_reward_zero_std": 1.0, "grad_norm": 0.005528921727091074, "kl": 0.004269551369361579, "learning_rate": 9.664474293654468e-07, "loss": 0.0002, "num_tokens": 99670608.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3623, "step_time": 21.365838635712862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 151.875, "completions/mean_terminated_length": 151.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.28029765188694, "epoch": 0.16785548865215377, "frac_reward_zero_std": 1.0, "grad_norm": 0.007722316309809685, "kl": 0.00351451471215114, "learning_rate": 9.66438165817508e-07, "loss": 0.0002, "num_tokens": 99691470.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3624, "step_time": 16.521307606250048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 150.0, "completions/mean_terminated_length": 150.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.2888113483786583, "epoch": 0.16790180639184807, "frac_reward_zero_std": 1.0, "grad_norm": 0.003515079850330949, "kl": 0.0021059686259832233, "learning_rate": 9.664289022695693e-07, "loss": 0.0001, "num_tokens": 99716206.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3625, "step_time": 17.258567236363888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 203.1875, "completions/mean_terminated_length": 203.1875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.19794563576579094, "epoch": 0.1679481241315424, "frac_reward_zero_std": 1.0, "grad_norm": 0.005172573961317539, "kl": 0.003150037897285074, "learning_rate": 9.664196387216304e-07, "loss": 0.0002, "num_tokens": 99747169.0, "reward": 0.6246347427368164, "reward_std": 0.0, "rewards/reward_func/mean": 0.6246347427368164, "rewards/reward_func/std": 0.0, "step": 3626, "step_time": 22.692824937403202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 168.1875, "completions/mean_terminated_length": 168.1875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.13266050815582275, "epoch": 0.1679944418712367, "frac_reward_zero_std": 0.0, "grad_norm": 0.13565035164356232, "kl": 0.00248489883961156, "learning_rate": 9.664103751736915e-07, "loss": 0.0043, "num_tokens": 99768484.0, "reward": 0.9950882196426392, "reward_std": 0.01964726485311985, "rewards/reward_func/mean": 0.9950882196426392, "rewards/reward_func/std": 0.019647270441055298, "step": 3627, "step_time": 17.078135419636965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 189.4375, "completions/mean_terminated_length": 189.4375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.262265108525753, "epoch": 0.16804075961093098, "frac_reward_zero_std": 0.0, "grad_norm": 0.13999216258525848, "kl": 0.010434851050376892, "learning_rate": 9.664011116257527e-07, "loss": -0.0453, "num_tokens": 99793515.0, "reward": 0.5128942728042603, "reward_std": 0.30541178584098816, "rewards/reward_func/mean": 0.5128942728042603, "rewards/reward_func/std": 0.30541181564331055, "step": 3628, "step_time": 19.783077280968428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 150.6875, "completions/mean_terminated_length": 150.6875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.4042646214365959, "epoch": 0.16808707735062528, "frac_reward_zero_std": 1.0, "grad_norm": 0.005343710537999868, "kl": 0.003096283588092774, "learning_rate": 9.663918480778138e-07, "loss": 0.0002, "num_tokens": 99834902.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3629, "step_time": 21.930745758116245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 232.625, "completions/mean_terminated_length": 232.625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.355318583548069, "epoch": 0.1681333950903196, "frac_reward_zero_std": 0.0, "grad_norm": 0.10351262241601944, "kl": 0.015597585588693619, "learning_rate": 9.66382584529875e-07, "loss": -0.0117, "num_tokens": 99856944.0, "reward": 0.8472354412078857, "reward_std": 0.3326157331466675, "rewards/reward_func/mean": 0.8472354412078857, "rewards/reward_func/std": 0.3326157331466675, "step": 3630, "step_time": 23.895892221480608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 208.75, "completions/mean_terminated_length": 208.75, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.12586481124162674, "epoch": 0.1681797128300139, "frac_reward_zero_std": 0.0, "grad_norm": 0.0977628231048584, "kl": 0.002275153383379802, "learning_rate": 9.66373320981936e-07, "loss": -0.0957, "num_tokens": 99879692.0, "reward": 0.21041209995746613, "reward_std": 0.32232680916786194, "rewards/reward_func/mean": 0.21041209995746613, "rewards/reward_func/std": 0.3223268389701843, "step": 3631, "step_time": 21.60739677026868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.4112020879983902, "epoch": 0.1682260305697082, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027718129567801952, "kl": 0.0023315809085033834, "learning_rate": 9.663640574339972e-07, "loss": 0.0001, "num_tokens": 99918036.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3632, "step_time": 23.05859698727727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 186.375, "completions/mean_terminated_length": 186.375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.3297823369503021, "epoch": 0.1682723483094025, "frac_reward_zero_std": 0.0, "grad_norm": 0.1071990579366684, "kl": 0.015387247549369931, "learning_rate": 9.663547938860583e-07, "loss": -0.1113, "num_tokens": 99940170.0, "reward": 0.11924160271883011, "reward_std": 0.20211108028888702, "rewards/reward_func/mean": 0.11924160271883011, "rewards/reward_func/std": 0.20211108028888702, "step": 3633, "step_time": 22.06792050972581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 189.25, "completions/mean_terminated_length": 189.25, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.22469836100935936, "epoch": 0.16831866604909682, "frac_reward_zero_std": 1.0, "grad_norm": 0.004018844570964575, "kl": 0.003375634434632957, "learning_rate": 9.663455303381194e-07, "loss": 0.0002, "num_tokens": 99975582.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3634, "step_time": 21.8283023647964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 207.0625, "completions/mean_terminated_length": 207.0625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.17764092236757278, "epoch": 0.1683649837887911, "frac_reward_zero_std": 1.0, "grad_norm": 0.00385016156360507, "kl": 0.008594031794928014, "learning_rate": 9.663362667901805e-07, "loss": 0.0004, "num_tokens": 100013503.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3635, "step_time": 23.138325460255146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 179.0, "completions/mean_terminated_length": 179.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.158108189702034, "epoch": 0.1684113015284854, "frac_reward_zero_std": 1.0, "grad_norm": 0.004413930233567953, "kl": 0.0019016998703591526, "learning_rate": 9.663270032422417e-07, "loss": 0.0001, "num_tokens": 100044399.0, "reward": 0.8824968934059143, "reward_std": 0.0, "rewards/reward_func/mean": 0.8824968934059143, "rewards/reward_func/std": 0.0, "step": 3636, "step_time": 20.046655353158712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 153.9375, "completions/mean_terminated_length": 153.9375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.19572916254401207, "epoch": 0.1684576192681797, "frac_reward_zero_std": 1.0, "grad_norm": 0.004057406447827816, "kl": 0.003427662595640868, "learning_rate": 9.663177396943028e-07, "loss": 0.0002, "num_tokens": 100065246.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3637, "step_time": 15.902507115155458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 148.125, "completions/mean_terminated_length": 148.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.24629414826631546, "epoch": 0.16850393700787403, "frac_reward_zero_std": 0.0, "grad_norm": 0.23255613446235657, "kl": 0.015454307897016406, "learning_rate": 9.66308476146364e-07, "loss": -0.0161, "num_tokens": 100101824.0, "reward": 0.42951178550720215, "reward_std": 0.009121465496718884, "rewards/reward_func/mean": 0.42951178550720215, "rewards/reward_func/std": 0.00912146270275116, "step": 3638, "step_time": 18.848948996514082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 238.25, "completions/mean_terminated_length": 238.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.3296635001897812, "epoch": 0.16855025474756832, "frac_reward_zero_std": 0.0, "grad_norm": 0.09846010059118271, "kl": 0.010080184903927147, "learning_rate": 9.662992125984252e-07, "loss": -0.101, "num_tokens": 100134484.0, "reward": 0.25, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.44721361994743347, "step": 3639, "step_time": 26.790442250669003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 183.875, "completions/mean_terminated_length": 183.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.40635445713996887, "epoch": 0.16859657248726262, "frac_reward_zero_std": 1.0, "grad_norm": 0.013727196492254734, "kl": 0.006187449675053358, "learning_rate": 9.662899490504864e-07, "loss": 0.0003, "num_tokens": 100165330.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3640, "step_time": 21.61533609032631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 166.1875, "completions/mean_terminated_length": 166.1875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.22182074561715126, "epoch": 0.16864289022695692, "frac_reward_zero_std": 1.0, "grad_norm": 0.005874851252883673, "kl": 0.004019855405203998, "learning_rate": 9.662806855025473e-07, "loss": 0.0002, "num_tokens": 100190661.0, "reward": 0.8824968934059143, "reward_std": 0.0, "rewards/reward_func/mean": 0.8824968934059143, "rewards/reward_func/std": 0.0, "step": 3641, "step_time": 19.146815598011017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 123.4375, "completions/mean_terminated_length": 123.4375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.33297599107027054, "epoch": 0.16868920796665124, "frac_reward_zero_std": 1.0, "grad_norm": 0.005786799360066652, "kl": 0.003008337924256921, "learning_rate": 9.662714219546086e-07, "loss": 0.0001, "num_tokens": 100215052.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3642, "step_time": 15.082947868853807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 185.875, "completions/mean_terminated_length": 185.875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.39536600559949875, "epoch": 0.16873552570634553, "frac_reward_zero_std": 1.0, "grad_norm": 0.0043201870284974575, "kl": 0.00370199978351593, "learning_rate": 9.662621584066697e-07, "loss": 0.0002, "num_tokens": 100258298.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3643, "step_time": 24.709015142172575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 185.8125, "completions/mean_terminated_length": 185.8125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.4209500700235367, "epoch": 0.16878184344603983, "frac_reward_zero_std": 1.0, "grad_norm": 0.004707318264991045, "kl": 0.004353383614216, "learning_rate": 9.662528948587309e-07, "loss": 0.0002, "num_tokens": 100280759.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3644, "step_time": 19.487607669085264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 180.625, "completions/mean_terminated_length": 180.625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.3488072007894516, "epoch": 0.16882816118573413, "frac_reward_zero_std": 1.0, "grad_norm": 0.007075755391269922, "kl": 0.0051837723003700376, "learning_rate": 9.66243631310792e-07, "loss": 0.0003, "num_tokens": 100303057.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3645, "step_time": 19.613004866987467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 223.6875, "completions/mean_terminated_length": 223.6875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3842403292655945, "epoch": 0.16887447892542845, "frac_reward_zero_std": 0.0, "grad_norm": 0.11126924306154251, "kl": 0.008895701728761196, "learning_rate": 9.662343677628531e-07, "loss": -0.2362, "num_tokens": 100340924.0, "reward": 0.16754452884197235, "reward_std": 0.3664451539516449, "rewards/reward_func/mean": 0.16754452884197235, "rewards/reward_func/std": 0.3664451539516449, "step": 3646, "step_time": 33.60752094164491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 138.9375, "completions/mean_terminated_length": 138.9375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.29095131158828735, "epoch": 0.16892079666512275, "frac_reward_zero_std": 1.0, "grad_norm": 0.004700443707406521, "kl": 0.002529663441237062, "learning_rate": 9.662251042149142e-07, "loss": 0.0001, "num_tokens": 100362155.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3647, "step_time": 16.778659086674452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 197.125, "completions/mean_terminated_length": 197.125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.19357973709702492, "epoch": 0.16896711440481704, "frac_reward_zero_std": 0.0, "grad_norm": 0.14569637179374695, "kl": 0.012966676848009229, "learning_rate": 9.662158406669754e-07, "loss": -0.028, "num_tokens": 100391261.0, "reward": 0.557201087474823, "reward_std": 0.054861608892679214, "rewards/reward_func/mean": 0.557201087474823, "rewards/reward_func/std": 0.05486161261796951, "step": 3648, "step_time": 20.71492462977767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 136.1875, "completions/mean_terminated_length": 136.1875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2660072445869446, "epoch": 0.16901343214451134, "frac_reward_zero_std": 1.0, "grad_norm": 0.01897454261779785, "kl": 0.008365912595763803, "learning_rate": 9.662065771190365e-07, "loss": 0.0004, "num_tokens": 100410864.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3649, "step_time": 14.980790104717016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 220.8125, "completions/mean_terminated_length": 220.8125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.41321227699518204, "epoch": 0.16905974988420566, "frac_reward_zero_std": 0.0, "grad_norm": 0.1119815856218338, "kl": 0.011686134850606322, "learning_rate": 9.661973135710976e-07, "loss": -0.1593, "num_tokens": 100438685.0, "reward": 0.22011367976665497, "reward_std": 0.393751323223114, "rewards/reward_func/mean": 0.22011367976665497, "rewards/reward_func/std": 0.393751323223114, "step": 3650, "step_time": 27.304416824132204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 129.1875, "completions/mean_terminated_length": 129.1875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2917369455099106, "epoch": 0.16910606762389996, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027787622530013323, "kl": 0.002260442852275446, "learning_rate": 9.661880500231587e-07, "loss": 0.0001, "num_tokens": 100461184.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3651, "step_time": 14.18328521028161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 148.6875, "completions/mean_terminated_length": 148.6875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.393869049847126, "epoch": 0.16915238536359425, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025377217680215836, "kl": 0.0024148482480086386, "learning_rate": 9.6617878647522e-07, "loss": 0.0001, "num_tokens": 100506411.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3652, "step_time": 22.5497288107872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 168.25, "completions/mean_terminated_length": 168.25, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3528295233845711, "epoch": 0.16919870310328855, "frac_reward_zero_std": 0.0, "grad_norm": 0.10889166593551636, "kl": 0.018985942471772432, "learning_rate": 9.661695229272812e-07, "loss": 0.0128, "num_tokens": 100527407.0, "reward": 0.21260768175125122, "reward_std": 0.393032044172287, "rewards/reward_func/mean": 0.21260768175125122, "rewards/reward_func/std": 0.393032044172287, "step": 3653, "step_time": 18.676404014229774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 118.5, "completions/mean_terminated_length": 118.5, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.34263648837804794, "epoch": 0.16924502084298287, "frac_reward_zero_std": 1.0, "grad_norm": 0.00307916896417737, "kl": 0.002494905376806855, "learning_rate": 9.661602593793421e-07, "loss": 0.0001, "num_tokens": 100552311.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3654, "step_time": 14.8729502633214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.34922515600919724, "epoch": 0.16929133858267717, "frac_reward_zero_std": 1.0, "grad_norm": 0.002448687329888344, "kl": 0.0023555216030217707, "learning_rate": 9.661509958314035e-07, "loss": 0.0001, "num_tokens": 100580059.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3655, "step_time": 20.91957689449191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 227.9375, "completions/mean_terminated_length": 227.9375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.18775209039449692, "epoch": 0.16933765632237147, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021545514464378357, "kl": 0.0016433187702205032, "learning_rate": 9.661417322834646e-07, "loss": 0.0001, "num_tokens": 100613370.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3656, "step_time": 24.909799750894308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 136.125, "completions/mean_terminated_length": 136.125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2396089732646942, "epoch": 0.16938397406206576, "frac_reward_zero_std": 1.0, "grad_norm": 0.004323905799537897, "kl": 0.002229051402537152, "learning_rate": 9.661324687355257e-07, "loss": 0.0001, "num_tokens": 100633020.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3657, "step_time": 14.211376182734966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 170.375, "completions/mean_terminated_length": 170.375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.16995025426149368, "epoch": 0.16943029180176009, "frac_reward_zero_std": 0.0, "grad_norm": 0.1598382443189621, "kl": 0.026814636774361134, "learning_rate": 9.661232051875868e-07, "loss": -0.0344, "num_tokens": 100665922.0, "reward": 0.7106248736381531, "reward_std": 0.33894577622413635, "rewards/reward_func/mean": 0.7106248736381531, "rewards/reward_func/std": 0.33894577622413635, "step": 3658, "step_time": 19.974772695451975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 126.8125, "completions/mean_terminated_length": 126.8125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.34412428736686707, "epoch": 0.16947660954145438, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038280535954982042, "kl": 0.0029077634681016207, "learning_rate": 9.66113941639648e-07, "loss": 0.0001, "num_tokens": 100701727.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3659, "step_time": 18.098359052091837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 130.375, "completions/mean_terminated_length": 130.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.24189125001430511, "epoch": 0.16952292728114868, "frac_reward_zero_std": 1.0, "grad_norm": 0.003285208949819207, "kl": 0.0021098259603604674, "learning_rate": 9.66104678091709e-07, "loss": 0.0001, "num_tokens": 100721317.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3660, "step_time": 14.200971778482199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 187.0, "completions/mean_terminated_length": 187.0, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.23702138662338257, "epoch": 0.16956924502084297, "frac_reward_zero_std": 0.0, "grad_norm": 0.12934033572673798, "kl": 0.0036044674343429506, "learning_rate": 9.660954145437702e-07, "loss": 0.0309, "num_tokens": 100742597.0, "reward": 0.984572172164917, "reward_std": 0.06171126291155815, "rewards/reward_func/mean": 0.984572172164917, "rewards/reward_func/std": 0.06171126663684845, "step": 3661, "step_time": 19.861062217503786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 172.5625, "completions/mean_terminated_length": 172.5625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.21542150154709816, "epoch": 0.1696155627605373, "frac_reward_zero_std": 1.0, "grad_norm": 0.006063431967049837, "kl": 0.0042796487687155604, "learning_rate": 9.660861509958313e-07, "loss": 0.0002, "num_tokens": 100765518.0, "reward": 0.9459594488143921, "reward_std": 0.0, "rewards/reward_func/mean": 0.9459594488143921, "rewards/reward_func/std": 0.0, "step": 3662, "step_time": 17.872896548360586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 154.5625, "completions/mean_terminated_length": 154.5625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3327007219195366, "epoch": 0.1696618805002316, "frac_reward_zero_std": 1.0, "grad_norm": 0.007003942970186472, "kl": 0.006100799655541778, "learning_rate": 9.660768874478925e-07, "loss": 0.0003, "num_tokens": 100787943.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3663, "step_time": 17.094124987721443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 256.125, "completions/mean_terminated_length": 256.125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.27463462203741074, "epoch": 0.1697081982399259, "frac_reward_zero_std": 0.0, "grad_norm": 0.10698054730892181, "kl": 0.0058757910737767816, "learning_rate": 9.660676238999536e-07, "loss": -0.0013, "num_tokens": 100815641.0, "reward": 0.9841635823249817, "reward_std": 0.04327329620718956, "rewards/reward_func/mean": 0.9841635823249817, "rewards/reward_func/std": 0.04327329248189926, "step": 3664, "step_time": 24.39543791860342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 137.0, "completions/mean_terminated_length": 137.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3345174938440323, "epoch": 0.16975451597962019, "frac_reward_zero_std": 1.0, "grad_norm": 0.004585576709359884, "kl": 0.002789916645269841, "learning_rate": 9.66058360352015e-07, "loss": 0.0001, "num_tokens": 100838185.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3665, "step_time": 15.527457643300295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 127.625, "completions/mean_terminated_length": 127.625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2437593974173069, "epoch": 0.1698008337193145, "frac_reward_zero_std": 1.0, "grad_norm": 0.007722876965999603, "kl": 0.0038712118403054774, "learning_rate": 9.660490968040758e-07, "loss": 0.0002, "num_tokens": 100857763.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3666, "step_time": 13.929988894611597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 211.75, "completions/mean_terminated_length": 211.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.28165485709905624, "epoch": 0.1698471514590088, "frac_reward_zero_std": 0.0, "grad_norm": 0.10015315562486649, "kl": 0.02622091630473733, "learning_rate": 9.66039833256137e-07, "loss": -0.0143, "num_tokens": 100880687.0, "reward": 0.8524489998817444, "reward_std": 0.19673466682434082, "rewards/reward_func/mean": 0.8524489998817444, "rewards/reward_func/std": 0.19673468172550201, "step": 3667, "step_time": 20.948937579989433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 131.625, "completions/mean_terminated_length": 131.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.28634975105524063, "epoch": 0.1698934691987031, "frac_reward_zero_std": 1.0, "grad_norm": 0.003771155374124646, "kl": 0.002001322660362348, "learning_rate": 9.66030569708198e-07, "loss": 0.0001, "num_tokens": 100906169.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3668, "step_time": 15.294891849160194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 144.0625, "completions/mean_terminated_length": 144.0625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3732023164629936, "epoch": 0.1699397869383974, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026728189550340176, "kl": 0.002181119954911992, "learning_rate": 9.660213061602594e-07, "loss": 0.0001, "num_tokens": 100929850.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3669, "step_time": 17.441290482878685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 142.1875, "completions/mean_terminated_length": 142.1875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.2838490232825279, "epoch": 0.16998610467809172, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025254993233829737, "kl": 0.001794445444829762, "learning_rate": 9.660120426123205e-07, "loss": 0.0001, "num_tokens": 100951373.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3670, "step_time": 15.839925896376371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 120.875, "completions/mean_terminated_length": 120.875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2890567481517792, "epoch": 0.17003242241778602, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031862810719758272, "kl": 0.002349153161048889, "learning_rate": 9.660027790643817e-07, "loss": 0.0001, "num_tokens": 100972187.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3671, "step_time": 13.070206925272942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 170.5625, "completions/mean_terminated_length": 170.5625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.33420466631650925, "epoch": 0.1700787401574803, "frac_reward_zero_std": 0.0, "grad_norm": 0.025074990466237068, "kl": 0.015349670546129346, "learning_rate": 9.659935155164428e-07, "loss": 0.0007, "num_tokens": 100993668.0, "reward": 1.8562681702860573e-07, "reward_std": 7.425072681144229e-07, "rewards/reward_func/mean": 1.8562681702860573e-07, "rewards/reward_func/std": 7.425073249578418e-07, "step": 3672, "step_time": 17.41484932228923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 161.1875, "completions/mean_terminated_length": 161.1875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.26415394246578217, "epoch": 0.1701250578971746, "frac_reward_zero_std": 0.0, "grad_norm": 0.19075551629066467, "kl": 0.01063548936508596, "learning_rate": 9.65984251968504e-07, "loss": 0.003, "num_tokens": 101015687.0, "reward": 0.8836101293563843, "reward_std": 0.04517616704106331, "rewards/reward_func/mean": 0.8836101293563843, "rewards/reward_func/std": 0.045176174491643906, "step": 3673, "step_time": 17.412964086979628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 126.3125, "completions/mean_terminated_length": 126.3125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.25147486478090286, "epoch": 0.17017137563686893, "frac_reward_zero_std": 1.0, "grad_norm": 0.002212977735325694, "kl": 0.0018555527785792947, "learning_rate": 9.65974988420565e-07, "loss": 0.0001, "num_tokens": 101037036.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3674, "step_time": 13.811803121119738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 206.3125, "completions/mean_terminated_length": 206.3125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.25448210909962654, "epoch": 0.17021769337656323, "frac_reward_zero_std": 1.0, "grad_norm": 0.0040047443471848965, "kl": 0.02641190541908145, "learning_rate": 9.659657248726262e-07, "loss": 0.0013, "num_tokens": 101071713.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3675, "step_time": 23.2487272284925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 132.5625, "completions/mean_terminated_length": 132.5625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.306539848446846, "epoch": 0.17026401111625752, "frac_reward_zero_std": 1.0, "grad_norm": 0.002413178561255336, "kl": 0.0018781186663545668, "learning_rate": 9.659564613246873e-07, "loss": 0.0001, "num_tokens": 101107530.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3676, "step_time": 17.817917369306087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 173.9375, "completions/mean_terminated_length": 173.9375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.1745246984064579, "epoch": 0.17031032885595182, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027705396059900522, "kl": 0.0018873816588893533, "learning_rate": 9.659471977767484e-07, "loss": 0.0001, "num_tokens": 101144569.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3677, "step_time": 21.63655637949705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 151.8125, "completions/mean_terminated_length": 151.8125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.18667694926261902, "epoch": 0.17035664659564614, "frac_reward_zero_std": 0.0, "grad_norm": 0.23493200540542603, "kl": 0.004584392299875617, "learning_rate": 9.659379342288095e-07, "loss": -0.05, "num_tokens": 101168758.0, "reward": 0.017583835870027542, "reward_std": 0.030946951359510422, "rewards/reward_func/mean": 0.017583835870027542, "rewards/reward_func/std": 0.03094695322215557, "step": 3678, "step_time": 16.856909211724997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 142.25, "completions/mean_terminated_length": 142.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.27224060148000717, "epoch": 0.17040296433534044, "frac_reward_zero_std": 1.0, "grad_norm": 0.005826570559293032, "kl": 0.0031756037496961653, "learning_rate": 9.659286706808707e-07, "loss": 0.0002, "num_tokens": 101189050.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3679, "step_time": 16.656532626599073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 134.625, "completions/mean_terminated_length": 134.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.23219594731926918, "epoch": 0.17044928207503474, "frac_reward_zero_std": 1.0, "grad_norm": 0.004386923275887966, "kl": 0.002785380696877837, "learning_rate": 9.659194071329318e-07, "loss": 0.0001, "num_tokens": 101209812.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3680, "step_time": 14.081293478608131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 163.25, "completions/mean_terminated_length": 163.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.37270666658878326, "epoch": 0.17049559981472903, "frac_reward_zero_std": 1.0, "grad_norm": 0.0055905235931277275, "kl": 0.004493484157137573, "learning_rate": 9.65910143584993e-07, "loss": 0.0002, "num_tokens": 101260568.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3681, "step_time": 24.699590887874365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 147.0625, "completions/mean_terminated_length": 147.0625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.2637674883008003, "epoch": 0.17054191755442336, "frac_reward_zero_std": 1.0, "grad_norm": 0.007293649483472109, "kl": 0.004482871852815151, "learning_rate": 9.659008800370543e-07, "loss": 0.0002, "num_tokens": 101280649.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3682, "step_time": 16.569469437003136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 128.375, "completions/mean_terminated_length": 128.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.25608301162719727, "epoch": 0.17058823529411765, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035667158663272858, "kl": 0.0022067642712499946, "learning_rate": 9.658916164891154e-07, "loss": 0.0001, "num_tokens": 101302383.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3683, "step_time": 13.727999657392502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 155.0, "completions/mean_terminated_length": 155.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.44421854615211487, "epoch": 0.17063455303381195, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038980096578598022, "kl": 0.0033952242229133844, "learning_rate": 9.658823529411765e-07, "loss": 0.0002, "num_tokens": 101360431.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3684, "step_time": 25.657791543751955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 176.1875, "completions/mean_terminated_length": 176.1875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4523504003882408, "epoch": 0.17068087077350624, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030881785787642, "kl": 0.0030222403001971543, "learning_rate": 9.658730893932376e-07, "loss": 0.0002, "num_tokens": 101382098.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3685, "step_time": 20.41754274070263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 154.75, "completions/mean_terminated_length": 154.75, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.33699050545692444, "epoch": 0.17072718851320057, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036779611837118864, "kl": 0.0024129419471137226, "learning_rate": 9.658638258452988e-07, "loss": 0.0001, "num_tokens": 101403582.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3686, "step_time": 16.62067049369216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.2957451716065407, "epoch": 0.17077350625289486, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024708774872124195, "kl": 0.0018769590242300183, "learning_rate": 9.658545622973599e-07, "loss": 0.0001, "num_tokens": 101425530.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3687, "step_time": 15.382260516285896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 178.75, "completions/mean_terminated_length": 178.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.1754971146583557, "epoch": 0.17081982399258916, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021309982985258102, "kl": 0.0015650664572604, "learning_rate": 9.65845298749421e-07, "loss": 0.0001, "num_tokens": 101449542.0, "reward": 0.9131007194519043, "reward_std": 0.0, "rewards/reward_func/mean": 0.9131007194519043, "rewards/reward_func/std": 0.0, "step": 3688, "step_time": 18.965327501296997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 146.5, "completions/mean_terminated_length": 146.5, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.18681560829281807, "epoch": 0.17086614173228346, "frac_reward_zero_std": 0.0, "grad_norm": 0.19526633620262146, "kl": 0.004680161306168884, "learning_rate": 9.658360352014821e-07, "loss": -0.0346, "num_tokens": 101470830.0, "reward": 0.6141525506973267, "reward_std": 0.2965758144855499, "rewards/reward_func/mean": 0.6141525506973267, "rewards/reward_func/std": 0.2965758144855499, "step": 3689, "step_time": 15.90220457687974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 135.125, "completions/mean_terminated_length": 135.125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.27682189643383026, "epoch": 0.17091245947197778, "frac_reward_zero_std": 1.0, "grad_norm": 0.002285691909492016, "kl": 0.0017480867099948227, "learning_rate": 9.658267716535433e-07, "loss": 0.0001, "num_tokens": 101493616.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3690, "step_time": 14.771844647824764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 177.75, "completions/mean_terminated_length": 177.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.2852865308523178, "epoch": 0.17095877721167207, "frac_reward_zero_std": 0.0, "grad_norm": 0.11144199222326279, "kl": 0.020489394664764404, "learning_rate": 9.658175081056044e-07, "loss": -0.0358, "num_tokens": 101515948.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 3691, "step_time": 18.764317836612463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 159.1875, "completions/mean_terminated_length": 159.1875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.15364742279052734, "epoch": 0.17100509495136637, "frac_reward_zero_std": 0.0, "grad_norm": 0.1287347376346588, "kl": 0.003039184375666082, "learning_rate": 9.658082445576655e-07, "loss": -0.0072, "num_tokens": 101539407.0, "reward": 0.3012182116508484, "reward_std": 0.01231658086180687, "rewards/reward_func/mean": 0.3012182116508484, "rewards/reward_func/std": 0.01231657899916172, "step": 3692, "step_time": 16.599547754973173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 150.25, "completions/mean_terminated_length": 150.25, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3196949064731598, "epoch": 0.17105141269106067, "frac_reward_zero_std": 1.0, "grad_norm": 0.010743924416601658, "kl": 0.004210477869492024, "learning_rate": 9.657989810097266e-07, "loss": 0.0002, "num_tokens": 101566963.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3693, "step_time": 17.0893935225904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.3706919550895691, "epoch": 0.171097730430755, "frac_reward_zero_std": 1.0, "grad_norm": 0.016289448365569115, "kl": 0.005931872874498367, "learning_rate": 9.657897174617878e-07, "loss": 0.0003, "num_tokens": 101612005.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3694, "step_time": 21.769167751073837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 258.0, "completions/mean_terminated_length": 258.0, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "entropy": 0.20919641852378845, "epoch": 0.1711440481704493, "frac_reward_zero_std": 1.0, "grad_norm": 0.006265141069889069, "kl": 0.008982607396319509, "learning_rate": 9.65780453913849e-07, "loss": 0.0005, "num_tokens": 101637365.0, "reward": 0.8914382457733154, "reward_std": 0.0, "rewards/reward_func/mean": 0.8914382457733154, "rewards/reward_func/std": 0.0, "step": 3695, "step_time": 25.146470360457897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 199.875, "completions/mean_terminated_length": 199.875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.24084605649113655, "epoch": 0.17119036591014358, "frac_reward_zero_std": 1.0, "grad_norm": 0.01442259643226862, "kl": 0.013131072046235204, "learning_rate": 9.657711903659102e-07, "loss": 0.0007, "num_tokens": 101660931.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3696, "step_time": 21.33481117337942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 198.0625, "completions/mean_terminated_length": 198.0625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.2683481052517891, "epoch": 0.17123668364983788, "frac_reward_zero_std": 1.0, "grad_norm": 0.012098944745957851, "kl": 0.011228818213567138, "learning_rate": 9.657619268179711e-07, "loss": 0.0006, "num_tokens": 101690884.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3697, "step_time": 21.216736134141684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 151.375, "completions/mean_terminated_length": 151.375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.32109837979078293, "epoch": 0.1712830013895322, "frac_reward_zero_std": 1.0, "grad_norm": 0.007906484417617321, "kl": 0.005550313624553382, "learning_rate": 9.657526632700323e-07, "loss": 0.0003, "num_tokens": 101712458.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3698, "step_time": 15.93731328472495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 178.6875, "completions/mean_terminated_length": 178.6875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3686307594180107, "epoch": 0.1713293191292265, "frac_reward_zero_std": 1.0, "grad_norm": 0.00804579071700573, "kl": 0.008447291096672416, "learning_rate": 9.657433997220936e-07, "loss": 0.0004, "num_tokens": 101733589.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3699, "step_time": 20.16174814477563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 167.0625, "completions/mean_terminated_length": 167.0625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.1832829974591732, "epoch": 0.1713756368689208, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025956721510738134, "kl": 0.0017483013507444412, "learning_rate": 9.657341361741547e-07, "loss": 0.0001, "num_tokens": 101756982.0, "reward": 0.9487294554710388, "reward_std": 0.0, "rewards/reward_func/mean": 0.9487294554710388, "rewards/reward_func/std": 0.0, "step": 3700, "step_time": 18.25501473993063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 137.25, "completions/mean_terminated_length": 137.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2978828251361847, "epoch": 0.1714219546086151, "frac_reward_zero_std": 1.0, "grad_norm": 0.004572514444589615, "kl": 0.001750871044350788, "learning_rate": 9.657248726262158e-07, "loss": 0.0001, "num_tokens": 101782522.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3701, "step_time": 16.17101848870516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 225.0, "completions/mean_terminated_length": 225.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.18214595690369606, "epoch": 0.17146827234830941, "frac_reward_zero_std": 0.0, "grad_norm": 0.13575305044651031, "kl": 0.011036734678782523, "learning_rate": 9.65715609078277e-07, "loss": -0.0263, "num_tokens": 101820970.0, "reward": 0.835374116897583, "reward_std": 0.1148674339056015, "rewards/reward_func/mean": 0.835374116897583, "rewards/reward_func/std": 0.1148674339056015, "step": 3702, "step_time": 25.153350312262774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 121.6875, "completions/mean_terminated_length": 121.6875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.21703851595520973, "epoch": 0.1715145900880037, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034549045376479626, "kl": 0.002502273127902299, "learning_rate": 9.65706345530338e-07, "loss": 0.0001, "num_tokens": 101840629.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3703, "step_time": 13.370243936777115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 159.3125, "completions/mean_terminated_length": 159.3125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.17573610693216324, "epoch": 0.171560907827698, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019829643424600363, "kl": 0.0013354003604035825, "learning_rate": 9.656970819823992e-07, "loss": 0.0001, "num_tokens": 101870906.0, "reward": 0.8751733303070068, "reward_std": 0.0, "rewards/reward_func/mean": 0.8751733303070068, "rewards/reward_func/std": 0.0, "step": 3704, "step_time": 19.326514348387718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 133.625, "completions/mean_terminated_length": 133.625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.31791510432958603, "epoch": 0.1716072255673923, "frac_reward_zero_std": 1.0, "grad_norm": 0.002956633921712637, "kl": 0.0022645049612037838, "learning_rate": 9.656878184344603e-07, "loss": 0.0001, "num_tokens": 101906916.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3705, "step_time": 18.75690321996808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 204.9375, "completions/mean_terminated_length": 204.9375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.3802553340792656, "epoch": 0.17165354330708663, "frac_reward_zero_std": 1.0, "grad_norm": 0.010592067614197731, "kl": 0.009749772492796183, "learning_rate": 9.656785548865215e-07, "loss": 0.0005, "num_tokens": 101931827.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3706, "step_time": 24.345446296036243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 185.875, "completions/mean_terminated_length": 185.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.17301489785313606, "epoch": 0.17169986104678092, "frac_reward_zero_std": 1.0, "grad_norm": 0.006229810882359743, "kl": 0.0051842586835846305, "learning_rate": 9.656692913385826e-07, "loss": 0.0003, "num_tokens": 101964049.0, "reward": 0.9487294554710388, "reward_std": 0.0, "rewards/reward_func/mean": 0.9487294554710388, "rewards/reward_func/std": 0.0, "step": 3707, "step_time": 21.264019422233105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 156.8125, "completions/mean_terminated_length": 156.8125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.39049182087183, "epoch": 0.17174617878647522, "frac_reward_zero_std": 1.0, "grad_norm": 0.004415351431816816, "kl": 0.002992833615280688, "learning_rate": 9.656600277906437e-07, "loss": 0.0001, "num_tokens": 101996990.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3708, "step_time": 19.036223154515028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 243.0625, "completions/mean_terminated_length": 243.0625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.28115164488554, "epoch": 0.1717924965261695, "frac_reward_zero_std": 0.0, "grad_norm": 0.08259239792823792, "kl": 0.015964378137141466, "learning_rate": 9.656507642427048e-07, "loss": -0.0682, "num_tokens": 102021487.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 3709, "step_time": 25.482900887727737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 188.75, "completions/mean_terminated_length": 188.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.21375613659620285, "epoch": 0.17183881426586384, "frac_reward_zero_std": 1.0, "grad_norm": 0.005931749939918518, "kl": 0.0047469911514781415, "learning_rate": 9.65641500694766e-07, "loss": 0.0002, "num_tokens": 102046059.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3710, "step_time": 19.37825195491314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 139.3125, "completions/mean_terminated_length": 139.3125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.2109673209488392, "epoch": 0.17188513200555813, "frac_reward_zero_std": 1.0, "grad_norm": 0.002921863691881299, "kl": 0.0015583883505314589, "learning_rate": 9.65632237146827e-07, "loss": 0.0001, "num_tokens": 102066224.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3711, "step_time": 15.059500459581614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.25227079540491104, "epoch": 0.17193144974525243, "frac_reward_zero_std": 1.0, "grad_norm": 0.004785130266100168, "kl": 0.011102706892415881, "learning_rate": 9.656229735988884e-07, "loss": 0.0006, "num_tokens": 102099698.0, "reward": 0.8824968934059143, "reward_std": 0.0, "rewards/reward_func/mean": 0.8824968934059143, "rewards/reward_func/std": 0.0, "step": 3712, "step_time": 20.44314457848668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 202.8125, "completions/mean_terminated_length": 202.8125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.25471294671297073, "epoch": 0.17197776748494673, "frac_reward_zero_std": 1.0, "grad_norm": 0.004278079140931368, "kl": 0.003029489715117961, "learning_rate": 9.656137100509495e-07, "loss": 0.0002, "num_tokens": 102128703.0, "reward": 0.5044883489608765, "reward_std": 0.0, "rewards/reward_func/mean": 0.5044883489608765, "rewards/reward_func/std": 0.0, "step": 3713, "step_time": 20.81208061054349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 166.3125, "completions/mean_terminated_length": 166.3125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.13849686458706856, "epoch": 0.17202408522464105, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012530670501291752, "kl": 0.0008675205754116178, "learning_rate": 9.656044465030107e-07, "loss": 0.0, "num_tokens": 102166548.0, "reward": 0.8385766744613647, "reward_std": 0.0, "rewards/reward_func/mean": 0.8385766744613647, "rewards/reward_func/std": 0.0, "step": 3714, "step_time": 19.745044253766537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 133.5625, "completions/mean_terminated_length": 133.5625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3137066662311554, "epoch": 0.17207040296433534, "frac_reward_zero_std": 1.0, "grad_norm": 0.008268541656434536, "kl": 0.0035453151213005185, "learning_rate": 9.655951829550718e-07, "loss": 0.0002, "num_tokens": 102187341.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3715, "step_time": 14.896617818623781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 122.125, "completions/mean_terminated_length": 122.125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3074679300189018, "epoch": 0.17211672070402964, "frac_reward_zero_std": 1.0, "grad_norm": 0.002979022217914462, "kl": 0.0019353233510628343, "learning_rate": 9.65585919407133e-07, "loss": 0.0001, "num_tokens": 102211311.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3716, "step_time": 14.561978124082088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 156.375, "completions/mean_terminated_length": 156.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.30533620715141296, "epoch": 0.17216303844372394, "frac_reward_zero_std": 1.0, "grad_norm": 0.014913151040673256, "kl": 0.011204065987840295, "learning_rate": 9.65576655859194e-07, "loss": 0.0006, "num_tokens": 102232309.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3717, "step_time": 16.69642524048686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 121.375, "completions/mean_terminated_length": 121.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2996649816632271, "epoch": 0.17220935618341826, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034739826805889606, "kl": 0.0024550510570406914, "learning_rate": 9.655673923112552e-07, "loss": 0.0001, "num_tokens": 102255083.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3718, "step_time": 14.627103310078382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 136.125, "completions/mean_terminated_length": 136.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.24994129687547684, "epoch": 0.17225567392311256, "frac_reward_zero_std": 1.0, "grad_norm": 0.003131583333015442, "kl": 0.001976861385628581, "learning_rate": 9.655581287633163e-07, "loss": 0.0001, "num_tokens": 102275805.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3719, "step_time": 14.621182892471552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 192.9375, "completions/mean_terminated_length": 192.9375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.2310882769525051, "epoch": 0.17230199166280685, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030810649041086435, "kl": 0.0019817696302197874, "learning_rate": 9.655488652153774e-07, "loss": 0.0001, "num_tokens": 102330524.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3720, "step_time": 27.614696621894836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 162.75, "completions/mean_terminated_length": 162.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.39921774715185165, "epoch": 0.17234830940250115, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016555432230234146, "kl": 0.002115784795023501, "learning_rate": 9.655396016674385e-07, "loss": 0.0001, "num_tokens": 102389976.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3721, "step_time": 27.416408576071262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 156.8125, "completions/mean_terminated_length": 156.8125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.19609655812382698, "epoch": 0.17239462714219547, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019529511919245124, "kl": 0.00153403909644112, "learning_rate": 9.655303381194997e-07, "loss": 0.0001, "num_tokens": 102410581.0, "reward": 0.7026185393333435, "reward_std": 0.0, "rewards/reward_func/mean": 0.7026185393333435, "rewards/reward_func/std": 0.0, "step": 3722, "step_time": 16.58870291337371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 158.75, "completions/mean_terminated_length": 158.75, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3258315548300743, "epoch": 0.17244094488188977, "frac_reward_zero_std": 1.0, "grad_norm": 0.006843021605163813, "kl": 0.004411225905641913, "learning_rate": 9.655210745715608e-07, "loss": 0.0002, "num_tokens": 102436433.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3723, "step_time": 17.99579330161214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 174.5, "completions/mean_terminated_length": 174.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.38945455104112625, "epoch": 0.17248726262158406, "frac_reward_zero_std": 1.0, "grad_norm": 0.007077403832226992, "kl": 0.005766460788436234, "learning_rate": 9.65511811023622e-07, "loss": 0.0003, "num_tokens": 102463545.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3724, "step_time": 19.768442127853632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 155.25, "completions/mean_terminated_length": 155.25, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.4106160178780556, "epoch": 0.17253358036127836, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020999412517994642, "kl": 0.0023164761951193213, "learning_rate": 9.655025474756833e-07, "loss": 0.0001, "num_tokens": 102498701.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3725, "step_time": 19.501278955489397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 191.75, "completions/mean_terminated_length": 191.75, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.38444218039512634, "epoch": 0.17257989810097268, "frac_reward_zero_std": 1.0, "grad_norm": 0.005422711372375488, "kl": 0.003909895662218332, "learning_rate": 9.654932839277444e-07, "loss": 0.0002, "num_tokens": 102542169.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3726, "step_time": 24.480846971273422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 213.75, "completions/mean_terminated_length": 213.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.17081598192453384, "epoch": 0.17262621584066698, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026369125116616488, "kl": 0.002134683105396107, "learning_rate": 9.654840203798055e-07, "loss": 0.0001, "num_tokens": 102581845.0, "reward": 0.8507331609725952, "reward_std": 0.0, "rewards/reward_func/mean": 0.8507331609725952, "rewards/reward_func/std": 0.0, "step": 3727, "step_time": 23.693589121103287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 137.0625, "completions/mean_terminated_length": 137.0625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.28495217114686966, "epoch": 0.17267253358036128, "frac_reward_zero_std": 1.0, "grad_norm": 0.00954211875796318, "kl": 0.005100173759274185, "learning_rate": 9.654747568318664e-07, "loss": 0.0003, "num_tokens": 102603782.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3728, "step_time": 15.57606941089034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 148.375, "completions/mean_terminated_length": 148.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.2724595218896866, "epoch": 0.17271885132005557, "frac_reward_zero_std": 1.0, "grad_norm": 0.004697760101407766, "kl": 0.0028598178178071976, "learning_rate": 9.654654932839278e-07, "loss": 0.0001, "num_tokens": 102625148.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3729, "step_time": 15.565168585628271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 190.125, "completions/mean_terminated_length": 190.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.20056196674704552, "epoch": 0.1727651690597499, "frac_reward_zero_std": 1.0, "grad_norm": 0.004472844768315554, "kl": 0.01144473161548376, "learning_rate": 9.654562297359889e-07, "loss": 0.0006, "num_tokens": 102654686.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3730, "step_time": 20.77423469349742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 128.1875, "completions/mean_terminated_length": 128.1875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.248933307826519, "epoch": 0.1728114867994442, "frac_reward_zero_std": 1.0, "grad_norm": 0.004179767332971096, "kl": 0.0024145441129803658, "learning_rate": 9.6544696618805e-07, "loss": 0.0001, "num_tokens": 102675553.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3731, "step_time": 13.76466766744852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 193.125, "completions/mean_terminated_length": 193.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.21037736907601357, "epoch": 0.1728578045391385, "frac_reward_zero_std": 1.0, "grad_norm": 0.003996069543063641, "kl": 0.00316495617153123, "learning_rate": 9.654377026401111e-07, "loss": 0.0002, "num_tokens": 102703443.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3732, "step_time": 19.998630672693253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 160.3125, "completions/mean_terminated_length": 160.3125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.2855217680335045, "epoch": 0.17290412227883278, "frac_reward_zero_std": 0.0, "grad_norm": 0.16108264029026031, "kl": 0.022220322862267494, "learning_rate": 9.654284390921723e-07, "loss": -0.0756, "num_tokens": 102724536.0, "reward": 0.8462153673171997, "reward_std": 0.3358876705169678, "rewards/reward_func/mean": 0.8462153673171997, "rewards/reward_func/std": 0.3358876705169678, "step": 3733, "step_time": 18.034131448715925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 200.875, "completions/mean_terminated_length": 200.875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.36467454582452774, "epoch": 0.1729504400185271, "frac_reward_zero_std": 1.0, "grad_norm": 0.007738959975540638, "kl": 0.007918860763311386, "learning_rate": 9.654191755442334e-07, "loss": 0.0004, "num_tokens": 102753670.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3734, "step_time": 23.042970154434443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 158.3125, "completions/mean_terminated_length": 158.3125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3759901747107506, "epoch": 0.1729967577582214, "frac_reward_zero_std": 1.0, "grad_norm": 0.005353032145649195, "kl": 0.004246300901286304, "learning_rate": 9.654099119962945e-07, "loss": 0.0002, "num_tokens": 102777931.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3735, "step_time": 17.61615765839815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 142.25, "completions/mean_terminated_length": 142.25, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.207004364579916, "epoch": 0.1730430754979157, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030513142701238394, "kl": 0.0021838270185980946, "learning_rate": 9.654006484483556e-07, "loss": 0.0001, "num_tokens": 102803551.0, "reward": 0.904837429523468, "reward_std": 0.0, "rewards/reward_func/mean": 0.904837429523468, "rewards/reward_func/std": 0.0, "step": 3736, "step_time": 17.92522521317005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 302.625, "completions/mean_terminated_length": 302.625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.2569631487131119, "epoch": 0.17308939323761, "frac_reward_zero_std": 0.0, "grad_norm": 0.07253891229629517, "kl": 0.009775074431672692, "learning_rate": 9.653913849004168e-07, "loss": -0.1115, "num_tokens": 102833001.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 3737, "step_time": 32.41397521272302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 144.375, "completions/mean_terminated_length": 144.375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.28750574588775635, "epoch": 0.17313571097730432, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025953215081244707, "kl": 0.0023064085689838976, "learning_rate": 9.653821213524779e-07, "loss": 0.0001, "num_tokens": 102856671.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3738, "step_time": 16.143691390752792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 139.5625, "completions/mean_terminated_length": 139.5625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3195468336343765, "epoch": 0.17318202871699862, "frac_reward_zero_std": 1.0, "grad_norm": 0.003679557703435421, "kl": 0.0026374868175480515, "learning_rate": 9.653728578045392e-07, "loss": 0.0001, "num_tokens": 102877288.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3739, "step_time": 15.23041708394885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 117.3125, "completions/mean_terminated_length": 117.3125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2956790328025818, "epoch": 0.1732283464566929, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032189737539738417, "kl": 0.0019083183142356575, "learning_rate": 9.653635942566001e-07, "loss": 0.0001, "num_tokens": 102898493.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3740, "step_time": 13.946897964924574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 179.5, "completions/mean_terminated_length": 179.5, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.41823677718639374, "epoch": 0.1732746641963872, "frac_reward_zero_std": 1.0, "grad_norm": 0.003907694946974516, "kl": 0.0029561029514297843, "learning_rate": 9.653543307086613e-07, "loss": 0.0001, "num_tokens": 102940277.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3741, "step_time": 23.860097888857126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 118.9375, "completions/mean_terminated_length": 118.9375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3027636259794235, "epoch": 0.17332098193608153, "frac_reward_zero_std": 1.0, "grad_norm": 0.001629327773116529, "kl": 0.0017339542391709983, "learning_rate": 9.653450671607226e-07, "loss": 0.0001, "num_tokens": 102961412.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3742, "step_time": 13.017041232436895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 198.1875, "completions/mean_terminated_length": 198.1875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.20047512650489807, "epoch": 0.17336729967577583, "frac_reward_zero_std": 1.0, "grad_norm": 0.001798186800442636, "kl": 0.0012854207889176905, "learning_rate": 9.653358036127837e-07, "loss": 0.0001, "num_tokens": 102996583.0, "reward": 0.11362193524837494, "reward_std": 0.0, "rewards/reward_func/mean": 0.11362193524837494, "rewards/reward_func/std": 0.0, "step": 3743, "step_time": 22.757054530084133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 135.5625, "completions/mean_terminated_length": 135.5625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3016543537378311, "epoch": 0.17341361741547012, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034233916085213423, "kl": 0.002116954739904031, "learning_rate": 9.653265400648448e-07, "loss": 0.0001, "num_tokens": 103018176.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3744, "step_time": 15.639807790517807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 259.9375, "completions/mean_terminated_length": 259.9375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.4923515170812607, "epoch": 0.17345993515516442, "frac_reward_zero_std": 0.0, "grad_norm": 0.07116729021072388, "kl": 0.010051576886326075, "learning_rate": 9.65317276516906e-07, "loss": -0.0145, "num_tokens": 103051183.0, "reward": 0.11555609852075577, "reward_std": 0.31404364109039307, "rewards/reward_func/mean": 0.11555609852075577, "rewards/reward_func/std": 0.3140436112880707, "step": 3745, "step_time": 34.470788452774286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 115.125, "completions/mean_terminated_length": 115.125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2561406195163727, "epoch": 0.17350625289485874, "frac_reward_zero_std": 1.0, "grad_norm": 0.004937897901982069, "kl": 0.003161235072184354, "learning_rate": 9.65308012968967e-07, "loss": 0.0002, "num_tokens": 103070497.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3746, "step_time": 14.427247531712055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 140.9375, "completions/mean_terminated_length": 140.9375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3614794984459877, "epoch": 0.17355257063455304, "frac_reward_zero_std": 1.0, "grad_norm": 0.006700413767248392, "kl": 0.00337793497601524, "learning_rate": 9.652987494210282e-07, "loss": 0.0002, "num_tokens": 103101888.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3747, "step_time": 18.192476235330105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 148.1875, "completions/mean_terminated_length": 148.1875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.16572047024965286, "epoch": 0.17359888837424733, "frac_reward_zero_std": 1.0, "grad_norm": 0.010025094263255596, "kl": 0.0083791270153597, "learning_rate": 9.652894858730893e-07, "loss": 0.0004, "num_tokens": 103124147.0, "reward": 0.6778095960617065, "reward_std": 0.0, "rewards/reward_func/mean": 0.6778095960617065, "rewards/reward_func/std": 0.0, "step": 3748, "step_time": 16.14195951446891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 153.0, "completions/mean_terminated_length": 153.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.19808354601264, "epoch": 0.17364520611394163, "frac_reward_zero_std": 1.0, "grad_norm": 0.0061765736900269985, "kl": 0.004112478287424892, "learning_rate": 9.652802223251505e-07, "loss": 0.0002, "num_tokens": 103144947.0, "reward": 0.780767560005188, "reward_std": 0.0, "rewards/reward_func/mean": 0.780767560005188, "rewards/reward_func/std": 0.0, "step": 3749, "step_time": 15.514467250555754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 234.5625, "completions/mean_terminated_length": 234.5625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.20365862175822258, "epoch": 0.17369152385363595, "frac_reward_zero_std": 1.0, "grad_norm": 0.005523050203919411, "kl": 0.005299078766256571, "learning_rate": 9.652709587772116e-07, "loss": 0.0003, "num_tokens": 103168860.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3750, "step_time": 22.849873408675194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 158.3125, "completions/mean_terminated_length": 158.3125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.23281722888350487, "epoch": 0.17373784159333025, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035154789220541716, "kl": 0.0028017257573083043, "learning_rate": 9.652616952292727e-07, "loss": 0.0001, "num_tokens": 103200529.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 3751, "step_time": 18.584865167737007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 180.4375, "completions/mean_terminated_length": 180.4375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.31790756434202194, "epoch": 0.17378415933302455, "frac_reward_zero_std": 0.0, "grad_norm": 0.09971493482589722, "kl": 0.0063216263661161065, "learning_rate": 9.652524316813338e-07, "loss": 0.0316, "num_tokens": 103222216.0, "reward": 0.8482850790023804, "reward_std": 0.2262093424797058, "rewards/reward_func/mean": 0.8482850790023804, "rewards/reward_func/std": 0.226209357380867, "step": 3752, "step_time": 20.639958526939154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 147.9375, "completions/mean_terminated_length": 147.9375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.29077449440956116, "epoch": 0.17383047707271884, "frac_reward_zero_std": 1.0, "grad_norm": 0.00254775770008564, "kl": 0.002161647193133831, "learning_rate": 9.65243168133395e-07, "loss": 0.0001, "num_tokens": 103245623.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3753, "step_time": 15.709707036614418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 167.875, "completions/mean_terminated_length": 167.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.21284860745072365, "epoch": 0.17387679481241317, "frac_reward_zero_std": 1.0, "grad_norm": 0.007645934820175171, "kl": 0.024556422606110573, "learning_rate": 9.65233904585456e-07, "loss": 0.0012, "num_tokens": 103266277.0, "reward": 0.9428731203079224, "reward_std": 0.0, "rewards/reward_func/mean": 0.9428731203079224, "rewards/reward_func/std": 0.0, "step": 3754, "step_time": 19.057133305817842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 143.875, "completions/mean_terminated_length": 143.875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2866797000169754, "epoch": 0.17392311255210746, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035950294695794582, "kl": 0.002579561376478523, "learning_rate": 9.652246410375174e-07, "loss": 0.0001, "num_tokens": 103288259.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3755, "step_time": 15.143626194447279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 164.875, "completions/mean_terminated_length": 164.875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3602429926395416, "epoch": 0.17396943029180176, "frac_reward_zero_std": 1.0, "grad_norm": 0.00384682253934443, "kl": 0.0033572075772099197, "learning_rate": 9.652153774895786e-07, "loss": 0.0002, "num_tokens": 103334785.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3756, "step_time": 22.83459320664406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 167.4375, "completions/mean_terminated_length": 167.4375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.28992898762226105, "epoch": 0.17401574803149605, "frac_reward_zero_std": 1.0, "grad_norm": 0.006469201762229204, "kl": 0.004892459022812545, "learning_rate": 9.652061139416397e-07, "loss": 0.0002, "num_tokens": 103359816.0, "reward": 0.694277822971344, "reward_std": 0.0, "rewards/reward_func/mean": 0.694277822971344, "rewards/reward_func/std": 0.0, "step": 3757, "step_time": 18.62649766355753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 137.6875, "completions/mean_terminated_length": 137.6875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3078532963991165, "epoch": 0.17406206577119038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0054520429112017155, "kl": 0.002901038737036288, "learning_rate": 9.651968503937006e-07, "loss": 0.0001, "num_tokens": 103383187.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3758, "step_time": 14.65495702624321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 129.6875, "completions/mean_terminated_length": 129.6875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2888607755303383, "epoch": 0.17410838351088467, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038695086259394884, "kl": 0.00270639342488721, "learning_rate": 9.65187586845762e-07, "loss": 0.0001, "num_tokens": 103403342.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3759, "step_time": 15.090375158935785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 134.875, "completions/mean_terminated_length": 134.875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3306100144982338, "epoch": 0.17415470125057897, "frac_reward_zero_std": 1.0, "grad_norm": 0.011807381175458431, "kl": 0.005443289061076939, "learning_rate": 9.65178323297823e-07, "loss": 0.0003, "num_tokens": 103431916.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3760, "step_time": 16.116522643715143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 145.5, "completions/mean_terminated_length": 145.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.21775106713175774, "epoch": 0.17420101899027327, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031909984536468983, "kl": 0.002064206579234451, "learning_rate": 9.651690597498842e-07, "loss": 0.0001, "num_tokens": 103451748.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3761, "step_time": 15.601998090744019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 171.4375, "completions/mean_terminated_length": 171.4375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.1548592373728752, "epoch": 0.1742473367299676, "frac_reward_zero_std": 0.0, "grad_norm": 0.09636900573968887, "kl": 0.0011171667283633724, "learning_rate": 9.651597962019453e-07, "loss": 0.0067, "num_tokens": 103480795.0, "reward": 0.939050555229187, "reward_std": 0.0152902128174901, "rewards/reward_func/mean": 0.939050555229187, "rewards/reward_func/std": 0.015290215611457825, "step": 3762, "step_time": 18.289710648357868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 276.9375, "completions/mean_terminated_length": 276.9375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.28093042969703674, "epoch": 0.17429365446966189, "frac_reward_zero_std": 0.0, "grad_norm": 0.09751872718334198, "kl": 0.009608272463083267, "learning_rate": 9.651505326540064e-07, "loss": -0.0371, "num_tokens": 103520666.0, "reward": 0.857061505317688, "reward_std": 0.11662736535072327, "rewards/reward_func/mean": 0.857061505317688, "rewards/reward_func/std": 0.11662737280130386, "step": 3763, "step_time": 29.446891706436872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 138.6875, "completions/mean_terminated_length": 138.6875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.37999802827835083, "epoch": 0.17433997220935618, "frac_reward_zero_std": 1.0, "grad_norm": 0.009503517299890518, "kl": 0.005113681661896408, "learning_rate": 9.651412691060676e-07, "loss": 0.0003, "num_tokens": 103545717.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3764, "step_time": 16.105688523501158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 176.625, "completions/mean_terminated_length": 176.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3866517096757889, "epoch": 0.17438628994905048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031348082702606916, "kl": 0.0024833722854964435, "learning_rate": 9.651320055581287e-07, "loss": 0.0001, "num_tokens": 103577535.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3765, "step_time": 19.784521024674177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 199.75, "completions/mean_terminated_length": 199.75, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.46232395619153976, "epoch": 0.1744326076887448, "frac_reward_zero_std": 1.0, "grad_norm": 0.006246357224881649, "kl": 0.004766554455272853, "learning_rate": 9.651227420101898e-07, "loss": 0.0002, "num_tokens": 103611643.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3766, "step_time": 24.84987773373723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 168.5, "completions/mean_terminated_length": 168.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.34058190137147903, "epoch": 0.1744789254284391, "frac_reward_zero_std": 1.0, "grad_norm": 0.005178524646908045, "kl": 0.003166097041685134, "learning_rate": 9.65113478462251e-07, "loss": 0.0002, "num_tokens": 103645011.0, "reward": 0.0024787522852420807, "reward_std": 0.0, "rewards/reward_func/mean": 0.0024787522852420807, "rewards/reward_func/std": 0.0, "step": 3767, "step_time": 21.10859439522028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 148.0625, "completions/mean_terminated_length": 148.0625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.40794871747493744, "epoch": 0.1745252431681334, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030148965306580067, "kl": 0.002487560792360455, "learning_rate": 9.65104214914312e-07, "loss": 0.0001, "num_tokens": 103670820.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3768, "step_time": 16.841842528432608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 217.75, "completions/mean_terminated_length": 217.75, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.23108908906579018, "epoch": 0.1745715609078277, "frac_reward_zero_std": 0.0, "grad_norm": 0.09400482475757599, "kl": 0.00745700323022902, "learning_rate": 9.650949513663734e-07, "loss": 0.0083, "num_tokens": 103693104.0, "reward": 0.9030756950378418, "reward_std": 0.10626877844333649, "rewards/reward_func/mean": 0.9030756950378418, "rewards/reward_func/std": 0.10626878589391708, "step": 3769, "step_time": 20.494326047599316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 204.9375, "completions/mean_terminated_length": 204.9375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.24559643864631653, "epoch": 0.174617878647522, "frac_reward_zero_std": 0.0, "grad_norm": 0.0932474210858345, "kl": 0.021621070336550474, "learning_rate": 9.650856878184345e-07, "loss": -0.0401, "num_tokens": 103718927.0, "reward": 0.6158918142318726, "reward_std": 0.25134479999542236, "rewards/reward_func/mean": 0.6158918142318726, "rewards/reward_func/std": 0.25134482979774475, "step": 3770, "step_time": 21.515521958470345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 154.3125, "completions/mean_terminated_length": 154.3125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.14618897810578346, "epoch": 0.1746641963872163, "frac_reward_zero_std": 0.0, "grad_norm": 0.16064424812793732, "kl": 0.004377821343950927, "learning_rate": 9.650764242704954e-07, "loss": -0.0244, "num_tokens": 103756196.0, "reward": 0.4293261468410492, "reward_std": 0.0618918351829052, "rewards/reward_func/mean": 0.4293261468410492, "rewards/reward_func/std": 0.061891838908195496, "step": 3771, "step_time": 20.11259887367487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 172.9375, "completions/mean_terminated_length": 172.9375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.23949642106890678, "epoch": 0.1747105141269106, "frac_reward_zero_std": 1.0, "grad_norm": 0.00627810787409544, "kl": 0.005583815509453416, "learning_rate": 9.650671607225568e-07, "loss": 0.0003, "num_tokens": 103783011.0, "reward": 0.9534969329833984, "reward_std": 0.0, "rewards/reward_func/mean": 0.9534969329833984, "rewards/reward_func/std": 0.0, "step": 3772, "step_time": 25.637911964207888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.30497778952121735, "epoch": 0.1747568318666049, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022148836869746447, "kl": 0.0017805375391617417, "learning_rate": 9.650578971746179e-07, "loss": 0.0001, "num_tokens": 103809563.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3773, "step_time": 16.14947897940874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 176.5, "completions/mean_terminated_length": 176.5, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.1669195331633091, "epoch": 0.17480314960629922, "frac_reward_zero_std": 0.0, "grad_norm": 0.09515602141618729, "kl": 0.002404117549303919, "learning_rate": 9.65048633626679e-07, "loss": -0.0007, "num_tokens": 103833859.0, "reward": 0.7707434892654419, "reward_std": 0.29473787546157837, "rewards/reward_func/mean": 0.7707434892654419, "rewards/reward_func/std": 0.29473790526390076, "step": 3774, "step_time": 19.48267900198698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 270.625, "completions/mean_terminated_length": 270.625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "entropy": 0.18492351472377777, "epoch": 0.17484946734599352, "frac_reward_zero_std": 1.0, "grad_norm": 0.004487346392124891, "kl": 0.005292492685839534, "learning_rate": 9.650393700787401e-07, "loss": 0.0003, "num_tokens": 103860621.0, "reward": 0.8445175886154175, "reward_std": 0.0, "rewards/reward_func/mean": 0.8445175886154175, "rewards/reward_func/std": 0.0, "step": 3775, "step_time": 25.96353581547737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 227.3125, "completions/mean_terminated_length": 227.3125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.3666505515575409, "epoch": 0.17489578508568782, "frac_reward_zero_std": 0.0, "grad_norm": 0.10239812731742859, "kl": 0.013023757841438055, "learning_rate": 9.650301065308013e-07, "loss": -0.1897, "num_tokens": 103899666.0, "reward": 0.2180238515138626, "reward_std": 0.3339870870113373, "rewards/reward_func/mean": 0.2180238515138626, "rewards/reward_func/std": 0.33398711681365967, "step": 3776, "step_time": 32.03803377598524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 174.1875, "completions/mean_terminated_length": 174.1875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.21427936851978302, "epoch": 0.1749421028253821, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031666303984820843, "kl": 0.0021426088642328978, "learning_rate": 9.650208429828624e-07, "loss": 0.0001, "num_tokens": 103936917.0, "reward": 0.8824968934059143, "reward_std": 0.0, "rewards/reward_func/mean": 0.8824968934059143, "rewards/reward_func/std": 0.0, "step": 3777, "step_time": 21.710630007088184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 186.5625, "completions/mean_terminated_length": 186.5625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.42350853979587555, "epoch": 0.17498842056507644, "frac_reward_zero_std": 1.0, "grad_norm": 0.006017395295202732, "kl": 0.004326632770244032, "learning_rate": 9.650115794349235e-07, "loss": 0.0002, "num_tokens": 103971358.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3778, "step_time": 22.553493205457926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 180.75, "completions/mean_terminated_length": 180.75, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.2004511021077633, "epoch": 0.17503473830477073, "frac_reward_zero_std": 0.0, "grad_norm": 0.12609823048114777, "kl": 0.06032711360603571, "learning_rate": 9.650023158869846e-07, "loss": 0.024, "num_tokens": 103992970.0, "reward": 0.9580358266830444, "reward_std": 0.16785673797130585, "rewards/reward_func/mean": 0.9580358266830444, "rewards/reward_func/std": 0.16785675287246704, "step": 3779, "step_time": 19.642533019185066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 245.0625, "completions/mean_terminated_length": 245.0625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.36056920886039734, "epoch": 0.17508105604446503, "frac_reward_zero_std": 0.0, "grad_norm": 0.1281254142522812, "kl": 0.014152311254292727, "learning_rate": 9.649930523390458e-07, "loss": -0.1922, "num_tokens": 104032363.0, "reward": 0.3268398940563202, "reward_std": 0.33202239871025085, "rewards/reward_func/mean": 0.3268398940563202, "rewards/reward_func/std": 0.33202236890792847, "step": 3780, "step_time": 33.77379969879985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 168.9375, "completions/mean_terminated_length": 168.9375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3546975255012512, "epoch": 0.17512737378415932, "frac_reward_zero_std": 1.0, "grad_norm": 0.00474060233682394, "kl": 0.003740232961717993, "learning_rate": 9.649837887911069e-07, "loss": 0.0002, "num_tokens": 104052906.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3781, "step_time": 16.384960014373064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 163.0, "completions/mean_terminated_length": 163.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.16587354615330696, "epoch": 0.17517369152385365, "frac_reward_zero_std": 0.0, "grad_norm": 0.2559323012828827, "kl": 0.02330538514070213, "learning_rate": 9.649745252431682e-07, "loss": 0.0037, "num_tokens": 104076634.0, "reward": 0.6426084041595459, "reward_std": 0.16432462632656097, "rewards/reward_func/mean": 0.6426084041595459, "rewards/reward_func/std": 0.16432464122772217, "step": 3782, "step_time": 16.389335557818413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 180.9375, "completions/mean_terminated_length": 180.9375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.44991335272789, "epoch": 0.17522000926354794, "frac_reward_zero_std": 1.0, "grad_norm": 0.007366468198597431, "kl": 0.005784983513876796, "learning_rate": 9.649652616952291e-07, "loss": 0.0003, "num_tokens": 104100793.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3783, "step_time": 21.555057518184185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 132.375, "completions/mean_terminated_length": 132.375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2406536191701889, "epoch": 0.17526632700324224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029314288403838873, "kl": 0.001833113085012883, "learning_rate": 9.649559981472903e-07, "loss": 0.0001, "num_tokens": 104120319.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3784, "step_time": 13.804068814963102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 134.6875, "completions/mean_terminated_length": 134.6875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3224761486053467, "epoch": 0.17531264474293654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015656468458473682, "kl": 0.0014824274694547057, "learning_rate": 9.649467345993516e-07, "loss": 0.0001, "num_tokens": 104147322.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3785, "step_time": 15.190575946122408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 120.0625, "completions/mean_terminated_length": 120.0625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.26283227279782295, "epoch": 0.17535896248263086, "frac_reward_zero_std": 1.0, "grad_norm": 0.004732994362711906, "kl": 0.0025724535225890577, "learning_rate": 9.649374710514127e-07, "loss": 0.0001, "num_tokens": 104166843.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3786, "step_time": 13.980366911739111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 195.4375, "completions/mean_terminated_length": 195.4375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.4102611541748047, "epoch": 0.17540528022232516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037792969960719347, "kl": 0.0041391217964701355, "learning_rate": 9.649282075034738e-07, "loss": 0.0002, "num_tokens": 104213282.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3787, "step_time": 24.304872281849384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 238.0, "completions/mean_terminated_length": 238.0, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.5076296031475067, "epoch": 0.17545159796201945, "frac_reward_zero_std": 0.0, "grad_norm": 0.10037500411272049, "kl": 0.008740164805203676, "learning_rate": 9.64918943955535e-07, "loss": 0.1476, "num_tokens": 104237106.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 3788, "step_time": 30.68525357171893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 178.625, "completions/mean_terminated_length": 178.625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.16099544242024422, "epoch": 0.17549791570171375, "frac_reward_zero_std": 1.0, "grad_norm": 0.004643074236810207, "kl": 0.003465694549959153, "learning_rate": 9.64909680407596e-07, "loss": 0.0002, "num_tokens": 104260924.0, "reward": 0.9591894745826721, "reward_std": 0.0, "rewards/reward_func/mean": 0.9591894745826721, "rewards/reward_func/std": 0.0, "step": 3789, "step_time": 19.04364400729537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 110.4375, "completions/mean_terminated_length": 110.4375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.30926015973091125, "epoch": 0.17554423344140807, "frac_reward_zero_std": 1.0, "grad_norm": 0.003996169660240412, "kl": 0.0023928280570544302, "learning_rate": 9.649004168596572e-07, "loss": 0.0001, "num_tokens": 104281715.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3790, "step_time": 12.806249611079693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 118.375, "completions/mean_terminated_length": 118.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2586973272264004, "epoch": 0.17559055118110237, "frac_reward_zero_std": 1.0, "grad_norm": 0.002421583281829953, "kl": 0.0017168657795991749, "learning_rate": 9.648911533117183e-07, "loss": 0.0001, "num_tokens": 104305753.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3791, "step_time": 13.677399579435587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 140.4375, "completions/mean_terminated_length": 140.4375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.28452108800411224, "epoch": 0.17563686892079666, "frac_reward_zero_std": 1.0, "grad_norm": 0.004345900844782591, "kl": 0.001882482465589419, "learning_rate": 9.648818897637795e-07, "loss": 0.0001, "num_tokens": 104327584.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3792, "step_time": 15.165101058781147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 168.75, "completions/mean_terminated_length": 168.75, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.25236088037490845, "epoch": 0.17568318666049096, "frac_reward_zero_std": 0.0, "grad_norm": 0.12061569839715958, "kl": 0.022313192021101713, "learning_rate": 9.648726262158406e-07, "loss": -0.0775, "num_tokens": 104350268.0, "reward": 0.4090709090232849, "reward_std": 0.2306751012802124, "rewards/reward_func/mean": 0.4090709090232849, "rewards/reward_func/std": 0.2306751012802124, "step": 3793, "step_time": 21.085606180131435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 153.4375, "completions/mean_terminated_length": 153.4375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.4003930240869522, "epoch": 0.17572950440018528, "frac_reward_zero_std": 1.0, "grad_norm": 0.002661115489900112, "kl": 0.0022960519418120384, "learning_rate": 9.648633626679017e-07, "loss": 0.0001, "num_tokens": 104385251.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3794, "step_time": 19.423167020082474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 117.125, "completions/mean_terminated_length": 117.125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2657902389764786, "epoch": 0.17577582213987958, "frac_reward_zero_std": 1.0, "grad_norm": 0.003785985754802823, "kl": 0.0022969872516114265, "learning_rate": 9.648540991199628e-07, "loss": 0.0001, "num_tokens": 104405925.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3795, "step_time": 12.784420773386955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 147.625, "completions/mean_terminated_length": 147.625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3875296860933304, "epoch": 0.17582213987957387, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031278333626687527, "kl": 0.0026590877096168697, "learning_rate": 9.64844835572024e-07, "loss": 0.0001, "num_tokens": 104431727.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3796, "step_time": 16.57256530225277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 294.1875, "completions/mean_terminated_length": 294.1875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.24025741964578629, "epoch": 0.17586845761926817, "frac_reward_zero_std": 0.0, "grad_norm": 0.07518387585878372, "kl": 0.014827840030193329, "learning_rate": 9.64835572024085e-07, "loss": -0.0995, "num_tokens": 104471890.0, "reward": 0.7640482187271118, "reward_std": 0.3236340582370758, "rewards/reward_func/mean": 0.7640482187271118, "rewards/reward_func/std": 0.3236340880393982, "step": 3797, "step_time": 30.441859886050224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 191.0, "completions/mean_terminated_length": 191.0, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.3019564151763916, "epoch": 0.1759147753589625, "frac_reward_zero_std": 0.0, "grad_norm": 0.12074263393878937, "kl": 0.010099475388415158, "learning_rate": 9.648263084761462e-07, "loss": -0.0083, "num_tokens": 104494066.0, "reward": 0.962847113609314, "reward_std": 0.02586994878947735, "rewards/reward_func/mean": 0.962847113609314, "rewards/reward_func/std": 0.02586994506418705, "step": 3798, "step_time": 20.14840253815055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 236.5, "completions/mean_terminated_length": 236.5, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.44835473597049713, "epoch": 0.1759610930986568, "frac_reward_zero_std": 0.0, "grad_norm": 0.10527169704437256, "kl": 0.004380170023068786, "learning_rate": 9.648170449282076e-07, "loss": 0.124, "num_tokens": 104516666.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 3799, "step_time": 26.33690120279789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 134.4375, "completions/mean_terminated_length": 134.4375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.35252758860588074, "epoch": 0.17600741083835109, "frac_reward_zero_std": 1.0, "grad_norm": 0.005306762643158436, "kl": 0.0034655986819416285, "learning_rate": 9.648077813802687e-07, "loss": 0.0002, "num_tokens": 104536689.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3800, "step_time": 14.679051656275988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 157.3125, "completions/mean_terminated_length": 157.3125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.24515363201498985, "epoch": 0.17605372857804538, "frac_reward_zero_std": 0.0, "grad_norm": 0.14069297909736633, "kl": 0.0238860035315156, "learning_rate": 9.647985178323296e-07, "loss": -0.0122, "num_tokens": 104558486.0, "reward": 0.464751660823822, "reward_std": 0.3110589385032654, "rewards/reward_func/mean": 0.464751660823822, "rewards/reward_func/std": 0.31105896830558777, "step": 3801, "step_time": 17.15539462864399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 187.375, "completions/mean_terminated_length": 187.375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.3115087226033211, "epoch": 0.1761000463177397, "frac_reward_zero_std": 0.0, "grad_norm": 0.30319005250930786, "kl": 0.023208333179354668, "learning_rate": 9.64789254284391e-07, "loss": -0.0291, "num_tokens": 104584700.0, "reward": 0.5386459827423096, "reward_std": 0.4325544536113739, "rewards/reward_func/mean": 0.5386459827423096, "rewards/reward_func/std": 0.4325544536113739, "step": 3802, "step_time": 21.620222613215446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 211.8125, "completions/mean_terminated_length": 211.8125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.20918000489473343, "epoch": 0.176146364057434, "frac_reward_zero_std": 0.0, "grad_norm": 0.15997625887393951, "kl": 0.014598413603380322, "learning_rate": 9.64779990736452e-07, "loss": -0.1067, "num_tokens": 104623193.0, "reward": 0.5002998113632202, "reward_std": 0.29802316427230835, "rewards/reward_func/mean": 0.5002998113632202, "rewards/reward_func/std": 0.29802316427230835, "step": 3803, "step_time": 27.53459670767188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 138.0625, "completions/mean_terminated_length": 138.0625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.31225357949733734, "epoch": 0.1761926817971283, "frac_reward_zero_std": 1.0, "grad_norm": 0.003856898285448551, "kl": 0.0028518750332295895, "learning_rate": 9.647707271885132e-07, "loss": 0.0001, "num_tokens": 104644970.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3804, "step_time": 14.3464663811028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 160.3125, "completions/mean_terminated_length": 160.3125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.17767687886953354, "epoch": 0.1762389995368226, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013742530718445778, "kl": 0.00119293577154167, "learning_rate": 9.647614636405743e-07, "loss": 0.0001, "num_tokens": 104697951.0, "reward": 0.8751733303070068, "reward_std": 0.0, "rewards/reward_func/mean": 0.8751733303070068, "rewards/reward_func/std": 0.0, "step": 3805, "step_time": 23.665065202862024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 121.0625, "completions/mean_terminated_length": 121.0625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.29276765137910843, "epoch": 0.17628531727651692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026806010864675045, "kl": 0.0021326115529518574, "learning_rate": 9.647522000926354e-07, "loss": 0.0001, "num_tokens": 104718336.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3806, "step_time": 13.176405761390924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 100.1875, "completions/mean_terminated_length": 100.1875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.29868993163108826, "epoch": 0.1763316350162112, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025214203633368015, "kl": 0.0015813722275197506, "learning_rate": 9.647429365446966e-07, "loss": 0.0001, "num_tokens": 104740259.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3807, "step_time": 12.031390871852636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 139.125, "completions/mean_terminated_length": 139.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.2776578664779663, "epoch": 0.1763779527559055, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035993652418255806, "kl": 0.0025423296028748155, "learning_rate": 9.647336729967577e-07, "loss": 0.0001, "num_tokens": 104763749.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3808, "step_time": 15.653035368770361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.12160328961908817, "epoch": 0.1764242704955998, "frac_reward_zero_std": 1.0, "grad_norm": 0.005729260388761759, "kl": 0.005660561204422265, "learning_rate": 9.647244094488188e-07, "loss": 0.0003, "num_tokens": 104786385.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3809, "step_time": 16.994318760931492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 138.8125, "completions/mean_terminated_length": 138.8125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3268174082040787, "epoch": 0.17647058823529413, "frac_reward_zero_std": 1.0, "grad_norm": 0.0053449454717338085, "kl": 0.0030448375619016588, "learning_rate": 9.6471514590088e-07, "loss": 0.0002, "num_tokens": 104813918.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3810, "step_time": 17.80192095041275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 188.4375, "completions/mean_terminated_length": 188.4375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.27199970930814743, "epoch": 0.17651690597498843, "frac_reward_zero_std": 0.0, "grad_norm": 0.12092722207307816, "kl": 0.011022645980119705, "learning_rate": 9.64705882352941e-07, "loss": -0.1191, "num_tokens": 104835477.0, "reward": 0.30022096633911133, "reward_std": 0.4599035382270813, "rewards/reward_func/mean": 0.30022096633911133, "rewards/reward_func/std": 0.4599035382270813, "step": 3811, "step_time": 23.86600050330162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 164.0625, "completions/mean_terminated_length": 164.0625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.428058497607708, "epoch": 0.17656322371468272, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022345797624439, "kl": 0.0023550239857286215, "learning_rate": 9.646966188050024e-07, "loss": 0.0001, "num_tokens": 104870454.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3812, "step_time": 20.78255834430456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 163.4375, "completions/mean_terminated_length": 163.4375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3997066244482994, "epoch": 0.17660954145437702, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022781190928071737, "kl": 0.002448283543344587, "learning_rate": 9.646873552570635e-07, "loss": 0.0001, "num_tokens": 104923053.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3813, "step_time": 24.927198097109795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 208.5625, "completions/mean_terminated_length": 208.5625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.44674404710531235, "epoch": 0.17665585919407134, "frac_reward_zero_std": 0.0, "grad_norm": 0.08394753187894821, "kl": 0.007794674369506538, "learning_rate": 9.646780917091244e-07, "loss": -0.0347, "num_tokens": 104963014.0, "reward": 0.00010168392327614129, "reward_std": 0.00040673569310456514, "rewards/reward_func/mean": 0.00010168392327614129, "rewards/reward_func/std": 0.0004067357222083956, "step": 3814, "step_time": 25.669972147792578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 167.0, "completions/mean_terminated_length": 167.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.16508163511753082, "epoch": 0.17670217693376564, "frac_reward_zero_std": 1.0, "grad_norm": 0.001488029258325696, "kl": 0.001016217895084992, "learning_rate": 9.646688281611858e-07, "loss": 0.0001, "num_tokens": 105009382.0, "reward": 0.8824968934059143, "reward_std": 0.0, "rewards/reward_func/mean": 0.8824968934059143, "rewards/reward_func/std": 0.0, "step": 3815, "step_time": 23.64604101702571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 176.3125, "completions/mean_terminated_length": 176.3125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.17832287400960922, "epoch": 0.17674849467345993, "frac_reward_zero_std": 1.0, "grad_norm": 0.005954586435109377, "kl": 0.004308550618588924, "learning_rate": 9.64659564613247e-07, "loss": 0.0002, "num_tokens": 105051947.0, "reward": 0.9622687101364136, "reward_std": 0.0, "rewards/reward_func/mean": 0.9622687101364136, "rewards/reward_func/std": 0.0, "step": 3816, "step_time": 23.484891396015882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 148.1875, "completions/mean_terminated_length": 148.1875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.41103046387434006, "epoch": 0.17679481241315423, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023694795090705156, "kl": 0.002204233198426664, "learning_rate": 9.64650301065308e-07, "loss": 0.0001, "num_tokens": 105085278.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3817, "step_time": 18.35793075338006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.44671790301799774, "epoch": 0.17684113015284855, "frac_reward_zero_std": 0.0, "grad_norm": 0.09875843673944473, "kl": 0.006757065188139677, "learning_rate": 9.646410375173691e-07, "loss": 0.0929, "num_tokens": 105109472.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 3818, "step_time": 26.869915205985308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 135.5, "completions/mean_terminated_length": 135.5, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3203148618340492, "epoch": 0.17688744789254285, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024329540319740772, "kl": 0.002018020866671577, "learning_rate": 9.646317739694303e-07, "loss": 0.0001, "num_tokens": 105131112.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3819, "step_time": 14.587321132421494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 264.9375, "completions/mean_terminated_length": 264.9375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.18735438585281372, "epoch": 0.17693376563223714, "frac_reward_zero_std": 0.0, "grad_norm": 0.11418405920267105, "kl": 0.004360408231150359, "learning_rate": 9.646225104214914e-07, "loss": 0.0001, "num_tokens": 105170487.0, "reward": 0.9792701601982117, "reward_std": 0.05159161239862442, "rewards/reward_func/mean": 0.9792701601982117, "rewards/reward_func/std": 0.051591601222753525, "step": 3820, "step_time": 27.922769486904144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 154.6875, "completions/mean_terminated_length": 154.6875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.18828526511788368, "epoch": 0.17698008337193144, "frac_reward_zero_std": 1.0, "grad_norm": 0.020355554297566414, "kl": 0.012405019253492355, "learning_rate": 9.646132468735525e-07, "loss": 0.0006, "num_tokens": 105193314.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 3821, "step_time": 19.056558277457952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 157.125, "completions/mean_terminated_length": 157.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.2825987935066223, "epoch": 0.17702640111162576, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018772652838379145, "kl": 0.0016300764691550285, "learning_rate": 9.646039833256136e-07, "loss": 0.0001, "num_tokens": 105215508.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3822, "step_time": 17.652187269181013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 125.0625, "completions/mean_terminated_length": 125.0625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.24263647943735123, "epoch": 0.17707271885132006, "frac_reward_zero_std": 1.0, "grad_norm": 0.00283534056507051, "kl": 0.001963997376151383, "learning_rate": 9.645947197776748e-07, "loss": 0.0001, "num_tokens": 105237653.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3823, "step_time": 14.83386355638504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 131.25, "completions/mean_terminated_length": 131.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.31897375732660294, "epoch": 0.17711903659101436, "frac_reward_zero_std": 1.0, "grad_norm": 0.00894392840564251, "kl": 0.004011940793134272, "learning_rate": 9.645854562297359e-07, "loss": 0.0002, "num_tokens": 105266265.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3824, "step_time": 15.94401427730918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 121.25, "completions/mean_terminated_length": 121.25, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.30347325652837753, "epoch": 0.17716535433070865, "frac_reward_zero_std": 1.0, "grad_norm": 0.005716771353036165, "kl": 0.002859598316717893, "learning_rate": 9.645761926817972e-07, "loss": 0.0001, "num_tokens": 105286285.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3825, "step_time": 14.27716787904501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 221.0, "completions/mean_terminated_length": 221.0, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.21537592262029648, "epoch": 0.17721167207040298, "frac_reward_zero_std": 1.0, "grad_norm": 0.0053020622581243515, "kl": 0.003980011155363172, "learning_rate": 9.645669291338581e-07, "loss": 0.0002, "num_tokens": 105317613.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3826, "step_time": 23.069328784942627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 144.0, "completions/mean_terminated_length": 144.0, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.29018886387348175, "epoch": 0.17725798981009727, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030249236151576042, "kl": 0.0020077748922631145, "learning_rate": 9.645576655859193e-07, "loss": 0.0001, "num_tokens": 105353901.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3827, "step_time": 19.266102500259876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 184.6875, "completions/mean_terminated_length": 184.6875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.21682461351156235, "epoch": 0.17730430754979157, "frac_reward_zero_std": 1.0, "grad_norm": 0.009777910076081753, "kl": 0.009190604905597866, "learning_rate": 9.645484020379804e-07, "loss": 0.0005, "num_tokens": 105375112.0, "reward": 0.7589176297187805, "reward_std": 0.0, "rewards/reward_func/mean": 0.7589176297187805, "rewards/reward_func/std": 0.0, "step": 3828, "step_time": 19.212503224611282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 219.25, "completions/mean_terminated_length": 219.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.5217312127351761, "epoch": 0.17735062528948586, "frac_reward_zero_std": 0.0, "grad_norm": 0.10653503984212875, "kl": 0.009533015778288245, "learning_rate": 9.645391384900417e-07, "loss": -0.0397, "num_tokens": 105397036.0, "reward": 0.375, "reward_std": 0.5, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5, "step": 3829, "step_time": 24.044645447283983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 199.5625, "completions/mean_terminated_length": 199.5625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.29278120398521423, "epoch": 0.1773969430291802, "frac_reward_zero_std": 0.0, "grad_norm": 0.10548482090234756, "kl": 0.006157454918138683, "learning_rate": 9.645298749421029e-07, "loss": 0.0295, "num_tokens": 105433477.0, "reward": 0.8918563723564148, "reward_std": 0.2955045998096466, "rewards/reward_func/mean": 0.8918563723564148, "rewards/reward_func/std": 0.295504629611969, "step": 3830, "step_time": 23.892690498381853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 169.3125, "completions/mean_terminated_length": 169.3125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.27810313552618027, "epoch": 0.17744326076887448, "frac_reward_zero_std": 1.0, "grad_norm": 0.005729720462113619, "kl": 0.004573135694954544, "learning_rate": 9.64520611394164e-07, "loss": 0.0002, "num_tokens": 105454746.0, "reward": 0.24073302745819092, "reward_std": 0.0, "rewards/reward_func/mean": 0.24073302745819092, "rewards/reward_func/std": 0.0, "step": 3831, "step_time": 17.41228273883462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 163.4375, "completions/mean_terminated_length": 163.4375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.12800641171634197, "epoch": 0.17748957850856878, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016617050860077143, "kl": 0.0012654691090574488, "learning_rate": 9.64511347846225e-07, "loss": 0.0001, "num_tokens": 105476273.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 3832, "step_time": 16.543955411762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 136.75, "completions/mean_terminated_length": 136.75, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.189119603484869, "epoch": 0.17753589624826308, "frac_reward_zero_std": 1.0, "grad_norm": 0.006521198898553848, "kl": 0.003373265906702727, "learning_rate": 9.645020842982862e-07, "loss": 0.0002, "num_tokens": 105499885.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 3833, "step_time": 17.39403572306037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 165.0, "completions/mean_terminated_length": 165.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.36929506063461304, "epoch": 0.1775822139879574, "frac_reward_zero_std": 1.0, "grad_norm": 0.0044189090840518475, "kl": 0.0037194338510744274, "learning_rate": 9.644928207503474e-07, "loss": 0.0002, "num_tokens": 105525821.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3834, "step_time": 17.734925776720047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 170.75, "completions/mean_terminated_length": 170.75, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.34637551009655, "epoch": 0.1776285317276517, "frac_reward_zero_std": 0.0, "grad_norm": 0.13116101920604706, "kl": 0.013561037834733725, "learning_rate": 9.644835572024085e-07, "loss": -0.0654, "num_tokens": 105546985.0, "reward": 0.17457427084445953, "reward_std": 0.37532341480255127, "rewards/reward_func/mean": 0.17457427084445953, "rewards/reward_func/std": 0.3753233850002289, "step": 3835, "step_time": 18.447752952575684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 168.8125, "completions/mean_terminated_length": 168.8125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.19035568088293076, "epoch": 0.177674849467346, "frac_reward_zero_std": 1.0, "grad_norm": 0.002532113343477249, "kl": 0.002177273971028626, "learning_rate": 9.644742936544696e-07, "loss": 0.0001, "num_tokens": 105569606.0, "reward": 0.5139832496643066, "reward_std": 0.0, "rewards/reward_func/mean": 0.5139832496643066, "rewards/reward_func/std": 0.0, "step": 3836, "step_time": 18.159806467592716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 154.5, "completions/mean_terminated_length": 154.5, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.17702282220125198, "epoch": 0.1777211672070403, "frac_reward_zero_std": 1.0, "grad_norm": 0.002190358005464077, "kl": 0.0014314647996798158, "learning_rate": 9.644650301065307e-07, "loss": 0.0001, "num_tokens": 105594878.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 3837, "step_time": 17.425941973924637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 237.5625, "completions/mean_terminated_length": 237.5625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.1605139933526516, "epoch": 0.1777674849467346, "frac_reward_zero_std": 0.0, "grad_norm": 0.09193283319473267, "kl": 0.003897265298292041, "learning_rate": 9.644557665585919e-07, "loss": -0.0262, "num_tokens": 105622295.0, "reward": 0.7201932668685913, "reward_std": 0.004916071891784668, "rewards/reward_func/mean": 0.7201932668685913, "rewards/reward_func/std": 0.004916071891784668, "step": 3838, "step_time": 24.34452408924699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 136.625, "completions/mean_terminated_length": 136.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.33212994784116745, "epoch": 0.1778138026864289, "frac_reward_zero_std": 1.0, "grad_norm": 0.00414050230756402, "kl": 0.002761340991128236, "learning_rate": 9.64446503010653e-07, "loss": 0.0001, "num_tokens": 105647281.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3839, "step_time": 16.613259498029947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 137.9375, "completions/mean_terminated_length": 137.9375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.2578519880771637, "epoch": 0.1778601204261232, "frac_reward_zero_std": 1.0, "grad_norm": 0.004538194742053747, "kl": 0.00209065355011262, "learning_rate": 9.64437239462714e-07, "loss": 0.0001, "num_tokens": 105666992.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3840, "step_time": 14.258984357118607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 134.3125, "completions/mean_terminated_length": 134.3125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.31515443325042725, "epoch": 0.1779064381658175, "frac_reward_zero_std": 1.0, "grad_norm": 0.007335335481911898, "kl": 0.004559081397019327, "learning_rate": 9.644279759147752e-07, "loss": 0.0002, "num_tokens": 105689077.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3841, "step_time": 16.859371077269316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 184.75, "completions/mean_terminated_length": 184.75, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.1951061300933361, "epoch": 0.17795275590551182, "frac_reward_zero_std": 0.0, "grad_norm": 0.13436013460159302, "kl": 0.006344126304611564, "learning_rate": 9.644187123668366e-07, "loss": -0.0208, "num_tokens": 105747601.0, "reward": 0.9688286781311035, "reward_std": 0.06701634079217911, "rewards/reward_func/mean": 0.9688286781311035, "rewards/reward_func/std": 0.06701634079217911, "step": 3842, "step_time": 29.210843361914158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 173.6875, "completions/mean_terminated_length": 173.6875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.3495946228504181, "epoch": 0.17799907364520612, "frac_reward_zero_std": 1.0, "grad_norm": 0.009424271993339062, "kl": 0.006038697552867234, "learning_rate": 9.644094488188977e-07, "loss": 0.0003, "num_tokens": 105771196.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3843, "step_time": 17.927072402089834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 122.5, "completions/mean_terminated_length": 122.5, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.33498673886060715, "epoch": 0.17804539138490041, "frac_reward_zero_std": 1.0, "grad_norm": 0.005292301531881094, "kl": 0.0030770490411669016, "learning_rate": 9.644001852709588e-07, "loss": 0.0002, "num_tokens": 105798036.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3844, "step_time": 15.143288355320692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 175.5, "completions/mean_terminated_length": 175.5, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.13562552630901337, "epoch": 0.1780917091245947, "frac_reward_zero_std": 1.0, "grad_norm": 0.002276982879266143, "kl": 0.0014080112450756133, "learning_rate": 9.6439092172302e-07, "loss": 0.0001, "num_tokens": 105830460.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3845, "step_time": 20.715296521782875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 138.375, "completions/mean_terminated_length": 138.375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.24147973209619522, "epoch": 0.17813802686428903, "frac_reward_zero_std": 1.0, "grad_norm": 0.003436050144955516, "kl": 0.0024765877751633525, "learning_rate": 9.64381658175081e-07, "loss": 0.0001, "num_tokens": 105850178.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3846, "step_time": 14.213184762746096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 130.0625, "completions/mean_terminated_length": 130.0625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.27036910876631737, "epoch": 0.17818434460398333, "frac_reward_zero_std": 1.0, "grad_norm": 0.004406011663377285, "kl": 0.002435957605484873, "learning_rate": 9.643723946271422e-07, "loss": 0.0001, "num_tokens": 105871475.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3847, "step_time": 14.044939454644918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 156.9375, "completions/mean_terminated_length": 156.9375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3302168771624565, "epoch": 0.17823066234367763, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024486184120178223, "kl": 0.00217066629556939, "learning_rate": 9.643631310792033e-07, "loss": 0.0001, "num_tokens": 105903410.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3848, "step_time": 19.308651093393564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 238.375, "completions/mean_terminated_length": 238.375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.5138166099786758, "epoch": 0.17827698008337192, "frac_reward_zero_std": 0.0, "grad_norm": 0.12125244736671448, "kl": 0.007284591440111399, "learning_rate": 9.643538675312644e-07, "loss": 0.0377, "num_tokens": 105929720.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 3849, "step_time": 28.40598590299487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 157.1875, "completions/mean_terminated_length": 157.1875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3992660269141197, "epoch": 0.17832329782306625, "frac_reward_zero_std": 1.0, "grad_norm": 0.004069524817168713, "kl": 0.002859303029254079, "learning_rate": 9.643446039833256e-07, "loss": 0.0001, "num_tokens": 105962923.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3850, "step_time": 19.518219359219074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 234.125, "completions/mean_terminated_length": 234.125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.2323986180126667, "epoch": 0.17836961556276054, "frac_reward_zero_std": 1.0, "grad_norm": 0.002475576940923929, "kl": 0.001692907593678683, "learning_rate": 9.643353404353867e-07, "loss": 0.0001, "num_tokens": 105999181.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3851, "step_time": 24.847377281636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 156.375, "completions/mean_terminated_length": 156.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.1864527016878128, "epoch": 0.17841593330245484, "frac_reward_zero_std": 1.0, "grad_norm": 0.005519297439604998, "kl": 0.002675230032764375, "learning_rate": 9.643260768874478e-07, "loss": 0.0001, "num_tokens": 106026419.0, "reward": 0.9394130706787109, "reward_std": 0.0, "rewards/reward_func/mean": 0.9394130706787109, "rewards/reward_func/std": 0.0, "step": 3852, "step_time": 17.847322486341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 164.1875, "completions/mean_terminated_length": 164.1875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.36013707518577576, "epoch": 0.17846225104214913, "frac_reward_zero_std": 1.0, "grad_norm": 0.005480221938341856, "kl": 0.004316116275731474, "learning_rate": 9.64316813339509e-07, "loss": 0.0002, "num_tokens": 106047558.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3853, "step_time": 16.41007661446929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 148.4375, "completions/mean_terminated_length": 148.4375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.24752887338399887, "epoch": 0.17850856878184346, "frac_reward_zero_std": 0.0, "grad_norm": 0.14251220226287842, "kl": 0.005275412113405764, "learning_rate": 9.6430754979157e-07, "loss": -0.0577, "num_tokens": 106069421.0, "reward": 0.9293943643569946, "reward_std": 0.035030219703912735, "rewards/reward_func/mean": 0.9293943643569946, "rewards/reward_func/std": 0.03503022342920303, "step": 3854, "step_time": 17.581236638128757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 159.3125, "completions/mean_terminated_length": 159.3125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.35773012042045593, "epoch": 0.17855488652153775, "frac_reward_zero_std": 1.0, "grad_norm": 0.009938415139913559, "kl": 0.008871463127434254, "learning_rate": 9.642982862436314e-07, "loss": 0.0004, "num_tokens": 106093938.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3855, "step_time": 18.866581570357084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 132.6875, "completions/mean_terminated_length": 132.6875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.23759281262755394, "epoch": 0.17860120426123205, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034014505799859762, "kl": 0.0020460054656723514, "learning_rate": 9.642890226956925e-07, "loss": 0.0001, "num_tokens": 106114141.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3856, "step_time": 14.944848220795393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 127.625, "completions/mean_terminated_length": 127.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3074854612350464, "epoch": 0.17864752200092635, "frac_reward_zero_std": 1.0, "grad_norm": 0.004036040045320988, "kl": 0.0027269473066553473, "learning_rate": 9.642797591477534e-07, "loss": 0.0001, "num_tokens": 106135895.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3857, "step_time": 15.013610441237688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 186.0, "completions/mean_terminated_length": 186.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.4452502205967903, "epoch": 0.17869383974062067, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027183243073523045, "kl": 0.0027735581970773637, "learning_rate": 9.642704955998146e-07, "loss": 0.0001, "num_tokens": 106165703.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3858, "step_time": 20.652802895754576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 130.125, "completions/mean_terminated_length": 130.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.20212046429514885, "epoch": 0.17874015748031497, "frac_reward_zero_std": 1.0, "grad_norm": 0.012862884439527988, "kl": 0.0025874590792227536, "learning_rate": 9.64261232051876e-07, "loss": 0.0001, "num_tokens": 106185305.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3859, "step_time": 13.375053532421589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 195.3125, "completions/mean_terminated_length": 195.3125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.37439778447151184, "epoch": 0.17878647522000926, "frac_reward_zero_std": 0.0, "grad_norm": 0.11001267284154892, "kl": 0.008875812869518995, "learning_rate": 9.64251968503937e-07, "loss": 0.0538, "num_tokens": 106207662.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 3860, "step_time": 21.003232218325138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 173.4375, "completions/mean_terminated_length": 173.4375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.26884597167372704, "epoch": 0.17883279295970356, "frac_reward_zero_std": 0.0, "grad_norm": 0.10725853592157364, "kl": 0.0035685841576196253, "learning_rate": 9.642427049559981e-07, "loss": 0.0107, "num_tokens": 106231493.0, "reward": 0.12200583517551422, "reward_std": 0.004130990710109472, "rewards/reward_func/mean": 0.12200583517551422, "rewards/reward_func/std": 0.004130990710109472, "step": 3861, "step_time": 18.338267598301172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 179.5, "completions/mean_terminated_length": 179.5, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.18928562477231026, "epoch": 0.17887911069939788, "frac_reward_zero_std": 1.0, "grad_norm": 0.016471249982714653, "kl": 0.024105083663016558, "learning_rate": 9.642334414080593e-07, "loss": 0.0012, "num_tokens": 106261037.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3862, "step_time": 20.131963800638914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 195.75, "completions/mean_terminated_length": 195.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.3357459381222725, "epoch": 0.17892542843909218, "frac_reward_zero_std": 0.0, "grad_norm": 0.14837013185024261, "kl": 0.004607123322784901, "learning_rate": 9.642241778601204e-07, "loss": 0.0085, "num_tokens": 106293321.0, "reward": 0.4073399603366852, "reward_std": 0.4770277142524719, "rewards/reward_func/mean": 0.4073399603366852, "rewards/reward_func/std": 0.47702768445014954, "step": 3863, "step_time": 23.77440858259797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 168.0625, "completions/mean_terminated_length": 168.0625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.1621304452419281, "epoch": 0.17897174617878647, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033217810560017824, "kl": 0.002067015739157796, "learning_rate": 9.642149143121815e-07, "loss": 0.0001, "num_tokens": 106330346.0, "reward": 0.39511775970458984, "reward_std": 0.0, "rewards/reward_func/mean": 0.39511775970458984, "rewards/reward_func/std": 0.0, "step": 3864, "step_time": 20.833427734673023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 132.9375, "completions/mean_terminated_length": 132.9375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3487633988261223, "epoch": 0.17901806391848077, "frac_reward_zero_std": 1.0, "grad_norm": 0.002550463890656829, "kl": 0.002032527787378058, "learning_rate": 9.642056507642426e-07, "loss": 0.0001, "num_tokens": 106352617.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3865, "step_time": 14.310365248471498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 197.0625, "completions/mean_terminated_length": 197.0625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.322141170501709, "epoch": 0.1790643816581751, "frac_reward_zero_std": 1.0, "grad_norm": 0.01066620834171772, "kl": 0.010315066552720964, "learning_rate": 9.641963872163038e-07, "loss": 0.0005, "num_tokens": 106374698.0, "reward": 0.24659696221351624, "reward_std": 0.0, "rewards/reward_func/mean": 0.24659696221351624, "rewards/reward_func/std": 0.0, "step": 3866, "step_time": 20.995813205838203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 203.75, "completions/mean_terminated_length": 203.75, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.4266778379678726, "epoch": 0.1791106993978694, "frac_reward_zero_std": 1.0, "grad_norm": 0.006910845171660185, "kl": 0.005342388059943914, "learning_rate": 9.64187123668365e-07, "loss": 0.0003, "num_tokens": 106412214.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3867, "step_time": 28.35900116711855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 117.375, "completions/mean_terminated_length": 117.375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.24351060763001442, "epoch": 0.17915701713756368, "frac_reward_zero_std": 1.0, "grad_norm": 0.003948332276195288, "kl": 0.0018374093051534146, "learning_rate": 9.64177860120426e-07, "loss": 0.0001, "num_tokens": 106431484.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3868, "step_time": 13.185180019587278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 139.5625, "completions/mean_terminated_length": 139.5625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.18132193014025688, "epoch": 0.17920333487725798, "frac_reward_zero_std": 0.0, "grad_norm": 0.12919850647449493, "kl": 0.003018807154148817, "learning_rate": 9.641685965724871e-07, "loss": -0.04, "num_tokens": 106452213.0, "reward": 0.0763099268078804, "reward_std": 0.22499586641788483, "rewards/reward_func/mean": 0.0763099268078804, "rewards/reward_func/std": 0.22499586641788483, "step": 3869, "step_time": 15.816828023642302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 154.375, "completions/mean_terminated_length": 154.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.32873932272195816, "epoch": 0.1792496526169523, "frac_reward_zero_std": 1.0, "grad_norm": 0.016376294195652008, "kl": 0.012700323015451431, "learning_rate": 9.641593330245483e-07, "loss": 0.0006, "num_tokens": 106476683.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3870, "step_time": 18.558602813631296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 121.875, "completions/mean_terminated_length": 121.875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2800655737519264, "epoch": 0.1792959703566466, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018830408807843924, "kl": 0.0016500942001584917, "learning_rate": 9.641500694766094e-07, "loss": 0.0001, "num_tokens": 106496441.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3871, "step_time": 13.768268425017595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 132.1875, "completions/mean_terminated_length": 132.1875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2514149695634842, "epoch": 0.1793422880963409, "frac_reward_zero_std": 1.0, "grad_norm": 0.00770987942814827, "kl": 0.003763327025808394, "learning_rate": 9.641408059286707e-07, "loss": 0.0002, "num_tokens": 106517340.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3872, "step_time": 14.451893799006939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 164.9375, "completions/mean_terminated_length": 164.9375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.47073114663362503, "epoch": 0.1793886058360352, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025714649818837643, "kl": 0.0025621079839766026, "learning_rate": 9.641315423807319e-07, "loss": 0.0001, "num_tokens": 106548475.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3873, "step_time": 19.580643940716982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 130.4375, "completions/mean_terminated_length": 130.4375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2749612033367157, "epoch": 0.17943492357572952, "frac_reward_zero_std": 1.0, "grad_norm": 0.00378880905918777, "kl": 0.002403245773166418, "learning_rate": 9.64122278832793e-07, "loss": 0.0001, "num_tokens": 106572802.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3874, "step_time": 15.559952519834042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 189.0625, "completions/mean_terminated_length": 189.0625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.17167653515934944, "epoch": 0.1794812413154238, "frac_reward_zero_std": 1.0, "grad_norm": 0.002166612772271037, "kl": 0.001824690873036161, "learning_rate": 9.64113015284854e-07, "loss": 0.0001, "num_tokens": 106610067.0, "reward": 0.8970773816108704, "reward_std": 0.0, "rewards/reward_func/mean": 0.8970773816108704, "rewards/reward_func/std": 0.0, "step": 3875, "step_time": 24.00137247145176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 201.25, "completions/mean_terminated_length": 201.25, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.2034187652170658, "epoch": 0.1795275590551181, "frac_reward_zero_std": 1.0, "grad_norm": 0.004419104196131229, "kl": 0.004266760079190135, "learning_rate": 9.641037517369152e-07, "loss": 0.0002, "num_tokens": 106646567.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3876, "step_time": 25.514030795544386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 181.625, "completions/mean_terminated_length": 181.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.2660621926188469, "epoch": 0.1795738767948124, "frac_reward_zero_std": 0.0, "grad_norm": 0.11790023744106293, "kl": 0.025686467299237847, "learning_rate": 9.640944881889764e-07, "loss": -0.0725, "num_tokens": 106681793.0, "reward": 0.7919888496398926, "reward_std": 0.3820759356021881, "rewards/reward_func/mean": 0.7919888496398926, "rewards/reward_func/std": 0.3820759654045105, "step": 3877, "step_time": 23.93250075355172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 302.5625, "completions/mean_terminated_length": 302.5625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "entropy": 0.2444363832473755, "epoch": 0.17962019453450673, "frac_reward_zero_std": 1.0, "grad_norm": 0.004091878887265921, "kl": 0.0036927127512171865, "learning_rate": 9.640852246410375e-07, "loss": 0.0002, "num_tokens": 106709322.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3878, "step_time": 27.31113762408495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 173.25, "completions/mean_terminated_length": 173.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.44217732548713684, "epoch": 0.17966651227420102, "frac_reward_zero_std": 0.0, "grad_norm": 0.16478723287582397, "kl": 0.008424879633821547, "learning_rate": 9.640759610930986e-07, "loss": -0.0482, "num_tokens": 106742558.0, "reward": 0.05278949812054634, "reward_std": 0.21115797758102417, "rewards/reward_func/mean": 0.05278949812054634, "rewards/reward_func/std": 0.21115799248218536, "step": 3879, "step_time": 22.040966276079416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 172.9375, "completions/mean_terminated_length": 172.9375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.2926242798566818, "epoch": 0.17971283001389532, "frac_reward_zero_std": 0.0, "grad_norm": 0.13188034296035767, "kl": 0.004058061807882041, "learning_rate": 9.640666975451597e-07, "loss": -0.0057, "num_tokens": 106779709.0, "reward": 0.4177449345588684, "reward_std": 0.11502572894096375, "rewards/reward_func/mean": 0.4177449345588684, "rewards/reward_func/std": 0.11502573639154434, "step": 3880, "step_time": 21.731445774435997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 125.3125, "completions/mean_terminated_length": 125.3125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2763097584247589, "epoch": 0.17975914775358962, "frac_reward_zero_std": 1.0, "grad_norm": 0.003961433190852404, "kl": 0.0024688647827133536, "learning_rate": 9.640574339972209e-07, "loss": 0.0001, "num_tokens": 106802610.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3881, "step_time": 14.127445373684168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 202.5625, "completions/mean_terminated_length": 202.5625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.23512405157089233, "epoch": 0.17980546549328394, "frac_reward_zero_std": 1.0, "grad_norm": 0.001926533761434257, "kl": 0.002032506628893316, "learning_rate": 9.64048170449282e-07, "loss": 0.0001, "num_tokens": 106838955.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3882, "step_time": 23.26013696938753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 129.5625, "completions/mean_terminated_length": 129.5625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2327059954404831, "epoch": 0.17985178323297824, "frac_reward_zero_std": 1.0, "grad_norm": 0.002668160479515791, "kl": 0.0015750114107504487, "learning_rate": 9.640389069013431e-07, "loss": 0.0001, "num_tokens": 106859700.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3883, "step_time": 13.915167830884457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 104.0625, "completions/mean_terminated_length": 104.0625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.33782267570495605, "epoch": 0.17989810097267253, "frac_reward_zero_std": 1.0, "grad_norm": 0.005571055691689253, "kl": 0.002933368261437863, "learning_rate": 9.640296433534042e-07, "loss": 0.0001, "num_tokens": 106880085.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3884, "step_time": 12.020050313323736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 243.9375, "completions/mean_terminated_length": 243.9375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.2659078761935234, "epoch": 0.17994441871236683, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036406246945261955, "kl": 0.006943120853975415, "learning_rate": 9.640203798054656e-07, "loss": 0.0003, "num_tokens": 106908708.0, "reward": 0.740818202495575, "reward_std": 0.0, "rewards/reward_func/mean": 0.740818202495575, "rewards/reward_func/std": 0.0, "step": 3885, "step_time": 25.21360557153821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 154.1875, "completions/mean_terminated_length": 154.1875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.24740813672542572, "epoch": 0.17999073645206115, "frac_reward_zero_std": 1.0, "grad_norm": 0.004300672560930252, "kl": 0.0032012086594477296, "learning_rate": 9.640111162575267e-07, "loss": 0.0002, "num_tokens": 106928487.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3886, "step_time": 16.44284963980317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 152.6875, "completions/mean_terminated_length": 152.6875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.20444755628705025, "epoch": 0.18003705419175545, "frac_reward_zero_std": 0.0, "grad_norm": 0.11522559076547623, "kl": 0.06768784299492836, "learning_rate": 9.640018527095878e-07, "loss": -0.0715, "num_tokens": 106950498.0, "reward": 0.6712665557861328, "reward_std": 0.21674343943595886, "rewards/reward_func/mean": 0.6712665557861328, "rewards/reward_func/std": 0.21674346923828125, "step": 3887, "step_time": 20.053553327918053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 146.25, "completions/mean_terminated_length": 146.25, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.35008594393730164, "epoch": 0.18008337193144974, "frac_reward_zero_std": 1.0, "grad_norm": 0.006665939465165138, "kl": 0.004400743579026312, "learning_rate": 9.639925891616487e-07, "loss": 0.0002, "num_tokens": 106973222.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3888, "step_time": 15.803185060620308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 250.1875, "completions/mean_terminated_length": 250.1875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "entropy": 0.19690639898180962, "epoch": 0.18012968967114404, "frac_reward_zero_std": 0.0, "grad_norm": 0.09986148774623871, "kl": 0.011436095228418708, "learning_rate": 9.6398332561371e-07, "loss": -0.0021, "num_tokens": 107003321.0, "reward": 0.8035168647766113, "reward_std": 0.13681329786777496, "rewards/reward_func/mean": 0.8035168647766113, "rewards/reward_func/std": 0.13681329786777496, "step": 3889, "step_time": 24.08195485919714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 118.75, "completions/mean_terminated_length": 118.75, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.23816321045160294, "epoch": 0.18017600741083836, "frac_reward_zero_std": 1.0, "grad_norm": 0.004808017518371344, "kl": 0.0027467715553939342, "learning_rate": 9.639740620657712e-07, "loss": 0.0001, "num_tokens": 107023221.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3890, "step_time": 14.073711056262255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 171.125, "completions/mean_terminated_length": 171.125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.2762995511293411, "epoch": 0.18022232515053266, "frac_reward_zero_std": 0.0, "grad_norm": 0.10893756151199341, "kl": 0.005113076651468873, "learning_rate": 9.639647985178323e-07, "loss": -0.0393, "num_tokens": 107044679.0, "reward": 0.8529131412506104, "reward_std": 0.07281851023435593, "rewards/reward_func/mean": 0.8529131412506104, "rewards/reward_func/std": 0.07281851023435593, "step": 3891, "step_time": 18.657626669853926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 204.375, "completions/mean_terminated_length": 204.375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.14512208476662636, "epoch": 0.18026864289022695, "frac_reward_zero_std": 0.0, "grad_norm": 0.08163206279277802, "kl": 0.005026340892072767, "learning_rate": 9.639555349698934e-07, "loss": -0.0171, "num_tokens": 107067197.0, "reward": 0.990176796913147, "reward_std": 0.017572296783328056, "rewards/reward_func/mean": 0.990176796913147, "rewards/reward_func/std": 0.017572306096553802, "step": 3892, "step_time": 19.930472690612078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 151.875, "completions/mean_terminated_length": 151.875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.34035804867744446, "epoch": 0.18031496062992125, "frac_reward_zero_std": 1.0, "grad_norm": 0.006593564059585333, "kl": 0.0030465046875178814, "learning_rate": 9.639462714219546e-07, "loss": 0.0002, "num_tokens": 107096891.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3893, "step_time": 17.54202764481306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 222.9375, "completions/mean_terminated_length": 222.9375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.19629066810011864, "epoch": 0.18036127836961557, "frac_reward_zero_std": 1.0, "grad_norm": 0.010079527273774147, "kl": 0.07693048194050789, "learning_rate": 9.639370078740157e-07, "loss": 0.0038, "num_tokens": 107123210.0, "reward": 0.8542837500572205, "reward_std": 0.0, "rewards/reward_func/mean": 0.8542837500572205, "rewards/reward_func/std": 0.0, "step": 3894, "step_time": 22.091420751065016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 208.25, "completions/mean_terminated_length": 208.25, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.25358671322464943, "epoch": 0.18040759610930987, "frac_reward_zero_std": 0.0, "grad_norm": 0.10642461478710175, "kl": 0.006079294253140688, "learning_rate": 9.639277443260768e-07, "loss": -0.0101, "num_tokens": 107157630.0, "reward": 0.9601125717163086, "reward_std": 0.010636663064360619, "rewards/reward_func/mean": 0.9601125717163086, "rewards/reward_func/std": 0.010636658407747746, "step": 3895, "step_time": 25.84510939568281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 135.1875, "completions/mean_terminated_length": 135.1875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.31263020634651184, "epoch": 0.18045391384900417, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028968616388738155, "kl": 0.0021427946048788726, "learning_rate": 9.63918480778138e-07, "loss": 0.0001, "num_tokens": 107178769.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3896, "step_time": 14.797037236392498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 124.3125, "completions/mean_terminated_length": 124.3125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.273935966193676, "epoch": 0.18050023158869846, "frac_reward_zero_std": 1.0, "grad_norm": 0.004854854661971331, "kl": 0.0026416799519211054, "learning_rate": 9.63909217230199e-07, "loss": 0.0001, "num_tokens": 107200022.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3897, "step_time": 14.565889682620764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 176.25, "completions/mean_terminated_length": 176.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.41036058217287064, "epoch": 0.18054654932839279, "frac_reward_zero_std": 1.0, "grad_norm": 0.0051071178168058395, "kl": 0.004573448677547276, "learning_rate": 9.638999536822602e-07, "loss": 0.0002, "num_tokens": 107231082.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3898, "step_time": 23.072204168885946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 130.875, "completions/mean_terminated_length": 130.875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.28761573135852814, "epoch": 0.18059286706808708, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029935636557638645, "kl": 0.0024007526808418334, "learning_rate": 9.638906901343215e-07, "loss": 0.0001, "num_tokens": 107254600.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3899, "step_time": 14.77840093523264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 165.9375, "completions/mean_terminated_length": 165.9375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.2848445922136307, "epoch": 0.18063918480778138, "frac_reward_zero_std": 0.0, "grad_norm": 0.12299530953168869, "kl": 0.025952158961445093, "learning_rate": 9.638814265863824e-07, "loss": 0.0335, "num_tokens": 107276135.0, "reward": 0.8355741500854492, "reward_std": 0.056173864752054214, "rewards/reward_func/mean": 0.8355741500854492, "rewards/reward_func/std": 0.05617387965321541, "step": 3900, "step_time": 17.51611840352416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 158.0625, "completions/mean_terminated_length": 158.0625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.2537331245839596, "epoch": 0.18068550254747567, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023763119243085384, "kl": 0.0018065246695186943, "learning_rate": 9.638721630384436e-07, "loss": 0.0001, "num_tokens": 107305352.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3901, "step_time": 18.49169960990548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 175.4375, "completions/mean_terminated_length": 175.4375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.311426617205143, "epoch": 0.18073182028717, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038535951171070337, "kl": 0.002921569102909416, "learning_rate": 9.63862899490505e-07, "loss": 0.0001, "num_tokens": 107330671.0, "reward": 0.3678794503211975, "reward_std": 0.0, "rewards/reward_func/mean": 0.3678794503211975, "rewards/reward_func/std": 0.0, "step": 3902, "step_time": 19.04279673472047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 161.1875, "completions/mean_terminated_length": 161.1875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.37105194479227066, "epoch": 0.1807781380268643, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025385727640241385, "kl": 0.0021450287313200533, "learning_rate": 9.63853635942566e-07, "loss": 0.0001, "num_tokens": 107359426.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3903, "step_time": 18.226160261780024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 126.625, "completions/mean_terminated_length": 126.625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3110702708363533, "epoch": 0.1808244557665586, "frac_reward_zero_std": 1.0, "grad_norm": 0.002151668770238757, "kl": 0.0017418436182197183, "learning_rate": 9.638443723946272e-07, "loss": 0.0001, "num_tokens": 107395036.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3904, "step_time": 17.367747947573662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 259.625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "entropy": 0.14336872100830078, "epoch": 0.18087077350625289, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020849378779530525, "kl": 0.001666792668402195, "learning_rate": 9.638351088466883e-07, "loss": 0.0001, "num_tokens": 107425334.0, "reward": 0.9622687101364136, "reward_std": 0.0, "rewards/reward_func/mean": 0.9622687101364136, "rewards/reward_func/std": 0.0, "step": 3905, "step_time": 25.53689743205905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 234.0, "completions/mean_terminated_length": 234.0, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.455336295068264, "epoch": 0.1809170912459472, "frac_reward_zero_std": 0.0, "grad_norm": 0.0899076759815216, "kl": 0.00596992252394557, "learning_rate": 9.638258452987494e-07, "loss": -0.0499, "num_tokens": 107461126.0, "reward": 0.43209439516067505, "reward_std": 0.4501776397228241, "rewards/reward_func/mean": 0.43209439516067505, "rewards/reward_func/std": 0.4501776397228241, "step": 3906, "step_time": 28.96193305402994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 107.5625, "completions/mean_terminated_length": 107.5625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.25259115546941757, "epoch": 0.1809634089856415, "frac_reward_zero_std": 1.0, "grad_norm": 0.004304500296711922, "kl": 0.002399189746938646, "learning_rate": 9.638165817508105e-07, "loss": 0.0001, "num_tokens": 107481247.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3907, "step_time": 13.515427503734827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 193.8125, "completions/mean_terminated_length": 193.8125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.4437813311815262, "epoch": 0.1810097267253358, "frac_reward_zero_std": 1.0, "grad_norm": 0.008852921426296234, "kl": 0.0069910825695842505, "learning_rate": 9.638073182028717e-07, "loss": 0.0003, "num_tokens": 107506556.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3908, "step_time": 20.957769952714443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 163.0, "completions/mean_terminated_length": 163.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.20349428057670593, "epoch": 0.1810560444650301, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017322602216154337, "kl": 0.004391728434711695, "learning_rate": 9.637980546549328e-07, "loss": 0.0002, "num_tokens": 107529180.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 3909, "step_time": 17.231120854616165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 136.5625, "completions/mean_terminated_length": 136.5625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3487413823604584, "epoch": 0.18110236220472442, "frac_reward_zero_std": 1.0, "grad_norm": 0.004991558846086264, "kl": 0.0034653438488021493, "learning_rate": 9.63788791106994e-07, "loss": 0.0002, "num_tokens": 107565125.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3910, "step_time": 17.688213545829058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 284.4375, "completions/mean_terminated_length": 284.4375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.18257901072502136, "epoch": 0.18114867994441872, "frac_reward_zero_std": 1.0, "grad_norm": 0.003655927488580346, "kl": 0.003254766226746142, "learning_rate": 9.63779527559055e-07, "loss": 0.0002, "num_tokens": 107598460.0, "reward": 0.8920138478279114, "reward_std": 0.0, "rewards/reward_func/mean": 0.8920138478279114, "rewards/reward_func/std": 0.0, "step": 3911, "step_time": 27.935412857681513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 120.375, "completions/mean_terminated_length": 120.375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.20604503899812698, "epoch": 0.181194997684113, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031442560721188784, "kl": 0.00205564804491587, "learning_rate": 9.637702640111162e-07, "loss": 0.0001, "num_tokens": 107618178.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3912, "step_time": 13.054497182369232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.30461709946393967, "epoch": 0.1812413154238073, "frac_reward_zero_std": 0.0, "grad_norm": 0.07777665555477142, "kl": 0.005089417682029307, "learning_rate": 9.637610004631773e-07, "loss": -0.0683, "num_tokens": 107658390.0, "reward": 0.3690711259841919, "reward_std": 0.15417903661727905, "rewards/reward_func/mean": 0.3690711259841919, "rewards/reward_func/std": 0.15417903661727905, "step": 3913, "step_time": 33.33389285206795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 137.3125, "completions/mean_terminated_length": 137.3125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.27665771543979645, "epoch": 0.18128763316350163, "frac_reward_zero_std": 1.0, "grad_norm": 0.02194824256002903, "kl": 0.007084732060320675, "learning_rate": 9.637517369152384e-07, "loss": 0.0004, "num_tokens": 107678027.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3914, "step_time": 14.059442419558764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 162.125, "completions/mean_terminated_length": 162.125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.17007062211632729, "epoch": 0.18133395090319593, "frac_reward_zero_std": 1.0, "grad_norm": 0.005854338873177767, "kl": 0.004630892304703593, "learning_rate": 9.637424733672997e-07, "loss": 0.0002, "num_tokens": 107699773.0, "reward": 0.9574533700942993, "reward_std": 0.0, "rewards/reward_func/mean": 0.9574533700942993, "rewards/reward_func/std": 0.0, "step": 3915, "step_time": 17.851514037698507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 178.75, "completions/mean_terminated_length": 178.75, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.25144483149051666, "epoch": 0.18138026864289022, "frac_reward_zero_std": 1.0, "grad_norm": 0.007408967707306147, "kl": 0.006645939662121236, "learning_rate": 9.637332098193609e-07, "loss": 0.0003, "num_tokens": 107722841.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3916, "step_time": 18.072697196155787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 154.125, "completions/mean_terminated_length": 154.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.15302323177456856, "epoch": 0.18142658638258452, "frac_reward_zero_std": 1.0, "grad_norm": 0.015116202645003796, "kl": 0.008419914287514985, "learning_rate": 9.63723946271422e-07, "loss": 0.0004, "num_tokens": 107749499.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3917, "step_time": 18.053609509021044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 192.1875, "completions/mean_terminated_length": 192.1875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3846072405576706, "epoch": 0.18147290412227884, "frac_reward_zero_std": 1.0, "grad_norm": 0.010995019227266312, "kl": 0.00756355409976095, "learning_rate": 9.63714682723483e-07, "loss": 0.0004, "num_tokens": 107773678.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3918, "step_time": 20.119005125015974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 226.0, "completions/mean_terminated_length": 226.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.40393929183483124, "epoch": 0.18151922186197314, "frac_reward_zero_std": 0.0, "grad_norm": 0.09263960272073746, "kl": 0.0070364398416131735, "learning_rate": 9.637054191755442e-07, "loss": -0.0306, "num_tokens": 107806110.0, "reward": 0.7071548700332642, "reward_std": 0.42166566848754883, "rewards/reward_func/mean": 0.7071548700332642, "rewards/reward_func/std": 0.4216656982898712, "step": 3919, "step_time": 26.11157711967826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 150.375, "completions/mean_terminated_length": 150.375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3409520164132118, "epoch": 0.18156553960166744, "frac_reward_zero_std": 1.0, "grad_norm": 0.002416732022538781, "kl": 0.0018317789945285767, "learning_rate": 9.636961556276054e-07, "loss": 0.0001, "num_tokens": 107841892.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3920, "step_time": 18.842884566634893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 167.875, "completions/mean_terminated_length": 167.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.18287443369627, "epoch": 0.18161185734136173, "frac_reward_zero_std": 1.0, "grad_norm": 0.005541726481169462, "kl": 0.013160837115719914, "learning_rate": 9.636868920796665e-07, "loss": 0.0007, "num_tokens": 107875090.0, "reward": 0.23457029461860657, "reward_std": 0.0, "rewards/reward_func/mean": 0.23457029461860657, "rewards/reward_func/std": 0.0, "step": 3921, "step_time": 19.289226531982422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 179.9375, "completions/mean_terminated_length": 179.9375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.15644804388284683, "epoch": 0.18165817508105606, "frac_reward_zero_std": 0.0, "grad_norm": 0.09397299587726593, "kl": 0.0019102707447018474, "learning_rate": 9.636776285317276e-07, "loss": 0.0415, "num_tokens": 107903361.0, "reward": 0.8961285352706909, "reward_std": 0.019452031701803207, "rewards/reward_func/mean": 0.8961285352706909, "rewards/reward_func/std": 0.019452018663287163, "step": 3922, "step_time": 19.803660698235035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 242.6875, "completions/mean_terminated_length": 242.6875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.2391553744673729, "epoch": 0.18170449282075035, "frac_reward_zero_std": 0.0, "grad_norm": 0.13449817895889282, "kl": 0.00538853625766933, "learning_rate": 9.636683649837887e-07, "loss": -0.0743, "num_tokens": 107926412.0, "reward": 0.5545538663864136, "reward_std": 0.09470558166503906, "rewards/reward_func/mean": 0.5545538663864136, "rewards/reward_func/std": 0.09470557421445847, "step": 3923, "step_time": 26.411487139761448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 201.1875, "completions/mean_terminated_length": 201.1875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.37411340326070786, "epoch": 0.18175081056044465, "frac_reward_zero_std": 0.0, "grad_norm": 0.10074204206466675, "kl": 0.004191602231003344, "learning_rate": 9.636591014358499e-07, "loss": 0.1088, "num_tokens": 107948127.0, "reward": 0.625, "reward_std": 0.5, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5, "step": 3924, "step_time": 27.94429662078619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 149.625, "completions/mean_terminated_length": 149.625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.4033663347363472, "epoch": 0.18179712830013894, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025540878996253014, "kl": 0.002043678832706064, "learning_rate": 9.63649837887911e-07, "loss": 0.0001, "num_tokens": 107978665.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3925, "step_time": 18.012788832187653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 175.5, "completions/mean_terminated_length": 175.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.41431621462106705, "epoch": 0.18184344603983327, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029174371156841516, "kl": 0.002668401808477938, "learning_rate": 9.636405743399721e-07, "loss": 0.0001, "num_tokens": 108012897.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3926, "step_time": 21.906016305088997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 145.3125, "completions/mean_terminated_length": 145.3125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.24583029001951218, "epoch": 0.18188976377952756, "frac_reward_zero_std": 1.0, "grad_norm": 0.017002010717988014, "kl": 0.008719731937162578, "learning_rate": 9.636313107920332e-07, "loss": 0.0004, "num_tokens": 108033270.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3927, "step_time": 15.884957481175661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 128.625, "completions/mean_terminated_length": 128.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.32335077971220016, "epoch": 0.18193608151922186, "frac_reward_zero_std": 1.0, "grad_norm": 0.007770972326397896, "kl": 0.004017679311800748, "learning_rate": 9.636220472440944e-07, "loss": 0.0002, "num_tokens": 108054960.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3928, "step_time": 14.316983543336391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 140.125, "completions/mean_terminated_length": 140.125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.2963094189763069, "epoch": 0.18198239925891616, "frac_reward_zero_std": 1.0, "grad_norm": 0.003596608992666006, "kl": 0.002422891091555357, "learning_rate": 9.636127836961557e-07, "loss": 0.0001, "num_tokens": 108078562.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3929, "step_time": 16.01212800294161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 202.4375, "completions/mean_terminated_length": 202.4375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.34484798461198807, "epoch": 0.18202871699861048, "frac_reward_zero_std": 1.0, "grad_norm": 0.01932627521455288, "kl": 0.013576515251770616, "learning_rate": 9.636035201482168e-07, "loss": 0.0007, "num_tokens": 108107961.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3930, "step_time": 21.849462278187275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 131.5, "completions/mean_terminated_length": 131.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2778580114245415, "epoch": 0.18207503473830478, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022489915136247873, "kl": 0.001488033216446638, "learning_rate": 9.635942566002777e-07, "loss": 0.0001, "num_tokens": 108129617.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3931, "step_time": 14.768348969519138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 191.0625, "completions/mean_terminated_length": 191.0625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.17658758535981178, "epoch": 0.18212135247799907, "frac_reward_zero_std": 0.0, "grad_norm": 0.10897427052259445, "kl": 0.004015539278043434, "learning_rate": 9.63584993052339e-07, "loss": -0.0634, "num_tokens": 108157170.0, "reward": 0.8715384006500244, "reward_std": 0.22949376702308655, "rewards/reward_func/mean": 0.8715384006500244, "rewards/reward_func/std": 0.22949376702308655, "step": 3932, "step_time": 22.75570983439684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 194.0625, "completions/mean_terminated_length": 194.0625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.43365979939699173, "epoch": 0.18216767021769337, "frac_reward_zero_std": 1.0, "grad_norm": 0.004585118032991886, "kl": 0.0038146452279761434, "learning_rate": 9.635757295044002e-07, "loss": 0.0002, "num_tokens": 108190739.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3933, "step_time": 25.706376645714045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.23387249931693077, "epoch": 0.1822139879573877, "frac_reward_zero_std": 1.0, "grad_norm": 0.00591407623142004, "kl": 0.005576587747782469, "learning_rate": 9.635664659564613e-07, "loss": 0.0003, "num_tokens": 108210977.0, "reward": 0.8070557117462158, "reward_std": 0.0, "rewards/reward_func/mean": 0.8070557117462158, "rewards/reward_func/std": 0.0, "step": 3934, "step_time": 15.148271512240171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 184.25, "completions/mean_terminated_length": 184.25, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.4190472513437271, "epoch": 0.182260305697082, "frac_reward_zero_std": 1.0, "grad_norm": 0.007171202916651964, "kl": 0.003614607499912381, "learning_rate": 9.635572024085225e-07, "loss": 0.0002, "num_tokens": 108246469.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3935, "step_time": 22.09712702408433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 202.0, "completions/mean_terminated_length": 202.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.38352401554584503, "epoch": 0.18230662343677628, "frac_reward_zero_std": 0.0, "grad_norm": 0.12347492575645447, "kl": 0.005324090830981731, "learning_rate": 9.635479388605836e-07, "loss": 0.0739, "num_tokens": 108274357.0, "reward": 0.4375, "reward_std": 0.5123475193977356, "rewards/reward_func/mean": 0.4375, "rewards/reward_func/std": 0.5123475790023804, "step": 3936, "step_time": 26.792213916778564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 144.125, "completions/mean_terminated_length": 144.125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.34267380833625793, "epoch": 0.18235294117647058, "frac_reward_zero_std": 1.0, "grad_norm": 0.002206089673563838, "kl": 0.0022476864396594465, "learning_rate": 9.635386753126447e-07, "loss": 0.0001, "num_tokens": 108326567.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3937, "step_time": 23.410325340926647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 193.875, "completions/mean_terminated_length": 193.875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.23739049211144447, "epoch": 0.1823992589161649, "frac_reward_zero_std": 0.0, "grad_norm": 0.09990550577640533, "kl": 0.0032598910038359463, "learning_rate": 9.635294117647058e-07, "loss": 0.1541, "num_tokens": 108347877.0, "reward": 0.7684125304222107, "reward_std": 0.23634617030620575, "rewards/reward_func/mean": 0.7684125304222107, "rewards/reward_func/std": 0.23634617030620575, "step": 3938, "step_time": 23.93851326778531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 244.375, "completions/mean_terminated_length": 244.375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.49285295605659485, "epoch": 0.1824455766558592, "frac_reward_zero_std": 0.0, "grad_norm": 0.0933506041765213, "kl": 0.006029486772604287, "learning_rate": 9.63520148216767e-07, "loss": 0.02, "num_tokens": 108371755.0, "reward": 0.3125, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.3125, "rewards/reward_func/std": 0.4787135720252991, "step": 3939, "step_time": 27.82623726502061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 132.625, "completions/mean_terminated_length": 132.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.28407369554042816, "epoch": 0.1824918943955535, "frac_reward_zero_std": 1.0, "grad_norm": 0.006024600937962532, "kl": 0.0027144767227582633, "learning_rate": 9.63510884668828e-07, "loss": 0.0001, "num_tokens": 108393109.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3940, "step_time": 16.098177798092365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 133.75, "completions/mean_terminated_length": 133.75, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2104021981358528, "epoch": 0.1825382121352478, "frac_reward_zero_std": 1.0, "grad_norm": 0.003316520946100354, "kl": 0.0015981484903022647, "learning_rate": 9.635016211208892e-07, "loss": 0.0001, "num_tokens": 108412769.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3941, "step_time": 13.816600162535906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 201.9375, "completions/mean_terminated_length": 201.9375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.41445229202508926, "epoch": 0.18258452987494211, "frac_reward_zero_std": 0.0, "grad_norm": 0.11863738298416138, "kl": 0.007025412167422473, "learning_rate": 9.634923575729505e-07, "loss": 0.1255, "num_tokens": 108444912.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 3942, "step_time": 27.154757909476757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 114.3125, "completions/mean_terminated_length": 114.3125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2739161103963852, "epoch": 0.1826308476146364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024053254164755344, "kl": 0.0017553191573824733, "learning_rate": 9.634830940250114e-07, "loss": 0.0001, "num_tokens": 108465525.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3943, "step_time": 12.522561896592379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 203.6875, "completions/mean_terminated_length": 203.6875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3565376326441765, "epoch": 0.1826771653543307, "frac_reward_zero_std": 1.0, "grad_norm": 0.012317704036831856, "kl": 0.007949843886308372, "learning_rate": 9.634738304770726e-07, "loss": 0.0004, "num_tokens": 108492976.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3944, "step_time": 23.16466485336423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 226.25, "completions/mean_terminated_length": 226.25, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "entropy": 0.22141293436288834, "epoch": 0.182723483094025, "frac_reward_zero_std": 1.0, "grad_norm": 0.00474140141159296, "kl": 0.004121715668588877, "learning_rate": 9.634645669291337e-07, "loss": 0.0002, "num_tokens": 108515732.0, "reward": 0.7206611633300781, "reward_std": 0.0, "rewards/reward_func/mean": 0.7206611633300781, "rewards/reward_func/std": 0.0, "step": 3945, "step_time": 21.747199185192585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 187.6875, "completions/mean_terminated_length": 187.6875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.1961042806506157, "epoch": 0.18276980083371933, "frac_reward_zero_std": 1.0, "grad_norm": 0.009400580078363419, "kl": 0.005008580919820815, "learning_rate": 9.63455303381195e-07, "loss": 0.0002, "num_tokens": 108540591.0, "reward": 0.951229453086853, "reward_std": 0.0, "rewards/reward_func/mean": 0.951229453086853, "rewards/reward_func/std": 0.0, "step": 3946, "step_time": 19.772193666547537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 182.375, "completions/mean_terminated_length": 182.375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.3062416911125183, "epoch": 0.18281611857341362, "frac_reward_zero_std": 0.0, "grad_norm": 0.12036958336830139, "kl": 0.00928862695582211, "learning_rate": 9.634460398332562e-07, "loss": 0.0938, "num_tokens": 108564645.0, "reward": 0.375, "reward_std": 0.5, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5, "step": 3947, "step_time": 21.514170866459608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 138.5625, "completions/mean_terminated_length": 138.5625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3367481976747513, "epoch": 0.18286243631310792, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019740720745176077, "kl": 0.0016995065961964428, "learning_rate": 9.634367762853173e-07, "loss": 0.0001, "num_tokens": 108598174.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3948, "step_time": 18.64675521478057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 176.3125, "completions/mean_terminated_length": 176.3125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.21268099546432495, "epoch": 0.1829087540528022, "frac_reward_zero_std": 0.0, "grad_norm": 0.09837204217910767, "kl": 0.0037456040736287832, "learning_rate": 9.634275127373784e-07, "loss": 0.0125, "num_tokens": 108626387.0, "reward": 0.990892767906189, "reward_std": 0.03642905503511429, "rewards/reward_func/mean": 0.990892767906189, "rewards/reward_func/std": 0.036429062485694885, "step": 3949, "step_time": 19.300760619342327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 169.4375, "completions/mean_terminated_length": 169.4375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3538523018360138, "epoch": 0.18295507179249654, "frac_reward_zero_std": 1.0, "grad_norm": 0.003690734039992094, "kl": 0.0029123123385943472, "learning_rate": 9.634182491894395e-07, "loss": 0.0001, "num_tokens": 108661610.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3950, "step_time": 20.566821537911892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 193.125, "completions/mean_terminated_length": 193.125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.35938917845487595, "epoch": 0.18300138953219083, "frac_reward_zero_std": 0.0, "grad_norm": 0.10962846130132675, "kl": 0.006495502311736345, "learning_rate": 9.634089856415007e-07, "loss": -0.0847, "num_tokens": 108699452.0, "reward": 0.25, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.44721361994743347, "step": 3951, "step_time": 25.562778376042843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.3935577720403671, "epoch": 0.18304770727188513, "frac_reward_zero_std": 1.0, "grad_norm": 0.006419269368052483, "kl": 0.004049824550747871, "learning_rate": 9.633997220935618e-07, "loss": 0.0002, "num_tokens": 108729420.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3952, "step_time": 19.729410778731108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 124.5625, "completions/mean_terminated_length": 124.5625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.1442272663116455, "epoch": 0.18309402501157943, "frac_reward_zero_std": 1.0, "grad_norm": 0.00809891615062952, "kl": 0.0035606708261184394, "learning_rate": 9.63390458545623e-07, "loss": 0.0002, "num_tokens": 108749749.0, "reward": 0.894839346408844, "reward_std": 0.0, "rewards/reward_func/mean": 0.894839346408844, "rewards/reward_func/std": 0.0, "step": 3953, "step_time": 14.20975099503994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 194.4375, "completions/mean_terminated_length": 194.4375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.34076155722141266, "epoch": 0.18314034275127375, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029215000104159117, "kl": 0.0025104266533162445, "learning_rate": 9.63381194997684e-07, "loss": 0.0001, "num_tokens": 108777292.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3954, "step_time": 20.822266440838575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 196.8125, "completions/mean_terminated_length": 196.8125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.37988148629665375, "epoch": 0.18318666049096805, "frac_reward_zero_std": 0.0, "grad_norm": 0.10158326476812363, "kl": 0.011375895468518138, "learning_rate": 9.633719314497452e-07, "loss": 0.0732, "num_tokens": 108801769.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 3955, "step_time": 22.12293777242303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 174.25, "completions/mean_terminated_length": 174.25, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.1992478109896183, "epoch": 0.18323297823066234, "frac_reward_zero_std": 1.0, "grad_norm": 0.00272434507496655, "kl": 0.002358984900638461, "learning_rate": 9.633626679018063e-07, "loss": 0.0001, "num_tokens": 108831309.0, "reward": 0.904837429523468, "reward_std": 0.0, "rewards/reward_func/mean": 0.904837429523468, "rewards/reward_func/std": 0.0, "step": 3956, "step_time": 20.238527458161116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 148.4375, "completions/mean_terminated_length": 148.4375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.34961116313934326, "epoch": 0.18327929597035664, "frac_reward_zero_std": 1.0, "grad_norm": 0.012762676924467087, "kl": 0.004101846134290099, "learning_rate": 9.633534043538674e-07, "loss": 0.0002, "num_tokens": 108855348.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3957, "step_time": 16.884622506797314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.2958749383687973, "epoch": 0.18332561371005096, "frac_reward_zero_std": 1.0, "grad_norm": 0.004096558783203363, "kl": 0.002952422248199582, "learning_rate": 9.633441408059285e-07, "loss": 0.0001, "num_tokens": 108877694.0, "reward": 0.7958667874336243, "reward_std": 0.0, "rewards/reward_func/mean": 0.7958667874336243, "rewards/reward_func/std": 0.0, "step": 3958, "step_time": 17.800376694649458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 323.25, "completions/mean_terminated_length": 323.25, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "entropy": 0.11619868874549866, "epoch": 0.18337193144974526, "frac_reward_zero_std": 1.0, "grad_norm": 0.000940678408369422, "kl": 0.0008944468863774091, "learning_rate": 9.633348772579899e-07, "loss": 0.0, "num_tokens": 108907170.0, "reward": 0.6894580721855164, "reward_std": 0.0, "rewards/reward_func/mean": 0.6894580721855164, "rewards/reward_func/std": 0.0, "step": 3959, "step_time": 30.210374638438225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 129.5625, "completions/mean_terminated_length": 129.5625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2954455837607384, "epoch": 0.18341824918943955, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016661534318700433, "kl": 0.001608467398909852, "learning_rate": 9.63325613710051e-07, "loss": 0.0001, "num_tokens": 108928395.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3960, "step_time": 14.256221912801266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 130.0625, "completions/mean_terminated_length": 130.0625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2928720563650131, "epoch": 0.18346456692913385, "frac_reward_zero_std": 1.0, "grad_norm": 0.002274853875860572, "kl": 0.0017797103500925004, "learning_rate": 9.63316350162112e-07, "loss": 0.0001, "num_tokens": 108951532.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3961, "step_time": 15.395901620388031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 137.3125, "completions/mean_terminated_length": 137.3125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.29976020008325577, "epoch": 0.18351088466882817, "frac_reward_zero_std": 1.0, "grad_norm": 0.009205182082951069, "kl": 0.003424542141146958, "learning_rate": 9.633070866141732e-07, "loss": 0.0002, "num_tokens": 108971841.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3962, "step_time": 15.533253353089094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 215.625, "completions/mean_terminated_length": 215.625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.41104743629693985, "epoch": 0.18355720240852247, "frac_reward_zero_std": 0.0, "grad_norm": 0.11541653424501419, "kl": 0.012414152850396931, "learning_rate": 9.632978230662344e-07, "loss": 0.0011, "num_tokens": 108993067.0, "reward": 0.01748797297477722, "reward_std": 0.0179652851074934, "rewards/reward_func/mean": 0.01748797297477722, "rewards/reward_func/std": 0.0179652851074934, "step": 3963, "step_time": 23.979558132588863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 138.875, "completions/mean_terminated_length": 138.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.33035293966531754, "epoch": 0.18360352014821676, "frac_reward_zero_std": 1.0, "grad_norm": 0.003688642056658864, "kl": 0.0030114661203697324, "learning_rate": 9.632885595182955e-07, "loss": 0.0002, "num_tokens": 109013193.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3964, "step_time": 15.232430435717106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 179.4375, "completions/mean_terminated_length": 179.4375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.2221505045890808, "epoch": 0.18364983788791106, "frac_reward_zero_std": 1.0, "grad_norm": 0.003660363843664527, "kl": 0.00352401816053316, "learning_rate": 9.632792959703566e-07, "loss": 0.0002, "num_tokens": 109034832.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3965, "step_time": 20.020007949322462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 200.875, "completions/mean_terminated_length": 200.875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.4312363564968109, "epoch": 0.18369615562760538, "frac_reward_zero_std": 1.0, "grad_norm": 0.005175455939024687, "kl": 0.004027256276458502, "learning_rate": 9.632700324224177e-07, "loss": 0.0002, "num_tokens": 109066414.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3966, "step_time": 22.746452674269676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 167.8125, "completions/mean_terminated_length": 167.8125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.2729054242372513, "epoch": 0.18374247336729968, "frac_reward_zero_std": 1.0, "grad_norm": 0.009631850756704807, "kl": 0.00643218751065433, "learning_rate": 9.632607688744789e-07, "loss": 0.0003, "num_tokens": 109092651.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3967, "step_time": 17.775084663182497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 123.5625, "completions/mean_terminated_length": 123.5625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2562769018113613, "epoch": 0.18378879110699398, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029653096571564674, "kl": 0.0018083448521792889, "learning_rate": 9.6325150532654e-07, "loss": 0.0001, "num_tokens": 109114180.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3968, "step_time": 14.097911071032286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 201.375, "completions/mean_terminated_length": 201.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.2452106587588787, "epoch": 0.18383510884668827, "frac_reward_zero_std": 0.0, "grad_norm": 0.1170409545302391, "kl": 0.02185472333803773, "learning_rate": 9.632422417786011e-07, "loss": -0.0253, "num_tokens": 109145978.0, "reward": 0.9886171817779541, "reward_std": 0.017437174916267395, "rewards/reward_func/mean": 0.9886171817779541, "rewards/reward_func/std": 0.017437167465686798, "step": 3969, "step_time": 22.349928356707096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 157.6875, "completions/mean_terminated_length": 157.6875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.1739608719944954, "epoch": 0.1838814265863826, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022780471481382847, "kl": 0.0016107640403788537, "learning_rate": 9.632329782306622e-07, "loss": 0.0001, "num_tokens": 109170277.0, "reward": 0.9394130706787109, "reward_std": 0.0, "rewards/reward_func/mean": 0.9394130706787109, "rewards/reward_func/std": 0.0, "step": 3970, "step_time": 17.16651639714837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 139.0, "completions/mean_terminated_length": 139.0, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.30899014323949814, "epoch": 0.1839277443260769, "frac_reward_zero_std": 1.0, "grad_norm": 0.005675748456269503, "kl": 0.002892179589252919, "learning_rate": 9.632237146827234e-07, "loss": 0.0001, "num_tokens": 109192277.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3971, "step_time": 15.727838676422834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 162.1875, "completions/mean_terminated_length": 162.1875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3050469681620598, "epoch": 0.1839740620657712, "frac_reward_zero_std": 1.0, "grad_norm": 0.003201049519702792, "kl": 0.0022730419295839965, "learning_rate": 9.632144511347847e-07, "loss": 0.0001, "num_tokens": 109214296.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3972, "step_time": 16.43618332967162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 207.5625, "completions/mean_terminated_length": 207.5625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.39574429392814636, "epoch": 0.18402037980546548, "frac_reward_zero_std": 0.0, "grad_norm": 0.1119006797671318, "kl": 0.009786227601580322, "learning_rate": 9.632051875868458e-07, "loss": 0.025, "num_tokens": 109237921.0, "reward": 0.625, "reward_std": 0.5, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5, "step": 3973, "step_time": 23.224096555262804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 187.5, "completions/mean_terminated_length": 187.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.21707920357584953, "epoch": 0.1840666975451598, "frac_reward_zero_std": 0.0, "grad_norm": 0.1409747302532196, "kl": 0.028514136094599962, "learning_rate": 9.631959240389067e-07, "loss": -0.0607, "num_tokens": 109275449.0, "reward": 0.5498151779174805, "reward_std": 0.24936477839946747, "rewards/reward_func/mean": 0.5498151779174805, "rewards/reward_func/std": 0.24936480820178986, "step": 3974, "step_time": 23.64865927770734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 197.25, "completions/mean_terminated_length": 197.25, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.308023177087307, "epoch": 0.1841130152848541, "frac_reward_zero_std": 1.0, "grad_norm": 0.008185689337551594, "kl": 0.008744140737690032, "learning_rate": 9.631866604909679e-07, "loss": 0.0004, "num_tokens": 109300765.0, "reward": 0.3992621898651123, "reward_std": 0.0, "rewards/reward_func/mean": 0.3992621898651123, "rewards/reward_func/std": 0.0, "step": 3975, "step_time": 24.08602061495185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 168.25, "completions/mean_terminated_length": 168.25, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.18104015290737152, "epoch": 0.1841593330245484, "frac_reward_zero_std": 0.0, "grad_norm": 0.1370517462491989, "kl": 0.0021947959903627634, "learning_rate": 9.631773969430292e-07, "loss": -0.0095, "num_tokens": 109336433.0, "reward": 0.8720898628234863, "reward_std": 0.0024667978286743164, "rewards/reward_func/mean": 0.8720898628234863, "rewards/reward_func/std": 0.0024667978286743164, "step": 3976, "step_time": 20.16820090636611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 204.75, "completions/mean_terminated_length": 204.75, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.2309229113161564, "epoch": 0.1842056507642427, "frac_reward_zero_std": 0.0, "grad_norm": 0.11665613204240799, "kl": 0.007314708083868027, "learning_rate": 9.631681333950903e-07, "loss": 0.0111, "num_tokens": 109365101.0, "reward": 0.7043023109436035, "reward_std": 0.22019338607788086, "rewards/reward_func/mean": 0.7043023109436035, "rewards/reward_func/std": 0.22019338607788086, "step": 3977, "step_time": 22.09240308776498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 166.25, "completions/mean_terminated_length": 166.25, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.40785839408636093, "epoch": 0.18425196850393702, "frac_reward_zero_std": 1.0, "grad_norm": 0.003447173163294792, "kl": 0.0027228702674619853, "learning_rate": 9.631588698471515e-07, "loss": 0.0001, "num_tokens": 109404561.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3978, "step_time": 21.196227714419365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 136.5, "completions/mean_terminated_length": 136.5, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.23304549604654312, "epoch": 0.18429828624363132, "frac_reward_zero_std": 1.0, "grad_norm": 0.0072014289908111095, "kl": 0.00271628238260746, "learning_rate": 9.631496062992126e-07, "loss": 0.0001, "num_tokens": 109424985.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3979, "step_time": 16.092280738055706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 132.6875, "completions/mean_terminated_length": 132.6875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2900601103901863, "epoch": 0.1843446039833256, "frac_reward_zero_std": 1.0, "grad_norm": 0.001795622636564076, "kl": 0.0015797052474226803, "learning_rate": 9.631403427512737e-07, "loss": 0.0001, "num_tokens": 109448340.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3980, "step_time": 15.006452061235905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 147.5625, "completions/mean_terminated_length": 147.5625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.40439480543136597, "epoch": 0.1843909217230199, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036746393889188766, "kl": 0.0026609241031110287, "learning_rate": 9.631310792033348e-07, "loss": 0.0001, "num_tokens": 109471469.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3981, "step_time": 16.04165256768465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 193.875, "completions/mean_terminated_length": 193.875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.16588760539889336, "epoch": 0.18443723946271423, "frac_reward_zero_std": 1.0, "grad_norm": 0.002614664612337947, "kl": 0.0015347810985986143, "learning_rate": 9.63121815655396e-07, "loss": 0.0001, "num_tokens": 109517227.0, "reward": 0.9181891679763794, "reward_std": 0.0, "rewards/reward_func/mean": 0.9181891679763794, "rewards/reward_func/std": 0.0, "step": 3982, "step_time": 24.127741757780313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 190.9375, "completions/mean_terminated_length": 190.9375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.20520596951246262, "epoch": 0.18448355720240853, "frac_reward_zero_std": 0.0, "grad_norm": 0.09583383798599243, "kl": 0.0028633428155444562, "learning_rate": 9.63112552107457e-07, "loss": 0.0318, "num_tokens": 109540026.0, "reward": 0.9898674488067627, "reward_std": 0.02178443968296051, "rewards/reward_func/mean": 0.9898674488067627, "rewards/reward_func/std": 0.02178444340825081, "step": 3983, "step_time": 19.54361553490162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 273.3125, "completions/mean_terminated_length": 273.3125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.17532089725136757, "epoch": 0.18452987494210282, "frac_reward_zero_std": 0.0, "grad_norm": 0.11145468056201935, "kl": 0.02424334827810526, "learning_rate": 9.631032885595182e-07, "loss": -0.0232, "num_tokens": 109565951.0, "reward": 0.9905068874359131, "reward_std": 0.012657547369599342, "rewards/reward_func/mean": 0.9905068874359131, "rewards/reward_func/std": 0.012657553888857365, "step": 3984, "step_time": 26.740829281508923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 192.25, "completions/mean_terminated_length": 192.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.4750644564628601, "epoch": 0.18457619268179712, "frac_reward_zero_std": 1.0, "grad_norm": 0.004511016421020031, "kl": 0.0034640480880625546, "learning_rate": 9.630940250115795e-07, "loss": 0.0002, "num_tokens": 109591043.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3985, "step_time": 21.56521673128009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 122.625, "completions/mean_terminated_length": 122.625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.28628551959991455, "epoch": 0.18462251042149144, "frac_reward_zero_std": 1.0, "grad_norm": 0.01212072093039751, "kl": 0.0029937040817458183, "learning_rate": 9.630847614636405e-07, "loss": 0.0002, "num_tokens": 109611773.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3986, "step_time": 12.988273493945599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 127.5, "completions/mean_terminated_length": 127.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.1995968595147133, "epoch": 0.18466882816118574, "frac_reward_zero_std": 1.0, "grad_norm": 0.005069428123533726, "kl": 0.002444858255330473, "learning_rate": 9.630754979157016e-07, "loss": 0.0001, "num_tokens": 109631237.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3987, "step_time": 13.890676397830248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 151.0625, "completions/mean_terminated_length": 151.0625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.33534102141857147, "epoch": 0.18471514590088003, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027537329588085413, "kl": 0.002736641326919198, "learning_rate": 9.630662343677627e-07, "loss": 0.0001, "num_tokens": 109656006.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3988, "step_time": 17.098580598831177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 134.125, "completions/mean_terminated_length": 134.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3812548294663429, "epoch": 0.18476146364057433, "frac_reward_zero_std": 1.0, "grad_norm": 0.005437885876744986, "kl": 0.0031718131504021585, "learning_rate": 9.63056970819824e-07, "loss": 0.0002, "num_tokens": 109679448.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3989, "step_time": 15.155997782945633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 134.25, "completions/mean_terminated_length": 134.25, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.18073805794119835, "epoch": 0.18480778138026865, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018739913357421756, "kl": 0.001899892173241824, "learning_rate": 9.630477072718852e-07, "loss": 0.0001, "num_tokens": 109702124.0, "reward": 0.6431870460510254, "reward_std": 0.0, "rewards/reward_func/mean": 0.6431870460510254, "rewards/reward_func/std": 0.0, "step": 3990, "step_time": 15.018368661403656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 182.8125, "completions/mean_terminated_length": 182.8125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.31624697893857956, "epoch": 0.18485409911996295, "frac_reward_zero_std": 0.0, "grad_norm": 0.12621982395648956, "kl": 0.010969286668114364, "learning_rate": 9.630384437239463e-07, "loss": -0.0211, "num_tokens": 109723609.0, "reward": 0.30878984928131104, "reward_std": 0.4730664789676666, "rewards/reward_func/mean": 0.30878984928131104, "rewards/reward_func/std": 0.4730664789676666, "step": 3991, "step_time": 19.846620678901672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 149.375, "completions/mean_terminated_length": 149.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.36934104561805725, "epoch": 0.18490041685965725, "frac_reward_zero_std": 1.0, "grad_norm": 0.005815388169139624, "kl": 0.0047956041526049376, "learning_rate": 9.630291801760074e-07, "loss": 0.0002, "num_tokens": 109759727.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3992, "step_time": 19.237177781760693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 157.0625, "completions/mean_terminated_length": 157.0625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.39627500623464584, "epoch": 0.18494673459935154, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027054885867983103, "kl": 0.0024817449739202857, "learning_rate": 9.630199166280685e-07, "loss": 0.0001, "num_tokens": 109804080.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3993, "step_time": 25.613412898033857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 138.5, "completions/mean_terminated_length": 138.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.25153781473636627, "epoch": 0.18499305233904587, "frac_reward_zero_std": 1.0, "grad_norm": 0.006807493511587381, "kl": 0.0034551440621726215, "learning_rate": 9.630106530801297e-07, "loss": 0.0002, "num_tokens": 109824216.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3994, "step_time": 14.718906585127115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 164.75, "completions/mean_terminated_length": 164.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.1745079718530178, "epoch": 0.18503937007874016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036642674822360277, "kl": 0.002100349054671824, "learning_rate": 9.630013895321908e-07, "loss": 0.0001, "num_tokens": 109850052.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 3995, "step_time": 19.60317961871624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 167.625, "completions/mean_terminated_length": 167.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.43344592303037643, "epoch": 0.18508568781843446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022423649206757545, "kl": 0.0022812695242464542, "learning_rate": 9.62992125984252e-07, "loss": 0.0001, "num_tokens": 109901406.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 3996, "step_time": 25.30669067800045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 208.0625, "completions/mean_terminated_length": 208.0625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.20702053233981133, "epoch": 0.18513200555812875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035511634778231382, "kl": 0.010278586996719241, "learning_rate": 9.62982862436313e-07, "loss": 0.0005, "num_tokens": 109924015.0, "reward": 0.6726685166358948, "reward_std": 0.0, "rewards/reward_func/mean": 0.6726685166358948, "rewards/reward_func/std": 0.0, "step": 3997, "step_time": 21.41838786378503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 183.6875, "completions/mean_terminated_length": 183.6875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.2692618891596794, "epoch": 0.18517832329782308, "frac_reward_zero_std": 1.0, "grad_norm": 0.005457677878439426, "kl": 0.004745265352539718, "learning_rate": 9.629735988883742e-07, "loss": 0.0002, "num_tokens": 109948218.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3998, "step_time": 19.842437531799078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 274.625, "completions/mean_terminated_length": 274.625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.22472453117370605, "epoch": 0.18522464103751737, "frac_reward_zero_std": 1.0, "grad_norm": 0.007122904062271118, "kl": 0.0067714008037000895, "learning_rate": 9.629643353404353e-07, "loss": 0.0003, "num_tokens": 109983012.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 3999, "step_time": 35.84325436875224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 278.5, "completions/mean_terminated_length": 278.5, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.29957157373428345, "epoch": 0.18527095877721167, "frac_reward_zero_std": 0.0, "grad_norm": 0.08676525950431824, "kl": 0.007300138706341386, "learning_rate": 9.629550717924964e-07, "loss": -0.0123, "num_tokens": 110013420.0, "reward": 0.7593790292739868, "reward_std": 0.38766154646873474, "rewards/reward_func/mean": 0.7593790292739868, "rewards/reward_func/std": 0.38766157627105713, "step": 4000, "step_time": 28.48399420455098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 130.8125, "completions/mean_terminated_length": 130.8125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.24999059736728668, "epoch": 0.18531727651690597, "frac_reward_zero_std": 1.0, "grad_norm": 0.00222027231939137, "kl": 0.0014751394337508827, "learning_rate": 9.629458082445575e-07, "loss": 0.0001, "num_tokens": 110033033.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4001, "step_time": 13.585205253213644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 166.875, "completions/mean_terminated_length": 166.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3456791341304779, "epoch": 0.1853635942566003, "frac_reward_zero_std": 1.0, "grad_norm": 0.006335800979286432, "kl": 0.00560496945399791, "learning_rate": 9.629365446966189e-07, "loss": 0.0003, "num_tokens": 110056103.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4002, "step_time": 19.210248716175556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 129.125, "completions/mean_terminated_length": 129.125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.21131492778658867, "epoch": 0.18540991199629459, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027449503540992737, "kl": 0.001373827486531809, "learning_rate": 9.6292728114868e-07, "loss": 0.0001, "num_tokens": 110077577.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4003, "step_time": 14.045453313738108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 109.375, "completions/mean_terminated_length": 109.375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.27837956696748734, "epoch": 0.18545622973598888, "frac_reward_zero_std": 1.0, "grad_norm": 0.006750398315489292, "kl": 0.004190101637504995, "learning_rate": 9.629180176007411e-07, "loss": 0.0002, "num_tokens": 110096799.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4004, "step_time": 13.59730527177453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 217.1875, "completions/mean_terminated_length": 217.1875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.16105342656373978, "epoch": 0.18550254747568318, "frac_reward_zero_std": 1.0, "grad_norm": 0.005388450343161821, "kl": 0.004298588493838906, "learning_rate": 9.62908754052802e-07, "loss": 0.0002, "num_tokens": 110120658.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4005, "step_time": 20.983232606202364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 122.625, "completions/mean_terminated_length": 122.625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.31485600769519806, "epoch": 0.1855488652153775, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034729966428130865, "kl": 0.002158529096050188, "learning_rate": 9.628994905048634e-07, "loss": 0.0001, "num_tokens": 110142540.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4006, "step_time": 13.971363630145788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 168.1875, "completions/mean_terminated_length": 168.1875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.37800244241952896, "epoch": 0.1855951829550718, "frac_reward_zero_std": 1.0, "grad_norm": 0.002700837329030037, "kl": 0.0019029233371838927, "learning_rate": 9.628902269569245e-07, "loss": 0.0001, "num_tokens": 110177631.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4007, "step_time": 21.014577466994524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 190.5, "completions/mean_terminated_length": 190.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.1682998165488243, "epoch": 0.1856415006947661, "frac_reward_zero_std": 1.0, "grad_norm": 0.00239096162840724, "kl": 0.002248078992124647, "learning_rate": 9.628809634089856e-07, "loss": 0.0001, "num_tokens": 110205863.0, "reward": 0.9364250898361206, "reward_std": 0.0, "rewards/reward_func/mean": 0.9364250898361206, "rewards/reward_func/std": 0.0, "step": 4008, "step_time": 20.772039148956537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 128.75, "completions/mean_terminated_length": 128.75, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2897027060389519, "epoch": 0.1856878184344604, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018700878135859966, "kl": 0.0014238965668482706, "learning_rate": 9.628716998610468e-07, "loss": 0.0001, "num_tokens": 110230131.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4009, "step_time": 14.397155273705721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 216.6875, "completions/mean_terminated_length": 216.6875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.23186129331588745, "epoch": 0.1857341361741547, "frac_reward_zero_std": 1.0, "grad_norm": 0.01045618299394846, "kl": 0.026143222115933895, "learning_rate": 9.628624363131079e-07, "loss": 0.0013, "num_tokens": 110253662.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4010, "step_time": 20.775734931230545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 150.5, "completions/mean_terminated_length": 150.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.23873531445860863, "epoch": 0.185780453913849, "frac_reward_zero_std": 0.0, "grad_norm": 0.14374826848506927, "kl": 0.00922016124241054, "learning_rate": 9.62853172765169e-07, "loss": -0.0157, "num_tokens": 110276806.0, "reward": 0.11073227226734161, "reward_std": 0.2918255031108856, "rewards/reward_func/mean": 0.11073227226734161, "rewards/reward_func/std": 0.2918255031108856, "step": 4011, "step_time": 16.624506752938032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 128.625, "completions/mean_terminated_length": 128.625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.326849564909935, "epoch": 0.1858267716535433, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029834227170795202, "kl": 0.0025584730319678783, "learning_rate": 9.628439092172301e-07, "loss": 0.0001, "num_tokens": 110301152.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4012, "step_time": 15.393717229366302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 214.9375, "completions/mean_terminated_length": 214.9375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.303561232984066, "epoch": 0.1858730893932376, "frac_reward_zero_std": 0.0, "grad_norm": 0.08768731355667114, "kl": 0.00878506456501782, "learning_rate": 9.628346456692913e-07, "loss": 0.0508, "num_tokens": 110339519.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 4013, "step_time": 25.936167631298304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 287.75, "completions/mean_terminated_length": 287.75, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.44467893242836, "epoch": 0.18591940713293192, "frac_reward_zero_std": 0.0, "grad_norm": 0.09070470184087753, "kl": 0.00987121183425188, "learning_rate": 9.628253821213524e-07, "loss": -0.0279, "num_tokens": 110362539.0, "reward": 0.758585512638092, "reward_std": 0.38068443536758423, "rewards/reward_func/mean": 0.758585512638092, "rewards/reward_func/std": 0.38068443536758423, "step": 4014, "step_time": 28.47284570708871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 127.5, "completions/mean_terminated_length": 127.5, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.13524066284298897, "epoch": 0.18596572487262622, "frac_reward_zero_std": 1.0, "grad_norm": 0.004189995117485523, "kl": 0.002879993262467906, "learning_rate": 9.628161185734135e-07, "loss": 0.0001, "num_tokens": 110387731.0, "reward": 0.3441537916660309, "reward_std": 0.0, "rewards/reward_func/mean": 0.3441537916660309, "rewards/reward_func/std": 0.0, "step": 4015, "step_time": 15.309558905661106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 169.25, "completions/mean_terminated_length": 169.25, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.23994186893105507, "epoch": 0.18601204261232052, "frac_reward_zero_std": 1.0, "grad_norm": 0.19447307288646698, "kl": 0.03730075154453516, "learning_rate": 9.628068550254748e-07, "loss": 0.0019, "num_tokens": 110410375.0, "reward": 0.904837429523468, "reward_std": 0.0, "rewards/reward_func/mean": 0.904837429523468, "rewards/reward_func/std": 0.0, "step": 4016, "step_time": 19.559921495616436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 166.9375, "completions/mean_terminated_length": 166.9375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.25637052953243256, "epoch": 0.1860583603520148, "frac_reward_zero_std": 1.0, "grad_norm": 0.004210201092064381, "kl": 0.0032640844001434743, "learning_rate": 9.627975914775358e-07, "loss": 0.0002, "num_tokens": 110431078.0, "reward": 0.9131007194519043, "reward_std": 0.0, "rewards/reward_func/mean": 0.9131007194519043, "rewards/reward_func/std": 0.0, "step": 4017, "step_time": 17.773968800902367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 249.125, "completions/mean_terminated_length": 249.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.3417166396975517, "epoch": 0.18610467809170914, "frac_reward_zero_std": 0.0, "grad_norm": 0.08424817025661469, "kl": 0.014773907605558634, "learning_rate": 9.627883279295969e-07, "loss": -0.0777, "num_tokens": 110458632.0, "reward": 0.4639071226119995, "reward_std": 0.360614538192749, "rewards/reward_func/mean": 0.4639071226119995, "rewards/reward_func/std": 0.3606145679950714, "step": 4018, "step_time": 29.189542088657618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 183.9375, "completions/mean_terminated_length": 183.9375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.3959822431206703, "epoch": 0.18615099583140343, "frac_reward_zero_std": 1.0, "grad_norm": 0.004899558611214161, "kl": 0.004724961938336492, "learning_rate": 9.627790643816582e-07, "loss": 0.0002, "num_tokens": 110480439.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4019, "step_time": 18.707082144916058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 168.4375, "completions/mean_terminated_length": 168.4375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.2049410603940487, "epoch": 0.18619731357109773, "frac_reward_zero_std": 1.0, "grad_norm": 0.008570501580834389, "kl": 0.008052483783103526, "learning_rate": 9.627698008337193e-07, "loss": 0.0004, "num_tokens": 110503214.0, "reward": 0.9459594488143921, "reward_std": 0.0, "rewards/reward_func/mean": 0.9459594488143921, "rewards/reward_func/std": 0.0, "step": 4020, "step_time": 17.276382356882095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 235.9375, "completions/mean_terminated_length": 235.9375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.24076197668910027, "epoch": 0.18624363131079202, "frac_reward_zero_std": 0.0, "grad_norm": 0.0847514346241951, "kl": 0.011526466347277164, "learning_rate": 9.627605372857805e-07, "loss": -0.0824, "num_tokens": 110541789.0, "reward": 0.6347875595092773, "reward_std": 0.3249886929988861, "rewards/reward_func/mean": 0.6347875595092773, "rewards/reward_func/std": 0.3249886929988861, "step": 4021, "step_time": 26.530835896730423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 240.625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.25206584483385086, "epoch": 0.18628994905048635, "frac_reward_zero_std": 0.0, "grad_norm": 0.09857936203479767, "kl": 0.032442583702504635, "learning_rate": 9.627512737378416e-07, "loss": 0.0109, "num_tokens": 110580503.0, "reward": 0.9319667816162109, "reward_std": 0.2495037168264389, "rewards/reward_func/mean": 0.9319667816162109, "rewards/reward_func/std": 0.2495037168264389, "step": 4022, "step_time": 29.375940918922424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 184.3125, "completions/mean_terminated_length": 184.3125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.25172628462314606, "epoch": 0.18633626679018064, "frac_reward_zero_std": 1.0, "grad_norm": 0.008962012827396393, "kl": 0.006980260368436575, "learning_rate": 9.627420101899027e-07, "loss": 0.0003, "num_tokens": 110613516.0, "reward": 0.8742223381996155, "reward_std": 0.0, "rewards/reward_func/mean": 0.8742223381996155, "rewards/reward_func/std": 0.0, "step": 4023, "step_time": 23.098502170294523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 201.5625, "completions/mean_terminated_length": 201.5625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.24033070728182793, "epoch": 0.18638258452987494, "frac_reward_zero_std": 1.0, "grad_norm": 0.002025859896093607, "kl": 0.0017823771049734205, "learning_rate": 9.627327466419638e-07, "loss": 0.0001, "num_tokens": 110668325.0, "reward": 0.5623413324356079, "reward_std": 0.0, "rewards/reward_func/mean": 0.5623413324356079, "rewards/reward_func/std": 0.0, "step": 4024, "step_time": 27.585756354033947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 222.5, "completions/mean_terminated_length": 222.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.21306084096431732, "epoch": 0.18642890226956924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013144618133082986, "kl": 0.001131007564254105, "learning_rate": 9.62723483094025e-07, "loss": 0.0001, "num_tokens": 110701565.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4025, "step_time": 24.947600785642862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 166.3125, "completions/mean_terminated_length": 166.3125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.4107777997851372, "epoch": 0.18647522000926356, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022324291057884693, "kl": 0.0021487894118763506, "learning_rate": 9.62714219546086e-07, "loss": 0.0001, "num_tokens": 110733154.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4026, "step_time": 19.024696111679077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 188.875, "completions/mean_terminated_length": 188.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.17104016616940498, "epoch": 0.18652153774895786, "frac_reward_zero_std": 1.0, "grad_norm": 0.002095515839755535, "kl": 0.0012917412968818098, "learning_rate": 9.627049559981472e-07, "loss": 0.0001, "num_tokens": 110764256.0, "reward": 0.11362193524837494, "reward_std": 0.0, "rewards/reward_func/mean": 0.11362193524837494, "rewards/reward_func/std": 0.0, "step": 4027, "step_time": 20.831158369779587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 141.375, "completions/mean_terminated_length": 141.375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.26488659530878067, "epoch": 0.18656785548865215, "frac_reward_zero_std": 1.0, "grad_norm": 0.005742703098803759, "kl": 0.003191460156813264, "learning_rate": 9.626956924502083e-07, "loss": 0.0002, "num_tokens": 110784102.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4028, "step_time": 16.903685934841633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 233.625, "completions/mean_terminated_length": 233.625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.2235318385064602, "epoch": 0.18661417322834645, "frac_reward_zero_std": 0.0, "grad_norm": 0.19104117155075073, "kl": 0.004938063560985029, "learning_rate": 9.626864289022695e-07, "loss": -0.0354, "num_tokens": 110817136.0, "reward": 0.5958160161972046, "reward_std": 0.10778238624334335, "rewards/reward_func/mean": 0.5958160161972046, "rewards/reward_func/std": 0.10778239369392395, "step": 4029, "step_time": 24.90700952708721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 171.375, "completions/mean_terminated_length": 171.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.256470900028944, "epoch": 0.18666049096804077, "frac_reward_zero_std": 1.0, "grad_norm": 0.0041847629472613335, "kl": 0.004776358720846474, "learning_rate": 9.626771653543306e-07, "loss": 0.0002, "num_tokens": 110839766.0, "reward": 0.1910344958305359, "reward_std": 0.0, "rewards/reward_func/mean": 0.1910344958305359, "rewards/reward_func/std": 0.0, "step": 4030, "step_time": 17.760415386408567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 174.625, "completions/mean_terminated_length": 174.625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.4725622311234474, "epoch": 0.18670680870773507, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038015469908714294, "kl": 0.0030454796506091952, "learning_rate": 9.626679018063917e-07, "loss": 0.0002, "num_tokens": 110861312.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4031, "step_time": 18.09712029993534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 210.25, "completions/mean_terminated_length": 210.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.37680091708898544, "epoch": 0.18675312644742936, "frac_reward_zero_std": 0.0, "grad_norm": 0.0995647981762886, "kl": 0.008350224932655692, "learning_rate": 9.62658638258453e-07, "loss": 0.0737, "num_tokens": 110894404.0, "reward": 0.4639376401901245, "reward_std": 0.47846540808677673, "rewards/reward_func/mean": 0.4639376401901245, "rewards/reward_func/std": 0.4784654676914215, "step": 4032, "step_time": 26.580738559365273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 222.75, "completions/mean_terminated_length": 222.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.33753830194473267, "epoch": 0.18679944418712366, "frac_reward_zero_std": 0.0, "grad_norm": 0.10607600957155228, "kl": 0.013644204940646887, "learning_rate": 9.626493747105142e-07, "loss": -0.0514, "num_tokens": 110919616.0, "reward": 0.6257338523864746, "reward_std": 0.16443493962287903, "rewards/reward_func/mean": 0.6257338523864746, "rewards/reward_func/std": 0.16443493962287903, "step": 4033, "step_time": 23.17309584468603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 267.0, "completions/mean_terminated_length": 267.0, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.33476605266332626, "epoch": 0.18684576192681798, "frac_reward_zero_std": 0.0, "grad_norm": 0.09008869528770447, "kl": 0.010737331118434668, "learning_rate": 9.626401111625753e-07, "loss": -0.0856, "num_tokens": 110947216.0, "reward": 0.36792483925819397, "reward_std": 0.2843818664550781, "rewards/reward_func/mean": 0.36792483925819397, "rewards/reward_func/std": 0.2843818962574005, "step": 4034, "step_time": 28.35039295628667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 240.75, "completions/mean_terminated_length": 240.75, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.23919912800192833, "epoch": 0.18689207966651228, "frac_reward_zero_std": 0.0, "grad_norm": 0.12189947813749313, "kl": 0.030286923982203007, "learning_rate": 9.626308476146362e-07, "loss": 0.0012, "num_tokens": 110984316.0, "reward": 0.38853946328163147, "reward_std": 0.012802098877727985, "rewards/reward_func/mean": 0.38853946328163147, "rewards/reward_func/std": 0.012802098877727985, "step": 4035, "step_time": 27.309667088091373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 145.75, "completions/mean_terminated_length": 145.75, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3725769445300102, "epoch": 0.18693839740620657, "frac_reward_zero_std": 1.0, "grad_norm": 0.003245528554543853, "kl": 0.0025223796255886555, "learning_rate": 9.626215840666975e-07, "loss": 0.0001, "num_tokens": 111020408.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4036, "step_time": 18.651448875665665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 134.375, "completions/mean_terminated_length": 134.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.3189077600836754, "epoch": 0.18698471514590087, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030886817257851362, "kl": 0.0024825698928907514, "learning_rate": 9.626123205187587e-07, "loss": 0.0001, "num_tokens": 111045342.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4037, "step_time": 16.30925925076008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.26514882594347, "epoch": 0.1870310328855952, "frac_reward_zero_std": 0.0, "grad_norm": 0.29529812932014465, "kl": 0.015383483376353979, "learning_rate": 9.626030569708198e-07, "loss": 0.1239, "num_tokens": 111066840.0, "reward": 0.3272396922111511, "reward_std": 0.26179173588752747, "rewards/reward_func/mean": 0.3272396922111511, "rewards/reward_func/std": 0.26179173588752747, "step": 4038, "step_time": 20.36912925541401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 240.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.478082574903965, "epoch": 0.1870773506252895, "frac_reward_zero_std": 0.0, "grad_norm": 0.1142750084400177, "kl": 0.0057019483065232635, "learning_rate": 9.62593793422881e-07, "loss": 0.2009, "num_tokens": 111091666.0, "reward": 0.6875, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.6875, "rewards/reward_func/std": 0.4787135720252991, "step": 4039, "step_time": 29.76534355804324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 152.625, "completions/mean_terminated_length": 152.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3431262820959091, "epoch": 0.1871236683649838, "frac_reward_zero_std": 1.0, "grad_norm": 0.00506778247654438, "kl": 0.003924763586837798, "learning_rate": 9.62584529874942e-07, "loss": 0.0002, "num_tokens": 111114412.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4040, "step_time": 16.78785802423954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 145.0, "completions/mean_terminated_length": 145.0, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.36675096303224564, "epoch": 0.18716998610467808, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029439309146255255, "kl": 0.0022743356821592897, "learning_rate": 9.625752663270032e-07, "loss": 0.0001, "num_tokens": 111144844.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4041, "step_time": 19.179258815944195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 176.3125, "completions/mean_terminated_length": 176.3125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.2295929342508316, "epoch": 0.1872163038443724, "frac_reward_zero_std": 0.0, "grad_norm": 0.1008884459733963, "kl": 0.00183072779327631, "learning_rate": 9.625660027790643e-07, "loss": -0.0112, "num_tokens": 111167441.0, "reward": 0.9814902544021606, "reward_std": 0.033111222088336945, "rewards/reward_func/mean": 0.9814902544021606, "rewards/reward_func/std": 0.03311121463775635, "step": 4042, "step_time": 20.266985408961773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 155.0625, "completions/mean_terminated_length": 155.0625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.3718869537115097, "epoch": 0.1872626215840667, "frac_reward_zero_std": 1.0, "grad_norm": 0.004450637381523848, "kl": 0.003298588388133794, "learning_rate": 9.625567392311254e-07, "loss": 0.0002, "num_tokens": 111203794.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4043, "step_time": 19.718469090759754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 137.625, "completions/mean_terminated_length": 137.625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.23140939697623253, "epoch": 0.187308939323761, "frac_reward_zero_std": 1.0, "grad_norm": 0.004221647512167692, "kl": 0.0023980710247997195, "learning_rate": 9.625474756831865e-07, "loss": 0.0001, "num_tokens": 111224588.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4044, "step_time": 15.231382239609957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 191.25, "completions/mean_terminated_length": 191.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.18196557834744453, "epoch": 0.1873552570634553, "frac_reward_zero_std": 1.0, "grad_norm": 0.002249514451250434, "kl": 0.0017824685492087156, "learning_rate": 9.625382121352477e-07, "loss": 0.0001, "num_tokens": 111247680.0, "reward": 0.8657099008560181, "reward_std": 0.0, "rewards/reward_func/mean": 0.8657099008560181, "rewards/reward_func/std": 0.0, "step": 4045, "step_time": 22.431513603776693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 196.5625, "completions/mean_terminated_length": 196.5625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.40521497279405594, "epoch": 0.18740157480314962, "frac_reward_zero_std": 0.0, "grad_norm": 0.12528221309185028, "kl": 0.011679814429953694, "learning_rate": 9.62528948587309e-07, "loss": -0.0223, "num_tokens": 111286713.0, "reward": 0.0009923388715833426, "reward_std": 0.00396935548633337, "rewards/reward_func/mean": 0.0009923388715833426, "rewards/reward_func/std": 0.00396935548633337, "step": 4046, "step_time": 23.81155974417925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 120.3125, "completions/mean_terminated_length": 120.3125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.257652398198843, "epoch": 0.1874478925428439, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028297470416873693, "kl": 0.0019749358762055635, "learning_rate": 9.625196850393701e-07, "loss": 0.0001, "num_tokens": 111306862.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4047, "step_time": 12.967925600707531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 177.0625, "completions/mean_terminated_length": 177.0625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.2123894840478897, "epoch": 0.1874942102825382, "frac_reward_zero_std": 0.0, "grad_norm": 0.10444404184818268, "kl": 0.01974491006694734, "learning_rate": 9.62510421491431e-07, "loss": 0.0177, "num_tokens": 111350639.0, "reward": 0.965315580368042, "reward_std": 0.05313253030180931, "rewards/reward_func/mean": 0.965315580368042, "rewards/reward_func/std": 0.05313252657651901, "step": 4048, "step_time": 24.339804265648127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 256.8125, "completions/mean_terminated_length": 256.8125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.44845743477344513, "epoch": 0.1875405280222325, "frac_reward_zero_std": 0.0, "grad_norm": 0.10149309039115906, "kl": 0.005505937500856817, "learning_rate": 9.625011579434924e-07, "loss": 0.3396, "num_tokens": 111381260.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 4049, "step_time": 51.83715748041868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 189.125, "completions/mean_terminated_length": 189.125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.41343196481466293, "epoch": 0.18758684576192683, "frac_reward_zero_std": 1.0, "grad_norm": 0.012865274213254452, "kl": 0.00894459243863821, "learning_rate": 9.624918943955535e-07, "loss": 0.0004, "num_tokens": 111411230.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4050, "step_time": 23.866840578615665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 185.4375, "completions/mean_terminated_length": 185.4375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.4249361529946327, "epoch": 0.18763316350162113, "frac_reward_zero_std": 0.0, "grad_norm": 0.13789379596710205, "kl": 0.005413910665083677, "learning_rate": 9.624826308476146e-07, "loss": -0.0281, "num_tokens": 111432597.0, "reward": 0.5, "reward_std": 0.5163977742195129, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5163977742195129, "step": 4051, "step_time": 19.753078617155552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 226.1875, "completions/mean_terminated_length": 226.1875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.3317742869257927, "epoch": 0.18767948124131542, "frac_reward_zero_std": 0.0, "grad_norm": 0.09370024502277374, "kl": 0.018835734808817506, "learning_rate": 9.624733672996758e-07, "loss": -0.1381, "num_tokens": 111469528.0, "reward": 0.18570634722709656, "reward_std": 0.2210385501384735, "rewards/reward_func/mean": 0.18570634722709656, "rewards/reward_func/std": 0.2210385650396347, "step": 4052, "step_time": 29.399143770337105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 217.0, "completions/mean_terminated_length": 217.0, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.26461541652679443, "epoch": 0.18772579898100972, "frac_reward_zero_std": 0.0, "grad_norm": 0.07868076115846634, "kl": 0.010406596818938851, "learning_rate": 9.624641037517369e-07, "loss": 0.0148, "num_tokens": 111495064.0, "reward": 0.5141167640686035, "reward_std": 0.21188393235206604, "rewards/reward_func/mean": 0.5141167640686035, "rewards/reward_func/std": 0.21188391745090485, "step": 4053, "step_time": 21.481164783239365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 104.8125, "completions/mean_terminated_length": 104.8125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.32731233537197113, "epoch": 0.18777211672070404, "frac_reward_zero_std": 1.0, "grad_norm": 0.005049731582403183, "kl": 0.0022659313399344683, "learning_rate": 9.62454840203798e-07, "loss": 0.0001, "num_tokens": 111518053.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4054, "step_time": 13.79790012165904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 129.8125, "completions/mean_terminated_length": 129.8125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3062795028090477, "epoch": 0.18781843446039834, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026553329080343246, "kl": 0.0020594959205482155, "learning_rate": 9.624455766558591e-07, "loss": 0.0001, "num_tokens": 111543938.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4055, "step_time": 16.338610626757145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 158.5, "completions/mean_terminated_length": 158.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.32172737270593643, "epoch": 0.18786475220009263, "frac_reward_zero_std": 1.0, "grad_norm": 0.005706567317247391, "kl": 0.003369259589817375, "learning_rate": 9.624363131079203e-07, "loss": 0.0002, "num_tokens": 111564506.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4056, "step_time": 16.800219353288412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 231.625, "completions/mean_terminated_length": 231.625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.2548220008611679, "epoch": 0.18791106993978693, "frac_reward_zero_std": 0.0, "grad_norm": 0.09054199606180191, "kl": 0.016585303004831076, "learning_rate": 9.624270495599814e-07, "loss": -0.0459, "num_tokens": 111590836.0, "reward": 0.3605729341506958, "reward_std": 0.25982192158699036, "rewards/reward_func/mean": 0.3605729341506958, "rewards/reward_func/std": 0.25982192158699036, "step": 4057, "step_time": 25.715322334319353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 164.625, "completions/mean_terminated_length": 164.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.17664026096463203, "epoch": 0.18795738767948125, "frac_reward_zero_std": 1.0, "grad_norm": 0.0039227064698934555, "kl": 0.0031869065715000033, "learning_rate": 9.624177860120425e-07, "loss": 0.0002, "num_tokens": 111617678.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4058, "step_time": 18.212814670056105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 181.5, "completions/mean_terminated_length": 181.5, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.37588997185230255, "epoch": 0.18800370541917555, "frac_reward_zero_std": 0.0, "grad_norm": 0.11496306955814362, "kl": 0.006013693870045245, "learning_rate": 9.624085224641038e-07, "loss": 0.0163, "num_tokens": 111665462.0, "reward": 0.5, "reward_std": 0.5163977742195129, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5163977742195129, "step": 4059, "step_time": 25.726170733571053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 125.8125, "completions/mean_terminated_length": 125.8125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.31233925372362137, "epoch": 0.18805002315886984, "frac_reward_zero_std": 1.0, "grad_norm": 0.002167275408282876, "kl": 0.001902395742945373, "learning_rate": 9.623992589161648e-07, "loss": 0.0001, "num_tokens": 111685715.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4060, "step_time": 13.839465118944645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 189.8125, "completions/mean_terminated_length": 189.8125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.22551383078098297, "epoch": 0.18809634089856414, "frac_reward_zero_std": 1.0, "grad_norm": 0.001395753468386829, "kl": 0.0014026453718543053, "learning_rate": 9.623899953682259e-07, "loss": 0.0001, "num_tokens": 111740208.0, "reward": 0.6803749203681946, "reward_std": 0.0, "rewards/reward_func/mean": 0.6803749203681946, "rewards/reward_func/std": 0.0, "step": 4061, "step_time": 30.20871962234378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 233.1875, "completions/mean_terminated_length": 233.1875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.22690307348966599, "epoch": 0.18814265863825846, "frac_reward_zero_std": 0.0, "grad_norm": 0.1081831157207489, "kl": 0.012482823571190238, "learning_rate": 9.623807318202872e-07, "loss": -0.0546, "num_tokens": 111764051.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 4062, "step_time": 23.088761750608683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 197.4375, "completions/mean_terminated_length": 197.4375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.4009406939148903, "epoch": 0.18818897637795276, "frac_reward_zero_std": 1.0, "grad_norm": 0.004363492596894503, "kl": 0.003706438699737191, "learning_rate": 9.623714682723483e-07, "loss": 0.0002, "num_tokens": 111795946.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4063, "step_time": 21.760704543441534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 196.0625, "completions/mean_terminated_length": 196.0625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.20653334259986877, "epoch": 0.18823529411764706, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037432056851685047, "kl": 0.0021062337327748537, "learning_rate": 9.623622047244095e-07, "loss": 0.0001, "num_tokens": 111834331.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4064, "step_time": 22.563964564353228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 178.8125, "completions/mean_terminated_length": 178.8125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.4411425143480301, "epoch": 0.18828161185734135, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034896032884716988, "kl": 0.003495117765851319, "learning_rate": 9.623529411764706e-07, "loss": 0.0002, "num_tokens": 111881560.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4065, "step_time": 24.425624758005142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 195.5625, "completions/mean_terminated_length": 195.5625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.3708682432770729, "epoch": 0.18832792959703568, "frac_reward_zero_std": 1.0, "grad_norm": 0.008655146695673466, "kl": 0.008445442072115839, "learning_rate": 9.623436776285317e-07, "loss": 0.0004, "num_tokens": 111907985.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4066, "step_time": 21.677610144019127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 206.0, "completions/mean_terminated_length": 206.0, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.21363132447004318, "epoch": 0.18837424733672997, "frac_reward_zero_std": 0.0, "grad_norm": 0.0956958532333374, "kl": 0.061965879052877426, "learning_rate": 9.623344140805928e-07, "loss": -0.0308, "num_tokens": 111929761.0, "reward": 0.9857840538024902, "reward_std": 0.056863654404878616, "rewards/reward_func/mean": 0.9857840538024902, "rewards/reward_func/std": 0.056863654404878616, "step": 4067, "step_time": 19.852060932666063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 127.375, "completions/mean_terminated_length": 127.375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.25247569754719734, "epoch": 0.18842056507642427, "frac_reward_zero_std": 1.0, "grad_norm": 0.01273462362587452, "kl": 0.004691361915320158, "learning_rate": 9.62325150532654e-07, "loss": 0.0002, "num_tokens": 111949223.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4068, "step_time": 13.748459201306105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.22371292486786842, "epoch": 0.18846688281611856, "frac_reward_zero_std": 0.0, "grad_norm": 0.21906466782093048, "kl": 0.017677327734418213, "learning_rate": 9.62315886984715e-07, "loss": 0.0066, "num_tokens": 111969607.0, "reward": 0.9081355333328247, "reward_std": 0.2510214149951935, "rewards/reward_func/mean": 0.9081355333328247, "rewards/reward_func/std": 0.2510214149951935, "step": 4069, "step_time": 16.426375936716795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 132.25, "completions/mean_terminated_length": 132.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2735486924648285, "epoch": 0.1885132005558129, "frac_reward_zero_std": 1.0, "grad_norm": 0.004998963326215744, "kl": 0.0028393929824233055, "learning_rate": 9.623066234367762e-07, "loss": 0.0001, "num_tokens": 111989355.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4070, "step_time": 14.137649320065975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 258.4375, "completions/mean_terminated_length": 258.4375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.4584140181541443, "epoch": 0.18855951829550718, "frac_reward_zero_std": 0.0, "grad_norm": 0.09336674213409424, "kl": 0.0033523670863360167, "learning_rate": 9.622973598888373e-07, "loss": 0.0238, "num_tokens": 112017762.0, "reward": 0.4375, "reward_std": 0.5123475193977356, "rewards/reward_func/mean": 0.4375, "rewards/reward_func/std": 0.5123475790023804, "step": 4071, "step_time": 26.69462824985385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 126.0625, "completions/mean_terminated_length": 126.0625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.28217051923274994, "epoch": 0.18860583603520148, "frac_reward_zero_std": 1.0, "grad_norm": 0.008498135954141617, "kl": 0.0033046818571165204, "learning_rate": 9.622880963408985e-07, "loss": 0.0002, "num_tokens": 112037571.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4072, "step_time": 13.568863987922668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 187.3125, "completions/mean_terminated_length": 187.3125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4118848964571953, "epoch": 0.18865215377489578, "frac_reward_zero_std": 0.0, "grad_norm": 0.15025535225868225, "kl": 0.007421009591780603, "learning_rate": 9.622788327929596e-07, "loss": -0.0591, "num_tokens": 112059032.0, "reward": 0.25, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.44721361994743347, "step": 4073, "step_time": 19.93145489320159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 212.9375, "completions/mean_terminated_length": 212.9375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.2828494608402252, "epoch": 0.1886984715145901, "frac_reward_zero_std": 0.0, "grad_norm": 0.09432634711265564, "kl": 0.012376365251839161, "learning_rate": 9.622695692450207e-07, "loss": 0.0348, "num_tokens": 112088695.0, "reward": 0.09933918714523315, "reward_std": 0.026740100234746933, "rewards/reward_func/mean": 0.09933918714523315, "rewards/reward_func/std": 0.026740102097392082, "step": 4074, "step_time": 26.48119631409645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 178.4375, "completions/mean_terminated_length": 178.4375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.21490349248051643, "epoch": 0.1887447892542844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017751294653862715, "kl": 0.004497880348935723, "learning_rate": 9.622603056970818e-07, "loss": 0.0002, "num_tokens": 112124990.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4075, "step_time": 22.12447264790535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 138.0, "completions/mean_terminated_length": 138.0, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3145938962697983, "epoch": 0.1887911069939787, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038075167685747147, "kl": 0.002874632424209267, "learning_rate": 9.622510421491432e-07, "loss": 0.0001, "num_tokens": 112149118.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4076, "step_time": 16.077032446861267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 204.0625, "completions/mean_terminated_length": 204.0625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.2810898497700691, "epoch": 0.188837424733673, "frac_reward_zero_std": 0.0, "grad_norm": 0.13672883808612823, "kl": 0.007074593333527446, "learning_rate": 9.622417786012043e-07, "loss": -0.0298, "num_tokens": 112173551.0, "reward": 0.4939180016517639, "reward_std": 0.13089695572853088, "rewards/reward_func/mean": 0.4939180016517639, "rewards/reward_func/std": 0.13089697062969208, "step": 4077, "step_time": 21.388612024486065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 126.5, "completions/mean_terminated_length": 126.5, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.23593541234731674, "epoch": 0.1888837424733673, "frac_reward_zero_std": 1.0, "grad_norm": 0.003772187978029251, "kl": 0.0024787528382148594, "learning_rate": 9.622325150532652e-07, "loss": 0.0001, "num_tokens": 112195031.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4078, "step_time": 14.221291285008192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 124.875, "completions/mean_terminated_length": 124.875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3156389445066452, "epoch": 0.1889300602130616, "frac_reward_zero_std": 1.0, "grad_norm": 0.0039777373895049095, "kl": 0.0022487479145638645, "learning_rate": 9.622232515053266e-07, "loss": 0.0001, "num_tokens": 112219701.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4079, "step_time": 14.497664973139763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 140.4375, "completions/mean_terminated_length": 140.4375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3001774325966835, "epoch": 0.1889763779527559, "frac_reward_zero_std": 1.0, "grad_norm": 0.003297751070931554, "kl": 0.002578421903308481, "learning_rate": 9.622139879573877e-07, "loss": 0.0001, "num_tokens": 112239852.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4080, "step_time": 14.37793630734086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.41828029602766037, "epoch": 0.1890226956924502, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026004703249782324, "kl": 0.0020941461552865803, "learning_rate": 9.622047244094488e-07, "loss": 0.0001, "num_tokens": 112283508.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4081, "step_time": 20.340359319001436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 127.75, "completions/mean_terminated_length": 127.75, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.27900033444166183, "epoch": 0.18906901343214452, "frac_reward_zero_std": 1.0, "grad_norm": 0.013750758953392506, "kl": 0.003349784354213625, "learning_rate": 9.6219546086151e-07, "loss": 0.0002, "num_tokens": 112303728.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4082, "step_time": 13.775241889059544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 132.0625, "completions/mean_terminated_length": 132.0625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.31625083088874817, "epoch": 0.18911533117183882, "frac_reward_zero_std": 1.0, "grad_norm": 0.003438417799770832, "kl": 0.0021677478798665106, "learning_rate": 9.62186197313571e-07, "loss": 0.0001, "num_tokens": 112325777.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4083, "step_time": 14.63520834967494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 145.5, "completions/mean_terminated_length": 145.5, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.322076752781868, "epoch": 0.18916164891153311, "frac_reward_zero_std": 1.0, "grad_norm": 0.0041464813984930515, "kl": 0.0029935682541690767, "learning_rate": 9.621769337656322e-07, "loss": 0.0001, "num_tokens": 112347497.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4084, "step_time": 16.739662885665894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 159.125, "completions/mean_terminated_length": 159.125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.39617153257131577, "epoch": 0.1892079666512274, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022639597300440073, "kl": 0.0021434122463688254, "learning_rate": 9.621676702176933e-07, "loss": 0.0001, "num_tokens": 112378731.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4085, "step_time": 20.17177975550294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 154.6875, "completions/mean_terminated_length": 154.6875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.397089883685112, "epoch": 0.18925428439092173, "frac_reward_zero_std": 1.0, "grad_norm": 0.002172194654121995, "kl": 0.0022505151864606887, "learning_rate": 9.621584066697544e-07, "loss": 0.0001, "num_tokens": 112425814.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4086, "step_time": 23.75847203284502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.23387274146080017, "epoch": 0.18930060213061603, "frac_reward_zero_std": 1.0, "grad_norm": 0.002618056023493409, "kl": 0.001825862971600145, "learning_rate": 9.621491431218156e-07, "loss": 0.0001, "num_tokens": 112455686.0, "reward": 0.9487294554710388, "reward_std": 0.0, "rewards/reward_func/mean": 0.9487294554710388, "rewards/reward_func/std": 0.0, "step": 4087, "step_time": 19.35735733062029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 226.5625, "completions/mean_terminated_length": 226.5625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.4100160673260689, "epoch": 0.18934691987031033, "frac_reward_zero_std": 0.0, "grad_norm": 0.01020955853164196, "kl": 0.010344581445679069, "learning_rate": 9.621398795738767e-07, "loss": 0.0004, "num_tokens": 112483375.0, "reward": 3.279789240195896e-08, "reward_std": 1.1687691170436665e-07, "rewards/reward_func/mean": 3.279789240195896e-08, "rewards/reward_func/std": 1.16876918809794e-07, "step": 4088, "step_time": 28.477491047233343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 220.0, "completions/mean_terminated_length": 220.0, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.1213931031525135, "epoch": 0.18939323761000462, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018979375017806888, "kl": 0.0014054529601708055, "learning_rate": 9.62130616025938e-07, "loss": 0.0001, "num_tokens": 112508159.0, "reward": 0.9534969329833984, "reward_std": 0.0, "rewards/reward_func/mean": 0.9534969329833984, "rewards/reward_func/std": 0.0, "step": 4089, "step_time": 21.313947524875402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 128.6875, "completions/mean_terminated_length": 128.6875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2810846120119095, "epoch": 0.18943955534969895, "frac_reward_zero_std": 1.0, "grad_norm": 0.004309963900595903, "kl": 0.0022224989079404622, "learning_rate": 9.621213524779991e-07, "loss": 0.0001, "num_tokens": 112527930.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4090, "step_time": 15.578890204429626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 142.8125, "completions/mean_terminated_length": 142.8125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.28713301569223404, "epoch": 0.18948587308939324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034399654250591993, "kl": 0.0029686609632335603, "learning_rate": 9.6211208893006e-07, "loss": 0.0001, "num_tokens": 112552935.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4091, "step_time": 17.679511532187462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 193.125, "completions/mean_terminated_length": 193.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.40622394531965256, "epoch": 0.18953219082908754, "frac_reward_zero_std": 1.0, "grad_norm": 0.006183540914207697, "kl": 0.0047001210623420775, "learning_rate": 9.621028253821214e-07, "loss": 0.0002, "num_tokens": 112598009.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4092, "step_time": 28.116632137447596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 127.5, "completions/mean_terminated_length": 127.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.28625255078077316, "epoch": 0.18957850856878183, "frac_reward_zero_std": 1.0, "grad_norm": 0.005459882784634829, "kl": 0.003465807647444308, "learning_rate": 9.620935618341825e-07, "loss": 0.0002, "num_tokens": 112627521.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4093, "step_time": 16.235010791569948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 282.8125, "completions/mean_terminated_length": 282.8125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "entropy": 0.17093633115291595, "epoch": 0.18962482630847616, "frac_reward_zero_std": 0.0, "grad_norm": 0.08792279660701752, "kl": 0.016549956053495407, "learning_rate": 9.620842982862436e-07, "loss": 0.0133, "num_tokens": 112667758.0, "reward": 0.995914101600647, "reward_std": 0.016343481838703156, "rewards/reward_func/mean": 0.995914101600647, "rewards/reward_func/std": 0.016343489289283752, "step": 4094, "step_time": 31.010224632918835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 124.8125, "completions/mean_terminated_length": 124.8125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.29375743120908737, "epoch": 0.18967114404817045, "frac_reward_zero_std": 1.0, "grad_norm": 0.002086227759718895, "kl": 0.0017451457388233393, "learning_rate": 9.620750347383048e-07, "loss": 0.0001, "num_tokens": 112689755.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4095, "step_time": 14.13269180059433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 139.5, "completions/mean_terminated_length": 139.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3268865644931793, "epoch": 0.18971746178786475, "frac_reward_zero_std": 1.0, "grad_norm": 0.006312190555036068, "kl": 0.0041411496931687, "learning_rate": 9.620657711903659e-07, "loss": 0.0002, "num_tokens": 112710003.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4096, "step_time": 15.974027272313833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 233.5, "completions/mean_terminated_length": 233.5, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.2679096534848213, "epoch": 0.18976377952755905, "frac_reward_zero_std": 0.0, "grad_norm": 0.06756766885519028, "kl": 0.015096401330083609, "learning_rate": 9.62056507642427e-07, "loss": -0.0137, "num_tokens": 112732075.0, "reward": 0.7114852666854858, "reward_std": 0.18972940742969513, "rewards/reward_func/mean": 0.7114852666854858, "rewards/reward_func/std": 0.18972940742969513, "step": 4097, "step_time": 24.38409310951829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 161.1875, "completions/mean_terminated_length": 161.1875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.175911545753479, "epoch": 0.18981009726725337, "frac_reward_zero_std": 0.0, "grad_norm": 0.13000069558620453, "kl": 0.005401259404607117, "learning_rate": 9.620472440944881e-07, "loss": -0.0191, "num_tokens": 112753278.0, "reward": 0.848545253276825, "reward_std": 0.05034581571817398, "rewards/reward_func/mean": 0.848545253276825, "rewards/reward_func/std": 0.05034581571817398, "step": 4098, "step_time": 17.01097398623824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.42449715733528137, "epoch": 0.18985641500694767, "frac_reward_zero_std": 1.0, "grad_norm": 0.013729634694755077, "kl": 0.004774175118654966, "learning_rate": 9.620379805465493e-07, "loss": 0.0002, "num_tokens": 112787798.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4099, "step_time": 20.552163925021887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 142.4375, "completions/mean_terminated_length": 142.4375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.32483430951833725, "epoch": 0.18990273274664196, "frac_reward_zero_std": 1.0, "grad_norm": 0.006355844903737307, "kl": 0.0042109721107408404, "learning_rate": 9.620287169986104e-07, "loss": 0.0002, "num_tokens": 112815293.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4100, "step_time": 17.171034947037697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 168.0, "completions/mean_terminated_length": 168.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.2035413607954979, "epoch": 0.18994905048633626, "frac_reward_zero_std": 0.0, "grad_norm": 0.12430978566408157, "kl": 0.014210526598617435, "learning_rate": 9.620194534506715e-07, "loss": -0.018, "num_tokens": 112836461.0, "reward": 0.826245903968811, "reward_std": 0.12673979997634888, "rewards/reward_func/mean": 0.826245903968811, "rewards/reward_func/std": 0.12673981487751007, "step": 4101, "step_time": 18.095979381352663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 194.75, "completions/mean_terminated_length": 194.75, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.14771785028278828, "epoch": 0.18999536822603058, "frac_reward_zero_std": 1.0, "grad_norm": 0.008715835399925709, "kl": 0.0030156224966049194, "learning_rate": 9.620101899027328e-07, "loss": 0.0002, "num_tokens": 112861641.0, "reward": 0.6170787811279297, "reward_std": 0.0, "rewards/reward_func/mean": 0.6170787811279297, "rewards/reward_func/std": 0.0, "step": 4102, "step_time": 19.890064790844917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 156.5625, "completions/mean_terminated_length": 156.5625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.26230965554714203, "epoch": 0.19004168596572488, "frac_reward_zero_std": 1.0, "grad_norm": 0.002729707397520542, "kl": 0.00267799012362957, "learning_rate": 9.620009263547938e-07, "loss": 0.0001, "num_tokens": 112883586.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4103, "step_time": 16.894199144095182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 135.3125, "completions/mean_terminated_length": 135.3125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2919178009033203, "epoch": 0.19008800370541917, "frac_reward_zero_std": 1.0, "grad_norm": 0.006839972920715809, "kl": 0.0037067380035296082, "learning_rate": 9.619916628068549e-07, "loss": 0.0002, "num_tokens": 112903751.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4104, "step_time": 15.149609547108412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 205.625, "completions/mean_terminated_length": 205.625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.22336683422327042, "epoch": 0.19013432144511347, "frac_reward_zero_std": 0.0, "grad_norm": 0.12809725105762482, "kl": 0.015582123305648565, "learning_rate": 9.61982399258916e-07, "loss": 0.0231, "num_tokens": 112930417.0, "reward": 0.990056037902832, "reward_std": 0.039775896817445755, "rewards/reward_func/mean": 0.990056037902832, "rewards/reward_func/std": 0.03977589309215546, "step": 4105, "step_time": 22.927800353616476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 222.6875, "completions/mean_terminated_length": 222.6875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.40010541677474976, "epoch": 0.1901806391848078, "frac_reward_zero_std": 0.0, "grad_norm": 0.12244771420955658, "kl": 0.008408163907006383, "learning_rate": 9.619731357109773e-07, "loss": -0.0514, "num_tokens": 112958156.0, "reward": 0.35345709323883057, "reward_std": 0.471556156873703, "rewards/reward_func/mean": 0.35345709323883057, "rewards/reward_func/std": 0.4715561866760254, "step": 4106, "step_time": 23.678833052515984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 153.0625, "completions/mean_terminated_length": 153.0625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3967430666089058, "epoch": 0.1902269569245021, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036235363222658634, "kl": 0.003490384726319462, "learning_rate": 9.619638721630385e-07, "loss": 0.0002, "num_tokens": 112999309.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4107, "step_time": 21.72936211153865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 147.1875, "completions/mean_terminated_length": 147.1875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3907182961702347, "epoch": 0.19027327466419638, "frac_reward_zero_std": 1.0, "grad_norm": 0.001981359673663974, "kl": 0.0022990216966718435, "learning_rate": 9.619546086150996e-07, "loss": 0.0001, "num_tokens": 113033424.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4108, "step_time": 18.73133908584714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 135.5625, "completions/mean_terminated_length": 135.5625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.30377212166786194, "epoch": 0.19031959240389068, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027422490529716015, "kl": 0.0021515804110094905, "learning_rate": 9.619453450671607e-07, "loss": 0.0001, "num_tokens": 113055865.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4109, "step_time": 15.47657148167491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 168.875, "completions/mean_terminated_length": 168.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.34115321189165115, "epoch": 0.190365910143585, "frac_reward_zero_std": 1.0, "grad_norm": 0.005145085975527763, "kl": 0.00553589453920722, "learning_rate": 9.619360815192218e-07, "loss": 0.0003, "num_tokens": 113076935.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4110, "step_time": 18.9564984254539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 175.75, "completions/mean_terminated_length": 175.75, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.2507578805088997, "epoch": 0.1904122278832793, "frac_reward_zero_std": 0.0, "grad_norm": 0.12401070445775986, "kl": 0.007570909336209297, "learning_rate": 9.61926817971283e-07, "loss": 0.0585, "num_tokens": 113100355.0, "reward": 0.9143877029418945, "reward_std": 0.26070141792297363, "rewards/reward_func/mean": 0.9143877029418945, "rewards/reward_func/std": 0.26070141792297363, "step": 4111, "step_time": 20.16663908213377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 211.5625, "completions/mean_terminated_length": 211.5625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.3748941421508789, "epoch": 0.1904585456229736, "frac_reward_zero_std": 0.0, "grad_norm": 0.10845178365707397, "kl": 0.005902686039917171, "learning_rate": 9.61917554423344e-07, "loss": 0.2146, "num_tokens": 113135740.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 4112, "step_time": 30.707610316574574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 126.5625, "completions/mean_terminated_length": 126.5625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.3122868686914444, "epoch": 0.1905048633626679, "frac_reward_zero_std": 1.0, "grad_norm": 0.0039450665935873985, "kl": 0.0023736665316391736, "learning_rate": 9.619082908754052e-07, "loss": 0.0001, "num_tokens": 113155733.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4113, "step_time": 14.587149430066347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 148.25, "completions/mean_terminated_length": 148.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.39277520775794983, "epoch": 0.19055118110236222, "frac_reward_zero_std": 1.0, "grad_norm": 0.009942091070115566, "kl": 0.005806333385407925, "learning_rate": 9.618990273274663e-07, "loss": 0.0003, "num_tokens": 113178409.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4114, "step_time": 16.174455918371677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 164.0, "completions/mean_terminated_length": 164.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.4367034435272217, "epoch": 0.1905974988420565, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024834980722516775, "kl": 0.0026712362887337804, "learning_rate": 9.618897637795275e-07, "loss": 0.0001, "num_tokens": 113229433.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4115, "step_time": 24.28413689136505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 136.5, "completions/mean_terminated_length": 136.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.258312463760376, "epoch": 0.1906438165817508, "frac_reward_zero_std": 1.0, "grad_norm": 0.010435893200337887, "kl": 0.0038344977074302733, "learning_rate": 9.618805002315886e-07, "loss": 0.0002, "num_tokens": 113249169.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4116, "step_time": 14.605042569339275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 188.3125, "completions/mean_terminated_length": 188.3125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.42330311983823776, "epoch": 0.1906901343214451, "frac_reward_zero_std": 1.0, "grad_norm": 0.005170047283172607, "kl": 0.0034668792504817247, "learning_rate": 9.618712366836497e-07, "loss": 0.0002, "num_tokens": 113272758.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4117, "step_time": 19.21664920821786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 191.5, "completions/mean_terminated_length": 191.5, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.3497743457555771, "epoch": 0.19073645206113943, "frac_reward_zero_std": 0.0, "grad_norm": 0.1662413775920868, "kl": 0.007731353281997144, "learning_rate": 9.618619731357108e-07, "loss": -0.0099, "num_tokens": 113302878.0, "reward": 0.08951783180236816, "reward_std": 0.26566338539123535, "rewards/reward_func/mean": 0.08951783180236816, "rewards/reward_func/std": 0.26566338539123535, "step": 4118, "step_time": 21.004832059144974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 192.9375, "completions/mean_terminated_length": 192.9375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.17815063148736954, "epoch": 0.19078276980083372, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033755472395569086, "kl": 0.00312114623375237, "learning_rate": 9.618527095877722e-07, "loss": 0.0002, "num_tokens": 113333661.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4119, "step_time": 21.385315846651793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 142.875, "completions/mean_terminated_length": 142.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.2912794053554535, "epoch": 0.19082908754052802, "frac_reward_zero_std": 1.0, "grad_norm": 0.002820141613483429, "kl": 0.002084189676679671, "learning_rate": 9.618434460398333e-07, "loss": 0.0001, "num_tokens": 113354523.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4120, "step_time": 15.178599156439304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 193.125, "completions/mean_terminated_length": 193.125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.2421119175851345, "epoch": 0.19087540528022232, "frac_reward_zero_std": 0.0, "grad_norm": 0.09715329110622406, "kl": 0.005439578555524349, "learning_rate": 9.618341824918942e-07, "loss": -0.0392, "num_tokens": 113376237.0, "reward": 0.9850083589553833, "reward_std": 0.03223112225532532, "rewards/reward_func/mean": 0.9850083589553833, "rewards/reward_func/std": 0.03223112225532532, "step": 4121, "step_time": 20.041270956397057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 132.6875, "completions/mean_terminated_length": 132.6875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.272799015045166, "epoch": 0.19092172301991664, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033663539215922356, "kl": 0.0022227732406463474, "learning_rate": 9.618249189439556e-07, "loss": 0.0001, "num_tokens": 113396184.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4122, "step_time": 14.523907784372568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 167.5, "completions/mean_terminated_length": 167.5, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.3335861638188362, "epoch": 0.19096804075961094, "frac_reward_zero_std": 1.0, "grad_norm": 0.0039022567216306925, "kl": 0.0028716546948999166, "learning_rate": 9.618156553960167e-07, "loss": 0.0001, "num_tokens": 113421728.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4123, "step_time": 17.749713256955147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 132.8125, "completions/mean_terminated_length": 132.8125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3208964839577675, "epoch": 0.19101435849930523, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031338210683315992, "kl": 0.0026089075254276395, "learning_rate": 9.618063918480778e-07, "loss": 0.0001, "num_tokens": 113445645.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4124, "step_time": 15.515925850719213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 198.0625, "completions/mean_terminated_length": 198.0625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.19702854380011559, "epoch": 0.19106067623899953, "frac_reward_zero_std": 1.0, "grad_norm": 0.011642944067716599, "kl": 0.006318412139080465, "learning_rate": 9.61797128300139e-07, "loss": 0.0003, "num_tokens": 113470190.0, "reward": 0.8539396524429321, "reward_std": 0.0, "rewards/reward_func/mean": 0.8539396524429321, "rewards/reward_func/std": 0.0, "step": 4125, "step_time": 19.687298599630594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 132.625, "completions/mean_terminated_length": 132.625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.28254781663417816, "epoch": 0.19110699397869385, "frac_reward_zero_std": 1.0, "grad_norm": 0.002491622930392623, "kl": 0.0019359943107701838, "learning_rate": 9.617878647522e-07, "loss": 0.0001, "num_tokens": 113502120.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4126, "step_time": 16.2068096883595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 149.0625, "completions/mean_terminated_length": 149.0625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.287677139043808, "epoch": 0.19115331171838815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0059418524615466595, "kl": 0.00384613499045372, "learning_rate": 9.617786012042612e-07, "loss": 0.0002, "num_tokens": 113523689.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4127, "step_time": 16.24800293147564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 131.125, "completions/mean_terminated_length": 131.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.22621575370430946, "epoch": 0.19119962945808244, "frac_reward_zero_std": 1.0, "grad_norm": 0.00238971458747983, "kl": 0.0015752276813145727, "learning_rate": 9.617693376563223e-07, "loss": 0.0001, "num_tokens": 113543243.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4128, "step_time": 13.781833782792091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.2259707935154438, "epoch": 0.19124594719777674, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018358264351263642, "kl": 0.0019115104514639825, "learning_rate": 9.617600741083834e-07, "loss": 0.0001, "num_tokens": 113579783.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4129, "step_time": 21.70344466343522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 130.875, "completions/mean_terminated_length": 130.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.31674399226903915, "epoch": 0.19129226493747106, "frac_reward_zero_std": 1.0, "grad_norm": 0.003532495116814971, "kl": 0.0019234635983593762, "learning_rate": 9.617508105604446e-07, "loss": 0.0001, "num_tokens": 113607029.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4130, "step_time": 16.637072067707777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 232.3125, "completions/mean_terminated_length": 232.3125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.4538995549082756, "epoch": 0.19133858267716536, "frac_reward_zero_std": 0.0, "grad_norm": 0.09977710992097855, "kl": 0.003325658617541194, "learning_rate": 9.617415470125057e-07, "loss": 0.0237, "num_tokens": 113629210.0, "reward": 0.5625, "reward_std": 0.5123475193977356, "rewards/reward_func/mean": 0.5625, "rewards/reward_func/std": 0.5123475790023804, "step": 4131, "step_time": 22.828448496758938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 122.4375, "completions/mean_terminated_length": 122.4375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2979244291782379, "epoch": 0.19138490041685965, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024236678145825863, "kl": 0.0020474987395573407, "learning_rate": 9.61732283464567e-07, "loss": 0.0001, "num_tokens": 113657153.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4132, "step_time": 14.929260857403278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 145.9375, "completions/mean_terminated_length": 145.9375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.27596915513277054, "epoch": 0.19143121815655395, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016421765321865678, "kl": 0.0016378310974687338, "learning_rate": 9.617230199166281e-07, "loss": 0.0001, "num_tokens": 113681984.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4133, "step_time": 16.583049949258566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 204.625, "completions/mean_terminated_length": 204.625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.2172876000404358, "epoch": 0.19147753589624827, "frac_reward_zero_std": 1.0, "grad_norm": 0.010466446168720722, "kl": 0.009740041103214025, "learning_rate": 9.61713756368689e-07, "loss": 0.0005, "num_tokens": 113705290.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4134, "step_time": 21.961980622261763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 180.0625, "completions/mean_terminated_length": 180.0625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.2684297263622284, "epoch": 0.19152385363594257, "frac_reward_zero_std": 0.0, "grad_norm": 0.15504348278045654, "kl": 0.008947949158027768, "learning_rate": 9.617044928207502e-07, "loss": -0.0027, "num_tokens": 113727163.0, "reward": 0.9928268790245056, "reward_std": 0.01960066333413124, "rewards/reward_func/mean": 0.9928268790245056, "rewards/reward_func/std": 0.019600657746195793, "step": 4135, "step_time": 17.34838716313243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 140.6875, "completions/mean_terminated_length": 140.6875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.20543736964464188, "epoch": 0.19157017137563687, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036165397614240646, "kl": 0.0022800464066676795, "learning_rate": 9.616952292728115e-07, "loss": 0.0001, "num_tokens": 113747798.0, "reward": 0.054531343281269073, "reward_std": 0.0, "rewards/reward_func/mean": 0.054531343281269073, "rewards/reward_func/std": 0.0, "step": 4136, "step_time": 14.98941059038043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 176.0625, "completions/mean_terminated_length": 176.0625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.25562866404652596, "epoch": 0.19161648911533116, "frac_reward_zero_std": 0.0, "grad_norm": 0.11315275728702545, "kl": 0.009432476945221424, "learning_rate": 9.616859657248726e-07, "loss": 0.0674, "num_tokens": 113768807.0, "reward": 0.9258368611335754, "reward_std": 0.2512396574020386, "rewards/reward_func/mean": 0.9258368611335754, "rewards/reward_func/std": 0.25123968720436096, "step": 4137, "step_time": 19.531320482492447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 164.8125, "completions/mean_terminated_length": 164.8125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3511381074786186, "epoch": 0.19166280685502549, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030502998270094395, "kl": 0.0023590116179548204, "learning_rate": 9.616767021769338e-07, "loss": 0.0001, "num_tokens": 113795892.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4138, "step_time": 17.74069155752659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 198.625, "completions/mean_terminated_length": 198.625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.48868875950574875, "epoch": 0.19170912459471978, "frac_reward_zero_std": 1.0, "grad_norm": 0.00716400658711791, "kl": 0.0056730881333351135, "learning_rate": 9.616674386289949e-07, "loss": 0.0003, "num_tokens": 113822110.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4139, "step_time": 22.50037330761552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 137.5625, "completions/mean_terminated_length": 137.5625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2536194510757923, "epoch": 0.19175544233441408, "frac_reward_zero_std": 1.0, "grad_norm": 0.00179067172575742, "kl": 0.001369092016830109, "learning_rate": 9.61658175081056e-07, "loss": 0.0001, "num_tokens": 113841927.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4140, "step_time": 14.555555552244186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 212.25, "completions/mean_terminated_length": 212.25, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.4374123215675354, "epoch": 0.19180176007410837, "frac_reward_zero_std": 0.0, "grad_norm": 0.14294247329235077, "kl": 0.0055836348328739405, "learning_rate": 9.616489115331171e-07, "loss": 0.3803, "num_tokens": 113870667.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 4141, "step_time": 42.89665176346898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 158.625, "completions/mean_terminated_length": 158.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.1771853119134903, "epoch": 0.1918480778138027, "frac_reward_zero_std": 0.0, "grad_norm": 0.2171192318201065, "kl": 0.03439937601797283, "learning_rate": 9.616396479851783e-07, "loss": -0.0669, "num_tokens": 113894549.0, "reward": 0.7944756746292114, "reward_std": 0.21226465702056885, "rewards/reward_func/mean": 0.7944756746292114, "rewards/reward_func/std": 0.21226465702056885, "step": 4142, "step_time": 16.87919330596924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 145.0625, "completions/mean_terminated_length": 145.0625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3588384613394737, "epoch": 0.191894395553497, "frac_reward_zero_std": 1.0, "grad_norm": 0.002160383854061365, "kl": 0.0017672441026661545, "learning_rate": 9.616303844372394e-07, "loss": 0.0001, "num_tokens": 113925398.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4143, "step_time": 18.057990729808807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 168.1875, "completions/mean_terminated_length": 168.1875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3224322274327278, "epoch": 0.1919407132931913, "frac_reward_zero_std": 1.0, "grad_norm": 0.006190591957420111, "kl": 0.003072601044550538, "learning_rate": 9.616211208893005e-07, "loss": 0.0002, "num_tokens": 113954201.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4144, "step_time": 18.933008283376694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 177.6875, "completions/mean_terminated_length": 177.6875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.38990194350481033, "epoch": 0.19198703103288559, "frac_reward_zero_std": 1.0, "grad_norm": 0.00652702059596777, "kl": 0.0049262718530371785, "learning_rate": 9.616118573413616e-07, "loss": 0.0002, "num_tokens": 113977460.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4145, "step_time": 18.30564560368657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 137.1875, "completions/mean_terminated_length": 137.1875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.35952040553092957, "epoch": 0.1920333487725799, "frac_reward_zero_std": 1.0, "grad_norm": 0.004456476774066687, "kl": 0.0028655443456955254, "learning_rate": 9.616025937934228e-07, "loss": 0.0001, "num_tokens": 113999127.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4146, "step_time": 14.782212276011705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 192.0625, "completions/mean_terminated_length": 192.0625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.4055248573422432, "epoch": 0.1920796665122742, "frac_reward_zero_std": 1.0, "grad_norm": 0.007383132819086313, "kl": 0.007461455068551004, "learning_rate": 9.615933302454839e-07, "loss": 0.0004, "num_tokens": 114024008.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4147, "step_time": 25.052754264324903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 192.75, "completions/mean_terminated_length": 192.75, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.42571692913770676, "epoch": 0.1921259842519685, "frac_reward_zero_std": 0.0, "grad_norm": 0.15006126463413239, "kl": 0.0028612722526304424, "learning_rate": 9.61584066697545e-07, "loss": 0.0523, "num_tokens": 114052356.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 4148, "step_time": 21.563019450753927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 152.25, "completions/mean_terminated_length": 152.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.432545930147171, "epoch": 0.1921723019916628, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019523289520293474, "kl": 0.002061988925561309, "learning_rate": 9.615748031496064e-07, "loss": 0.0001, "num_tokens": 114085864.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4149, "step_time": 18.404424782842398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 276.875, "completions/mean_terminated_length": 276.875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.22658958658576012, "epoch": 0.19221861973135712, "frac_reward_zero_std": 0.0, "grad_norm": 0.09118513762950897, "kl": 0.013732125982642174, "learning_rate": 9.615655396016675e-07, "loss": -0.0242, "num_tokens": 114111222.0, "reward": 0.9376300573348999, "reward_std": 0.03169326111674309, "rewards/reward_func/mean": 0.9376300573348999, "rewards/reward_func/std": 0.03169327229261398, "step": 4150, "step_time": 26.937728572636843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 161.0, "completions/mean_terminated_length": 161.0, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.14998751133680344, "epoch": 0.19226493747105142, "frac_reward_zero_std": 0.0, "grad_norm": 0.7351125478744507, "kl": 0.016170429589692503, "learning_rate": 9.615562760537286e-07, "loss": -0.0087, "num_tokens": 114148294.0, "reward": 0.9211294651031494, "reward_std": 0.047029267996549606, "rewards/reward_func/mean": 0.9211294651031494, "rewards/reward_func/std": 0.0470292754471302, "step": 4151, "step_time": 20.600656140595675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 158.5625, "completions/mean_terminated_length": 158.5625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.42090681195259094, "epoch": 0.1923112552107457, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016604408156126738, "kl": 0.0020379158959258348, "learning_rate": 9.615470125057897e-07, "loss": 0.0001, "num_tokens": 114201711.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4152, "step_time": 26.212098207324743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 226.125, "completions/mean_terminated_length": 226.125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.4798419699072838, "epoch": 0.19235757295044, "frac_reward_zero_std": 0.0, "grad_norm": 0.11509548127651215, "kl": 0.009450761834159493, "learning_rate": 9.615377489578509e-07, "loss": -0.0491, "num_tokens": 114236673.0, "reward": 0.05819142237305641, "reward_std": 0.23276568949222565, "rewards/reward_func/mean": 0.05819142237305641, "rewards/reward_func/std": 0.23276568949222565, "step": 4153, "step_time": 29.532480336725712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 149.625, "completions/mean_terminated_length": 149.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.36657992750406265, "epoch": 0.19240389069013433, "frac_reward_zero_std": 1.0, "grad_norm": 0.002306044800207019, "kl": 0.0024614148715045303, "learning_rate": 9.61528485409912e-07, "loss": 0.0001, "num_tokens": 114285531.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4154, "step_time": 22.670250222086906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 186.375, "completions/mean_terminated_length": 186.375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.3475746810436249, "epoch": 0.19245020842982863, "frac_reward_zero_std": 0.0, "grad_norm": 0.11811861395835876, "kl": 0.0071674431674182415, "learning_rate": 9.61519221861973e-07, "loss": 0.024, "num_tokens": 114318017.0, "reward": 0.2126832902431488, "reward_std": 0.3804594576358795, "rewards/reward_func/mean": 0.2126832902431488, "rewards/reward_func/std": 0.38045942783355713, "step": 4155, "step_time": 21.09713003784418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 152.9375, "completions/mean_terminated_length": 152.9375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.4188668951392174, "epoch": 0.19249652616952292, "frac_reward_zero_std": 1.0, "grad_norm": 0.002057152334600687, "kl": 0.0024148281081579626, "learning_rate": 9.615099583140342e-07, "loss": 0.0001, "num_tokens": 114360896.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4156, "step_time": 21.31528852507472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 139.0, "completions/mean_terminated_length": 139.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.38907235115766525, "epoch": 0.19254284390921722, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020492763724178076, "kl": 0.0019699092663358897, "learning_rate": 9.615006947660954e-07, "loss": 0.0001, "num_tokens": 114384736.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4157, "step_time": 15.394862465560436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 199.625, "completions/mean_terminated_length": 199.625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.3164656236767769, "epoch": 0.19258916164891154, "frac_reward_zero_std": 0.0, "grad_norm": 0.12706026434898376, "kl": 0.02596709644421935, "learning_rate": 9.614914312181565e-07, "loss": -0.0196, "num_tokens": 114413466.0, "reward": 0.017014067620038986, "reward_std": 0.0137868020683527, "rewards/reward_func/mean": 0.017014067620038986, "rewards/reward_func/std": 0.013786802999675274, "step": 4158, "step_time": 20.807936184108257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 199.75, "completions/mean_terminated_length": 199.75, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.20087550207972527, "epoch": 0.19263547938860584, "frac_reward_zero_std": 0.0, "grad_norm": 0.11680769175291061, "kl": 0.03648154158145189, "learning_rate": 9.614821676702176e-07, "loss": -0.077, "num_tokens": 114436854.0, "reward": 0.4652743935585022, "reward_std": 0.25358015298843384, "rewards/reward_func/mean": 0.4652743935585022, "rewards/reward_func/std": 0.25358015298843384, "step": 4159, "step_time": 21.823095712810755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 127.75, "completions/mean_terminated_length": 127.75, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.1348789781332016, "epoch": 0.19268179712830014, "frac_reward_zero_std": 1.0, "grad_norm": 0.009309993125498295, "kl": 0.003608310245908797, "learning_rate": 9.614729041222787e-07, "loss": 0.0002, "num_tokens": 114462418.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4160, "step_time": 14.663566589355469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 159.375, "completions/mean_terminated_length": 159.375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.21860400587320328, "epoch": 0.19272811486799443, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016577020287513733, "kl": 0.0014613009407185018, "learning_rate": 9.614636405743399e-07, "loss": 0.0001, "num_tokens": 114485816.0, "reward": 0.8824968934059143, "reward_std": 0.0, "rewards/reward_func/mean": 0.8824968934059143, "rewards/reward_func/std": 0.0, "step": 4161, "step_time": 16.999474443495274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 166.375, "completions/mean_terminated_length": 166.375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3817687928676605, "epoch": 0.19277443260768876, "frac_reward_zero_std": 1.0, "grad_norm": 0.002473505912348628, "kl": 0.0023684672778472304, "learning_rate": 9.614543770264012e-07, "loss": 0.0001, "num_tokens": 114518302.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4162, "step_time": 19.303426075726748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 165.9375, "completions/mean_terminated_length": 165.9375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.16365380585193634, "epoch": 0.19282075034738305, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027192379347980022, "kl": 0.0019629605812951922, "learning_rate": 9.614451134784623e-07, "loss": 0.0001, "num_tokens": 114539453.0, "reward": 0.9487294554710388, "reward_std": 0.0, "rewards/reward_func/mean": 0.9487294554710388, "rewards/reward_func/std": 0.0, "step": 4163, "step_time": 17.175036642700434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 143.25, "completions/mean_terminated_length": 143.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.2042892538011074, "epoch": 0.19286706808707735, "frac_reward_zero_std": 1.0, "grad_norm": 0.005544089246541262, "kl": 0.0037651165621355176, "learning_rate": 9.614358499305234e-07, "loss": 0.0002, "num_tokens": 114560033.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 4164, "step_time": 16.043456874787807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 206.3125, "completions/mean_terminated_length": 206.3125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.43551913648843765, "epoch": 0.19291338582677164, "frac_reward_zero_std": 0.0, "grad_norm": 0.10597597807645798, "kl": 0.0061697757337242365, "learning_rate": 9.614265863825844e-07, "loss": 0.0702, "num_tokens": 114582182.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 4165, "step_time": 21.919243324548006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 107.1875, "completions/mean_terminated_length": 107.1875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.2782635763287544, "epoch": 0.19295970356646597, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033314658794552088, "kl": 0.0019797772401943803, "learning_rate": 9.614173228346457e-07, "loss": 0.0001, "num_tokens": 114609609.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4166, "step_time": 14.042929541319609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 123.75, "completions/mean_terminated_length": 123.75, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.28888339549303055, "epoch": 0.19300602130616026, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036419229581952095, "kl": 0.002689459885004908, "learning_rate": 9.614080592867068e-07, "loss": 0.0001, "num_tokens": 114632997.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4167, "step_time": 14.445013903081417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 186.8125, "completions/mean_terminated_length": 186.8125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.27170734480023384, "epoch": 0.19305233904585456, "frac_reward_zero_std": 1.0, "grad_norm": 0.005016711074858904, "kl": 0.001881208037957549, "learning_rate": 9.61398795738768e-07, "loss": 0.0001, "num_tokens": 114666418.0, "reward": 0.2177126258611679, "reward_std": 0.0, "rewards/reward_func/mean": 0.2177126258611679, "rewards/reward_func/std": 0.0, "step": 4168, "step_time": 22.89176604896784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 139.6875, "completions/mean_terminated_length": 139.6875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3100237548351288, "epoch": 0.19309865678554886, "frac_reward_zero_std": 1.0, "grad_norm": 0.004565900191664696, "kl": 0.003418289590626955, "learning_rate": 9.61389532190829e-07, "loss": 0.0002, "num_tokens": 114689997.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4169, "step_time": 16.47387283295393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 150.5, "completions/mean_terminated_length": 150.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.21429673582315445, "epoch": 0.19314497452524318, "frac_reward_zero_std": 0.0, "grad_norm": 0.1330195516347885, "kl": 0.005152279802132398, "learning_rate": 9.613802686428902e-07, "loss": 0.0011, "num_tokens": 114713637.0, "reward": 0.9084429740905762, "reward_std": 0.01863100565969944, "rewards/reward_func/mean": 0.9084429740905762, "rewards/reward_func/std": 0.018631013110280037, "step": 4170, "step_time": 16.273397151380777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 187.25, "completions/mean_terminated_length": 187.25, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.21282903105020523, "epoch": 0.19319129226493748, "frac_reward_zero_std": 1.0, "grad_norm": 0.0058262222446501255, "kl": 0.005617423914372921, "learning_rate": 9.613710050949513e-07, "loss": 0.0003, "num_tokens": 114735305.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4171, "step_time": 18.354433950036764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3839796110987663, "epoch": 0.19323761000463177, "frac_reward_zero_std": 1.0, "grad_norm": 0.004510688595473766, "kl": 0.0033752802992239594, "learning_rate": 9.613617415470124e-07, "loss": 0.0002, "num_tokens": 114762687.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4172, "step_time": 17.608592182397842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 142.375, "completions/mean_terminated_length": 142.375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3697587624192238, "epoch": 0.19328392774432607, "frac_reward_zero_std": 1.0, "grad_norm": 0.0048271669074893, "kl": 0.0035815382725559175, "learning_rate": 9.613524779990736e-07, "loss": 0.0002, "num_tokens": 114804485.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4173, "step_time": 21.474687982350588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 130.0625, "completions/mean_terminated_length": 130.0625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2657466493546963, "epoch": 0.1933302454840204, "frac_reward_zero_std": 0.0, "grad_norm": 0.13559861481189728, "kl": 0.009173538070172071, "learning_rate": 9.613432144511347e-07, "loss": 0.0667, "num_tokens": 114824566.0, "reward": 0.7353917360305786, "reward_std": 0.3687800467014313, "rewards/reward_func/mean": 0.7353917360305786, "rewards/reward_func/std": 0.3687800467014313, "step": 4174, "step_time": 15.048453759402037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.39067216217517853, "epoch": 0.1933765632237147, "frac_reward_zero_std": 1.0, "grad_norm": 0.0065057831816375256, "kl": 0.002822590176947415, "learning_rate": 9.613339509031958e-07, "loss": 0.0001, "num_tokens": 114875918.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4175, "step_time": 24.341829635202885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 154.25, "completions/mean_terminated_length": 154.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3952605575323105, "epoch": 0.19342288096340898, "frac_reward_zero_std": 1.0, "grad_norm": 0.010032407008111477, "kl": 0.003641855902969837, "learning_rate": 9.613246873552571e-07, "loss": 0.0002, "num_tokens": 114906914.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4176, "step_time": 19.441778726875782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 161.1875, "completions/mean_terminated_length": 161.1875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3579148128628731, "epoch": 0.19346919870310328, "frac_reward_zero_std": 1.0, "grad_norm": 0.004618246108293533, "kl": 0.003541071724612266, "learning_rate": 9.61315423807318e-07, "loss": 0.0002, "num_tokens": 114937877.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4177, "step_time": 18.79634879156947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 153.375, "completions/mean_terminated_length": 153.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.1737445928156376, "epoch": 0.1935155164427976, "frac_reward_zero_std": 1.0, "grad_norm": 0.005803847219794989, "kl": 0.012925693765282631, "learning_rate": 9.613061602593792e-07, "loss": 0.0006, "num_tokens": 114959803.0, "reward": 0.9459594488143921, "reward_std": 0.0, "rewards/reward_func/mean": 0.9459594488143921, "rewards/reward_func/std": 0.0, "step": 4178, "step_time": 18.71392446756363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 142.75, "completions/mean_terminated_length": 142.75, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.36541668325662613, "epoch": 0.1935618341824919, "frac_reward_zero_std": 1.0, "grad_norm": 0.004241726361215115, "kl": 0.0033100010768976063, "learning_rate": 9.612968967114405e-07, "loss": 0.0002, "num_tokens": 115011111.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4179, "step_time": 22.45142360776663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 173.5625, "completions/mean_terminated_length": 173.5625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4178507551550865, "epoch": 0.1936081519221862, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032413292210549116, "kl": 0.0025839487207122147, "learning_rate": 9.612876331635016e-07, "loss": 0.0001, "num_tokens": 115051392.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4180, "step_time": 23.177177645266056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 136.4375, "completions/mean_terminated_length": 136.4375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.31026995927095413, "epoch": 0.1936544696618805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030805582646280527, "kl": 0.001636517175938934, "learning_rate": 9.612783696155628e-07, "loss": 0.0001, "num_tokens": 115087575.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4181, "step_time": 18.440025456249714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 177.3125, "completions/mean_terminated_length": 177.3125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.20102904736995697, "epoch": 0.19370078740157481, "frac_reward_zero_std": 1.0, "grad_norm": 0.007652140222489834, "kl": 0.007242341234814376, "learning_rate": 9.61269106067624e-07, "loss": 0.0004, "num_tokens": 115122204.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 4182, "step_time": 22.110097400844097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 154.0625, "completions/mean_terminated_length": 154.0625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3495076298713684, "epoch": 0.1937471051412691, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021891407668590546, "kl": 0.0018741044332273304, "learning_rate": 9.61259842519685e-07, "loss": 0.0001, "num_tokens": 115151405.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4183, "step_time": 19.876628793776035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 201.625, "completions/mean_terminated_length": 201.625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.2147158570587635, "epoch": 0.1937934228809634, "frac_reward_zero_std": 1.0, "grad_norm": 0.004653643351048231, "kl": 0.0035977481165900826, "learning_rate": 9.612505789717461e-07, "loss": 0.0002, "num_tokens": 115175191.0, "reward": 0.7703813910484314, "reward_std": 0.0, "rewards/reward_func/mean": 0.7703813910484314, "rewards/reward_func/std": 0.0, "step": 4184, "step_time": 24.77579763531685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 209.0, "completions/mean_terminated_length": 209.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.18587301298975945, "epoch": 0.1938397406206577, "frac_reward_zero_std": 0.0, "grad_norm": 0.12032096832990646, "kl": 0.005851471039932221, "learning_rate": 9.612413154238073e-07, "loss": -0.0155, "num_tokens": 115208167.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 4185, "step_time": 25.782169092446566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 158.5625, "completions/mean_terminated_length": 158.5625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.34699859470129013, "epoch": 0.19388605836035203, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022761430591344833, "kl": 0.0020523566636256874, "learning_rate": 9.612320518758684e-07, "loss": 0.0001, "num_tokens": 115240208.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4186, "step_time": 22.218475986272097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 210.0625, "completions/mean_terminated_length": 210.0625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.2231697253882885, "epoch": 0.19393237610004632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0070524620823562145, "kl": 0.007093827938660979, "learning_rate": 9.612227883279295e-07, "loss": 0.0004, "num_tokens": 115273697.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4187, "step_time": 25.761937137693167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 185.0625, "completions/mean_terminated_length": 185.0625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.23038675263524055, "epoch": 0.19397869383974062, "frac_reward_zero_std": 0.0, "grad_norm": 0.09096844494342804, "kl": 0.008536142529919744, "learning_rate": 9.612135247799906e-07, "loss": -0.0438, "num_tokens": 115296978.0, "reward": 0.9396798610687256, "reward_std": 0.023546535521745682, "rewards/reward_func/mean": 0.9396798610687256, "rewards/reward_func/std": 0.023546550422906876, "step": 4188, "step_time": 21.817990139126778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3383130058646202, "epoch": 0.19402501157943491, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037833014503121376, "kl": 0.0027758661308325827, "learning_rate": 9.612042612320518e-07, "loss": 0.0001, "num_tokens": 115318198.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4189, "step_time": 17.471931621432304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 146.5, "completions/mean_terminated_length": 146.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.34252479672431946, "epoch": 0.19407132931912924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029100715182721615, "kl": 0.0021770632301922888, "learning_rate": 9.61194997684113e-07, "loss": 0.0001, "num_tokens": 115344126.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4190, "step_time": 18.583404313772917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 151.3125, "completions/mean_terminated_length": 151.3125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3847458064556122, "epoch": 0.19411764705882353, "frac_reward_zero_std": 1.0, "grad_norm": 0.002347928937524557, "kl": 0.0023530853795818985, "learning_rate": 9.61185734136174e-07, "loss": 0.0001, "num_tokens": 115377299.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4191, "step_time": 21.23593070358038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 171.1875, "completions/mean_terminated_length": 171.1875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.19050484895706177, "epoch": 0.19416396479851783, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015054878313094378, "kl": 0.0011488858435768634, "learning_rate": 9.611764705882354e-07, "loss": 0.0001, "num_tokens": 115414390.0, "reward": 0.8751733303070068, "reward_std": 0.0, "rewards/reward_func/mean": 0.8751733303070068, "rewards/reward_func/std": 0.0, "step": 4192, "step_time": 24.56995451077819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 148.0625, "completions/mean_terminated_length": 148.0625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.4159550294280052, "epoch": 0.19421028253821213, "frac_reward_zero_std": 1.0, "grad_norm": 0.004171000327914953, "kl": 0.003203250002115965, "learning_rate": 9.611672070402965e-07, "loss": 0.0002, "num_tokens": 115458919.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4193, "step_time": 26.765640523284674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 125.625, "completions/mean_terminated_length": 125.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.207529217004776, "epoch": 0.19425660027790645, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022517722100019455, "kl": 0.0016181544633582234, "learning_rate": 9.611579434923576e-07, "loss": 0.0001, "num_tokens": 115478353.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4194, "step_time": 15.274928357452154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 141.0, "completions/mean_terminated_length": 141.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.25281310826539993, "epoch": 0.19430291801760075, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034878733567893505, "kl": 0.002078938763588667, "learning_rate": 9.611486799444185e-07, "loss": 0.0001, "num_tokens": 115498305.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4195, "step_time": 16.672792583703995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 161.9375, "completions/mean_terminated_length": 161.9375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3926580920815468, "epoch": 0.19434923575729504, "frac_reward_zero_std": 1.0, "grad_norm": 0.006166216917335987, "kl": 0.004865115392021835, "learning_rate": 9.611394163964799e-07, "loss": 0.0002, "num_tokens": 115543744.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4196, "step_time": 25.46180647611618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 163.625, "completions/mean_terminated_length": 163.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.1641295999288559, "epoch": 0.19439555349698934, "frac_reward_zero_std": 0.0, "grad_norm": 0.12148923426866531, "kl": 0.009986362012568861, "learning_rate": 9.61130152848541e-07, "loss": -0.034, "num_tokens": 115577514.0, "reward": 0.8585449457168579, "reward_std": 0.030507458373904228, "rewards/reward_func/mean": 0.8585449457168579, "rewards/reward_func/std": 0.030507460236549377, "step": 4197, "step_time": 22.401244588196278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 143.625, "completions/mean_terminated_length": 143.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3376666083931923, "epoch": 0.19444187123668366, "frac_reward_zero_std": 1.0, "grad_norm": 0.004313699901103973, "kl": 0.0029067276045680046, "learning_rate": 9.611208893006021e-07, "loss": 0.0001, "num_tokens": 115598996.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4198, "step_time": 19.078201115131378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 152.4375, "completions/mean_terminated_length": 152.4375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.38116656243801117, "epoch": 0.19448818897637796, "frac_reward_zero_std": 1.0, "grad_norm": 0.005778941325843334, "kl": 0.003494909673463553, "learning_rate": 9.611116257526632e-07, "loss": 0.0002, "num_tokens": 115628859.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4199, "step_time": 22.87818080559373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 182.9375, "completions/mean_terminated_length": 182.9375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.4122886210680008, "epoch": 0.19453450671607225, "frac_reward_zero_std": 1.0, "grad_norm": 0.0040127490647137165, "kl": 0.0033474526717327535, "learning_rate": 9.611023622047244e-07, "loss": 0.0002, "num_tokens": 115654506.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4200, "step_time": 23.09018326923251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 136.0, "completions/mean_terminated_length": 136.0, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.317509301006794, "epoch": 0.19458082445576655, "frac_reward_zero_std": 1.0, "grad_norm": 0.00948141235858202, "kl": 0.005367578472942114, "learning_rate": 9.610930986567855e-07, "loss": 0.0003, "num_tokens": 115690554.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4201, "step_time": 20.8521859459579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 137.375, "completions/mean_terminated_length": 137.375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3536151200532913, "epoch": 0.19462714219546087, "frac_reward_zero_std": 1.0, "grad_norm": 0.003814997849985957, "kl": 0.003215965232811868, "learning_rate": 9.610838351088466e-07, "loss": 0.0002, "num_tokens": 115719008.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4202, "step_time": 18.768044739961624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 137.0625, "completions/mean_terminated_length": 137.0625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.27821094542741776, "epoch": 0.19467345993515517, "frac_reward_zero_std": 1.0, "grad_norm": 0.003017124952748418, "kl": 0.0019369286019355059, "learning_rate": 9.610745715609077e-07, "loss": 0.0001, "num_tokens": 115740801.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4203, "step_time": 17.118156362324953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.4297303408384323, "epoch": 0.19471977767484946, "frac_reward_zero_std": 1.0, "grad_norm": 0.002109951339662075, "kl": 0.002258738153614104, "learning_rate": 9.610653080129689e-07, "loss": 0.0001, "num_tokens": 115778179.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4204, "step_time": 24.89584843814373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 195.375, "completions/mean_terminated_length": 195.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.2171323448419571, "epoch": 0.19476609541454376, "frac_reward_zero_std": 0.0, "grad_norm": 0.15486817061901093, "kl": 0.014342037728056312, "learning_rate": 9.6105604446503e-07, "loss": -0.0228, "num_tokens": 115815929.0, "reward": 0.9793950319290161, "reward_std": 0.0824199914932251, "rewards/reward_func/mean": 0.9793950319290161, "rewards/reward_func/std": 0.0824199914932251, "step": 4205, "step_time": 25.921521224081516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 159.875, "completions/mean_terminated_length": 159.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.28121767938137054, "epoch": 0.19481241315423808, "frac_reward_zero_std": 0.0, "grad_norm": 0.16210217773914337, "kl": 0.03833826305344701, "learning_rate": 9.610467809170913e-07, "loss": 0.0126, "num_tokens": 115836679.0, "reward": 0.9559363126754761, "reward_std": 0.058751560747623444, "rewards/reward_func/mean": 0.9559363126754761, "rewards/reward_func/std": 0.058751557022333145, "step": 4206, "step_time": 19.4800363779068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 164.75, "completions/mean_terminated_length": 164.75, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.22877272963523865, "epoch": 0.19485873089393238, "frac_reward_zero_std": 0.0, "grad_norm": 0.13253124058246613, "kl": 0.005843870399985462, "learning_rate": 9.610375173691524e-07, "loss": -0.0154, "num_tokens": 115858179.0, "reward": 0.9439884424209595, "reward_std": 0.027789480984210968, "rewards/reward_func/mean": 0.9439884424209595, "rewards/reward_func/std": 0.027789490297436714, "step": 4207, "step_time": 18.124618284404278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.12721232697367668, "epoch": 0.19490504863362668, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015336594078689814, "kl": 0.0009773392375791445, "learning_rate": 9.610282538212134e-07, "loss": 0.0, "num_tokens": 115892945.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 4208, "step_time": 20.8048697412014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.39501602947711945, "epoch": 0.19495136637332097, "frac_reward_zero_std": 1.0, "grad_norm": 0.002226000651717186, "kl": 0.0021614611614495516, "learning_rate": 9.610189902732747e-07, "loss": 0.0001, "num_tokens": 115925897.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4209, "step_time": 20.158426381647587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 394.125, "completions/mean_terminated_length": 394.125, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "entropy": 0.2638545520603657, "epoch": 0.1949976841130153, "frac_reward_zero_std": 0.0, "grad_norm": 0.08238273113965988, "kl": 0.02052580565214157, "learning_rate": 9.610097267253358e-07, "loss": -0.0409, "num_tokens": 115961819.0, "reward": 0.899699330329895, "reward_std": 0.10436239093542099, "rewards/reward_func/mean": 0.899699330329895, "rewards/reward_func/std": 0.10436239093542099, "step": 4210, "step_time": 43.25924604386091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 152.75, "completions/mean_terminated_length": 152.75, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3825525864958763, "epoch": 0.1950440018527096, "frac_reward_zero_std": 1.0, "grad_norm": 0.006205675192177296, "kl": 0.004654404416214675, "learning_rate": 9.61000463177397e-07, "loss": 0.0002, "num_tokens": 115985607.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4211, "step_time": 18.63008676469326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 159.6875, "completions/mean_terminated_length": 159.6875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.155677430331707, "epoch": 0.1950903195924039, "frac_reward_zero_std": 1.0, "grad_norm": 0.002837250242009759, "kl": 0.09240571223199368, "learning_rate": 9.60991199629458e-07, "loss": 0.0046, "num_tokens": 116010354.0, "reward": 0.910879909992218, "reward_std": 0.0, "rewards/reward_func/mean": 0.910879909992218, "rewards/reward_func/std": 0.0, "step": 4212, "step_time": 19.852250806987286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 210.0625, "completions/mean_terminated_length": 210.0625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.42255423218011856, "epoch": 0.19513663733209818, "frac_reward_zero_std": 1.0, "grad_norm": 0.005290188826620579, "kl": 0.00481342279817909, "learning_rate": 9.609819360815192e-07, "loss": 0.0002, "num_tokens": 116036403.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4213, "step_time": 32.70421166345477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 174.3125, "completions/mean_terminated_length": 174.3125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.41341114789247513, "epoch": 0.1951829550717925, "frac_reward_zero_std": 1.0, "grad_norm": 0.011044755578041077, "kl": 0.0070021499413996935, "learning_rate": 9.609726725335803e-07, "loss": 0.0003, "num_tokens": 116079592.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4214, "step_time": 26.656338464468718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 142.4375, "completions/mean_terminated_length": 142.4375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2703259363770485, "epoch": 0.1952292728114868, "frac_reward_zero_std": 1.0, "grad_norm": 0.005582020152360201, "kl": 0.003675670886877924, "learning_rate": 9.609634089856414e-07, "loss": 0.0002, "num_tokens": 116099663.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4215, "step_time": 16.943227514624596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 159.5, "completions/mean_terminated_length": 159.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.42376527935266495, "epoch": 0.1952755905511811, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023706508800387383, "kl": 0.002180389012210071, "learning_rate": 9.609541454377026e-07, "loss": 0.0001, "num_tokens": 116133031.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4216, "step_time": 21.929262027144432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 140.0, "completions/mean_terminated_length": 140.0, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3579980581998825, "epoch": 0.1953219082908754, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021299226209521294, "kl": 0.0022214812925085425, "learning_rate": 9.609448818897637e-07, "loss": 0.0001, "num_tokens": 116163735.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4217, "step_time": 18.779218014329672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 176.5, "completions/mean_terminated_length": 176.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.2582762949168682, "epoch": 0.19536822603056972, "frac_reward_zero_std": 0.0, "grad_norm": 0.15609025955200195, "kl": 0.006726448307745159, "learning_rate": 9.609356183418248e-07, "loss": -0.0004, "num_tokens": 116186847.0, "reward": 0.994957685470581, "reward_std": 0.020169313997030258, "rewards/reward_func/mean": 0.994957685470581, "rewards/reward_func/std": 0.020169317722320557, "step": 4218, "step_time": 20.52297157049179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 176.5, "completions/mean_terminated_length": 176.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3304376155138016, "epoch": 0.19541454377026402, "frac_reward_zero_std": 0.0, "grad_norm": 0.1558787077665329, "kl": 0.01612040540203452, "learning_rate": 9.609263547938862e-07, "loss": -0.1094, "num_tokens": 116210231.0, "reward": 0.1659776270389557, "reward_std": 0.35684120655059814, "rewards/reward_func/mean": 0.1659776270389557, "rewards/reward_func/std": 0.35684117674827576, "step": 4219, "step_time": 22.43446246162057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 125.625, "completions/mean_terminated_length": 125.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2898036688566208, "epoch": 0.1954608615099583, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023353155702352524, "kl": 0.0018903164600487798, "learning_rate": 9.60917091245947e-07, "loss": 0.0001, "num_tokens": 116236017.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4220, "step_time": 16.95613558217883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 190.3125, "completions/mean_terminated_length": 190.3125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.27350638061761856, "epoch": 0.1955071792496526, "frac_reward_zero_std": 0.0, "grad_norm": 0.1756594330072403, "kl": 0.021484515629708767, "learning_rate": 9.609078276980082e-07, "loss": 0.0274, "num_tokens": 116257894.0, "reward": 0.9054722785949707, "reward_std": 0.09762780368328094, "rewards/reward_func/mean": 0.9054722785949707, "rewards/reward_func/std": 0.09762781113386154, "step": 4221, "step_time": 24.026976376771927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 151.75, "completions/mean_terminated_length": 151.75, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.2922987937927246, "epoch": 0.19555349698934693, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031239285599440336, "kl": 0.002129494328983128, "learning_rate": 9.608985641500695e-07, "loss": 0.0001, "num_tokens": 116278498.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4222, "step_time": 18.22118454799056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 181.3125, "completions/mean_terminated_length": 181.3125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4489883482456207, "epoch": 0.19559981472904123, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037734820507466793, "kl": 0.0030742106027901173, "learning_rate": 9.608893006021307e-07, "loss": 0.0002, "num_tokens": 116310247.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4223, "step_time": 22.699246127158403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 206.25, "completions/mean_terminated_length": 206.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.39858105778694153, "epoch": 0.19564613246873552, "frac_reward_zero_std": 1.0, "grad_norm": 0.007778745610266924, "kl": 0.006325821857899427, "learning_rate": 9.608800370541918e-07, "loss": 0.0003, "num_tokens": 116342379.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4224, "step_time": 29.138686653226614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 148.375, "completions/mean_terminated_length": 148.375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.22915543615818024, "epoch": 0.19569245020842982, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035018566995859146, "kl": 0.0028540154453366995, "learning_rate": 9.60870773506253e-07, "loss": 0.0001, "num_tokens": 116362977.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4225, "step_time": 17.080576337873936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 190.75, "completions/mean_terminated_length": 190.75, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.42701394855976105, "epoch": 0.19573876794812414, "frac_reward_zero_std": 1.0, "grad_norm": 0.0068210274912416935, "kl": 0.00447841000277549, "learning_rate": 9.60861509958314e-07, "loss": 0.0002, "num_tokens": 116384749.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4226, "step_time": 23.173054572194815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 203.8125, "completions/mean_terminated_length": 203.8125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.2344268225133419, "epoch": 0.19578508568781844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021351028699427843, "kl": 0.035033333115279675, "learning_rate": 9.608522464103752e-07, "loss": 0.0017, "num_tokens": 116416714.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4227, "step_time": 25.170080687850714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 337.75, "completions/mean_terminated_length": 337.75, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "entropy": 0.17622502893209457, "epoch": 0.19583140342751273, "frac_reward_zero_std": 1.0, "grad_norm": 0.005085702054202557, "kl": 0.004294879618100822, "learning_rate": 9.608429828624363e-07, "loss": 0.0002, "num_tokens": 116442758.0, "reward": 0.7532761096954346, "reward_std": 0.0, "rewards/reward_func/mean": 0.7532761096954346, "rewards/reward_func/std": 0.0, "step": 4228, "step_time": 36.40155283361673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 132.125, "completions/mean_terminated_length": 132.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.294950395822525, "epoch": 0.19587772116720703, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016333815874531865, "kl": 0.0014642233145423234, "learning_rate": 9.608337193144974e-07, "loss": 0.0001, "num_tokens": 116468312.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4229, "step_time": 18.435693010687828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 185.625, "completions/mean_terminated_length": 185.625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.19461952149868011, "epoch": 0.19592403890690135, "frac_reward_zero_std": 1.0, "grad_norm": 0.005276711191982031, "kl": 0.0035938590299338102, "learning_rate": 9.608244557665585e-07, "loss": 0.0002, "num_tokens": 116489874.0, "reward": 0.780767560005188, "reward_std": 0.0, "rewards/reward_func/mean": 0.780767560005188, "rewards/reward_func/std": 0.0, "step": 4230, "step_time": 21.15241439640522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 157.6875, "completions/mean_terminated_length": 157.6875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.43789688497781754, "epoch": 0.19597035664659565, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024460810236632824, "kl": 0.0021170018007978797, "learning_rate": 9.608151922186197e-07, "loss": 0.0001, "num_tokens": 116528797.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4231, "step_time": 25.725592702627182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 208.3125, "completions/mean_terminated_length": 208.3125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.2611159197986126, "epoch": 0.19601667438628995, "frac_reward_zero_std": 0.0, "grad_norm": 0.08797170966863632, "kl": 0.004341409425251186, "learning_rate": 9.608059286706808e-07, "loss": 0.0201, "num_tokens": 116550722.0, "reward": 0.9966088533401489, "reward_std": 0.013564594089984894, "rewards/reward_func/mean": 0.9966088533401489, "rewards/reward_func/std": 0.013564602471888065, "step": 4232, "step_time": 24.211438823491335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 172.875, "completions/mean_terminated_length": 172.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.20028812438249588, "epoch": 0.19606299212598424, "frac_reward_zero_std": 0.0, "grad_norm": 0.17509032785892487, "kl": 0.019681470876093954, "learning_rate": 9.60796665122742e-07, "loss": -0.0895, "num_tokens": 116575280.0, "reward": 0.4348171353340149, "reward_std": 0.27660179138183594, "rewards/reward_func/mean": 0.4348171353340149, "rewards/reward_func/std": 0.2766018211841583, "step": 4233, "step_time": 22.409042045474052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 165.3125, "completions/mean_terminated_length": 165.3125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.16007276996970177, "epoch": 0.19610930986567857, "frac_reward_zero_std": 0.0, "grad_norm": 0.10615488141775131, "kl": 0.001744678505929187, "learning_rate": 9.60787401574803e-07, "loss": -0.0777, "num_tokens": 116626245.0, "reward": 0.8260819911956787, "reward_std": 0.036492254585027695, "rewards/reward_func/mean": 0.8260819911956787, "rewards/reward_func/std": 0.03649226203560829, "step": 4234, "step_time": 26.96291321888566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 192.125, "completions/mean_terminated_length": 192.125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.35143470764160156, "epoch": 0.19615562760537286, "frac_reward_zero_std": 1.0, "grad_norm": 0.002009526826441288, "kl": 0.0020950931939296424, "learning_rate": 9.607781380268642e-07, "loss": 0.0001, "num_tokens": 116655479.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4235, "step_time": 22.444360587745905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 181.0625, "completions/mean_terminated_length": 181.0625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.36605076491832733, "epoch": 0.19620194534506716, "frac_reward_zero_std": 1.0, "grad_norm": 0.004338215105235577, "kl": 0.003199953935109079, "learning_rate": 9.607688744789255e-07, "loss": 0.0002, "num_tokens": 116697496.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4236, "step_time": 25.907401349395514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 120.875, "completions/mean_terminated_length": 120.875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2512933015823364, "epoch": 0.19624826308476145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029526492580771446, "kl": 0.0016395335551351309, "learning_rate": 9.607596109309866e-07, "loss": 0.0001, "num_tokens": 116717126.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4237, "step_time": 16.43812559172511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 168.1875, "completions/mean_terminated_length": 168.1875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.1612314097583294, "epoch": 0.19629458082445578, "frac_reward_zero_std": 1.0, "grad_norm": 0.0048973532393574715, "kl": 0.0034369818749837577, "learning_rate": 9.607503473830475e-07, "loss": 0.0002, "num_tokens": 116740425.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 4238, "step_time": 19.980030063539743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 162.5, "completions/mean_terminated_length": 162.5, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.2984234616160393, "epoch": 0.19634089856415007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027296452317386866, "kl": 0.002165403391700238, "learning_rate": 9.607410838351089e-07, "loss": 0.0001, "num_tokens": 116762369.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4239, "step_time": 19.539735689759254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 188.3125, "completions/mean_terminated_length": 188.3125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.4104792848229408, "epoch": 0.19638721630384437, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026210760697722435, "kl": 0.0024948460049927235, "learning_rate": 9.6073182028717e-07, "loss": 0.0001, "num_tokens": 116793350.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4240, "step_time": 25.698275484144688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 138.4375, "completions/mean_terminated_length": 138.4375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2984291911125183, "epoch": 0.19643353404353867, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014600768918171525, "kl": 0.0014500562101602554, "learning_rate": 9.607225567392311e-07, "loss": 0.0001, "num_tokens": 116815821.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4241, "step_time": 19.075826909393072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 172.625, "completions/mean_terminated_length": 172.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.4563741758465767, "epoch": 0.196479851783233, "frac_reward_zero_std": 1.0, "grad_norm": 0.007138571701943874, "kl": 0.0027323447866365314, "learning_rate": 9.607132931912922e-07, "loss": 0.0001, "num_tokens": 116863351.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4242, "step_time": 31.27183734625578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 191.75, "completions/mean_terminated_length": 191.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.47689762711524963, "epoch": 0.19652616952292729, "frac_reward_zero_std": 1.0, "grad_norm": 0.004615445155650377, "kl": 0.0036913889925926924, "learning_rate": 9.607040296433534e-07, "loss": 0.0002, "num_tokens": 116903811.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4243, "step_time": 29.8212310038507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 153.875, "completions/mean_terminated_length": 153.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.22932784631848335, "epoch": 0.19657248726262158, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037516478914767504, "kl": 0.003196165431290865, "learning_rate": 9.606947660954145e-07, "loss": 0.0002, "num_tokens": 116924097.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4244, "step_time": 18.255730766803026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 121.8125, "completions/mean_terminated_length": 121.8125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.26331768184900284, "epoch": 0.19661880500231588, "frac_reward_zero_std": 1.0, "grad_norm": 0.004044357221573591, "kl": 0.002656788448803127, "learning_rate": 9.606855025474756e-07, "loss": 0.0001, "num_tokens": 116944446.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4245, "step_time": 15.24508398398757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 168.6875, "completions/mean_terminated_length": 168.6875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4032922387123108, "epoch": 0.1966651227420102, "frac_reward_zero_std": 1.0, "grad_norm": 0.003165687434375286, "kl": 0.0030372024630196393, "learning_rate": 9.606762389995367e-07, "loss": 0.0002, "num_tokens": 117003001.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4246, "step_time": 33.07563906535506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 198.75, "completions/mean_terminated_length": 198.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.24546974152326584, "epoch": 0.1967114404817045, "frac_reward_zero_std": 0.0, "grad_norm": 0.10324181616306305, "kl": 0.020861016120761633, "learning_rate": 9.606669754515979e-07, "loss": 0.0337, "num_tokens": 117024453.0, "reward": 0.6265100240707397, "reward_std": 0.00829548854380846, "rewards/reward_func/mean": 0.6265100240707397, "rewards/reward_func/std": 0.008295491337776184, "step": 4247, "step_time": 22.54040576890111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 228.3125, "completions/mean_terminated_length": 228.3125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.5325218811631203, "epoch": 0.1967577582213988, "frac_reward_zero_std": 0.0, "grad_norm": 0.12794294953346252, "kl": 0.007044766913168132, "learning_rate": 9.60657711903659e-07, "loss": 0.2161, "num_tokens": 117053866.0, "reward": 0.5625, "reward_std": 0.5123475193977356, "rewards/reward_func/mean": 0.5625, "rewards/reward_func/std": 0.5123475790023804, "step": 4248, "step_time": 32.69337200373411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 152.0625, "completions/mean_terminated_length": 152.0625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.40886230766773224, "epoch": 0.1968040759610931, "frac_reward_zero_std": 1.0, "grad_norm": 0.006556299049407244, "kl": 0.0021459850249812007, "learning_rate": 9.606484483557203e-07, "loss": 0.0001, "num_tokens": 117086427.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4249, "step_time": 21.64050517976284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 127.4375, "completions/mean_terminated_length": 127.4375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3320283144712448, "epoch": 0.1968503937007874, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021526897326111794, "kl": 0.0018389831820968539, "learning_rate": 9.606391848077814e-07, "loss": 0.0001, "num_tokens": 117115074.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4250, "step_time": 19.892197255045176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 232.6875, "completions/mean_terminated_length": 232.6875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.39874593913555145, "epoch": 0.1968967114404817, "frac_reward_zero_std": 0.0, "grad_norm": 0.10539249330759048, "kl": 0.004786168457940221, "learning_rate": 9.606299212598424e-07, "loss": 0.1641, "num_tokens": 117159405.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 4251, "step_time": 38.98399826139212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 212.0625, "completions/mean_terminated_length": 212.0625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.28354430943727493, "epoch": 0.196943029180176, "frac_reward_zero_std": 0.0, "grad_norm": 0.16920839250087738, "kl": 0.013166029239073396, "learning_rate": 9.606206577119035e-07, "loss": -0.1126, "num_tokens": 117189742.0, "reward": 0.40036869049072266, "reward_std": 0.41349899768829346, "rewards/reward_func/mean": 0.40036869049072266, "rewards/reward_func/std": 0.41349902749061584, "step": 4252, "step_time": 28.75827705487609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 168.9375, "completions/mean_terminated_length": 168.9375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.38469357043504715, "epoch": 0.1969893469198703, "frac_reward_zero_std": 1.0, "grad_norm": 0.016849059611558914, "kl": 0.008995268843136728, "learning_rate": 9.606113941639648e-07, "loss": 0.0005, "num_tokens": 117226493.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4253, "step_time": 25.00868810713291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 193.8125, "completions/mean_terminated_length": 193.8125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.2701209485530853, "epoch": 0.19703566465956462, "frac_reward_zero_std": 0.0, "grad_norm": 0.15225957334041595, "kl": 0.015052320901304483, "learning_rate": 9.60602130616026e-07, "loss": 0.0469, "num_tokens": 117250234.0, "reward": 0.4334152936935425, "reward_std": 0.1670493483543396, "rewards/reward_func/mean": 0.4334152936935425, "rewards/reward_func/std": 0.1670493334531784, "step": 4254, "step_time": 26.83351392298937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 191.875, "completions/mean_terminated_length": 191.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.3544131889939308, "epoch": 0.19708198239925892, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018783751875162125, "kl": 0.0022542255464941263, "learning_rate": 9.60592867068087e-07, "loss": 0.0001, "num_tokens": 117283512.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4255, "step_time": 24.932211596518755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 211.8125, "completions/mean_terminated_length": 211.8125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.4790257215499878, "epoch": 0.19712830013895322, "frac_reward_zero_std": 0.0, "grad_norm": 0.10050687938928604, "kl": 0.008070754003711045, "learning_rate": 9.605836035201482e-07, "loss": 0.0672, "num_tokens": 117305877.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 4256, "step_time": 25.3606780692935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 172.3125, "completions/mean_terminated_length": 172.3125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.40603653341531754, "epoch": 0.1971746178786475, "frac_reward_zero_std": 0.0, "grad_norm": 0.1433793604373932, "kl": 0.009749911143444479, "learning_rate": 9.605743399722093e-07, "loss": 0.0347, "num_tokens": 117326330.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 4257, "step_time": 23.288886569440365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.19553682953119278, "epoch": 0.19722093561834184, "frac_reward_zero_std": 1.0, "grad_norm": 0.002261777874082327, "kl": 0.0021992510301060975, "learning_rate": 9.605650764242704e-07, "loss": 0.0001, "num_tokens": 117351276.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4258, "step_time": 21.188062090426683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 182.625, "completions/mean_terminated_length": 182.625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.24807263538241386, "epoch": 0.19726725335803613, "frac_reward_zero_std": 0.0, "grad_norm": 0.14250698685646057, "kl": 0.029690278228372335, "learning_rate": 9.605558128763316e-07, "loss": -0.0069, "num_tokens": 117372566.0, "reward": 0.6371071338653564, "reward_std": 0.03512675315141678, "rewards/reward_func/mean": 0.6371071338653564, "rewards/reward_func/std": 0.03512675687670708, "step": 4259, "step_time": 20.61169083416462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 178.375, "completions/mean_terminated_length": 178.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.20011616870760918, "epoch": 0.19731357109773043, "frac_reward_zero_std": 0.0, "grad_norm": 0.14215035736560822, "kl": 0.07767318189144135, "learning_rate": 9.605465493283927e-07, "loss": 0.0035, "num_tokens": 117393612.0, "reward": 0.752162516117096, "reward_std": 0.2807443141937256, "rewards/reward_func/mean": 0.752162516117096, "rewards/reward_func/std": 0.280744343996048, "step": 4260, "step_time": 19.663628932088614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 133.375, "completions/mean_terminated_length": 133.375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2357555367052555, "epoch": 0.19735988883742472, "frac_reward_zero_std": 1.0, "grad_norm": 0.012321810238063335, "kl": 0.005285892868414521, "learning_rate": 9.605372857804538e-07, "loss": 0.0003, "num_tokens": 117413778.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4261, "step_time": 18.2445274181664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 173.125, "completions/mean_terminated_length": 173.125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.24125902354717255, "epoch": 0.19740620657711905, "frac_reward_zero_std": 1.0, "grad_norm": 0.004134069662541151, "kl": 0.04424409521743655, "learning_rate": 9.605280222325152e-07, "loss": 0.0022, "num_tokens": 117436196.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4262, "step_time": 20.667074255645275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 142.0, "completions/mean_terminated_length": 142.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3567390739917755, "epoch": 0.19745252431681334, "frac_reward_zero_std": 1.0, "grad_norm": 0.002030797302722931, "kl": 0.0020479909144341946, "learning_rate": 9.60518758684576e-07, "loss": 0.0001, "num_tokens": 117460948.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4263, "step_time": 18.66989303380251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 133.5, "completions/mean_terminated_length": 133.5, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.27444790303707123, "epoch": 0.19749884205650764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030250735580921173, "kl": 0.002313581178896129, "learning_rate": 9.605094951366372e-07, "loss": 0.0001, "num_tokens": 117482748.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4264, "step_time": 17.1928793489933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 131.625, "completions/mean_terminated_length": 131.625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.34081917256116867, "epoch": 0.19754515979620194, "frac_reward_zero_std": 1.0, "grad_norm": 0.003179185790941119, "kl": 0.0027597531443461776, "learning_rate": 9.605002315886983e-07, "loss": 0.0001, "num_tokens": 117504374.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4265, "step_time": 16.26287868246436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 173.5, "completions/mean_terminated_length": 173.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.20281190425157547, "epoch": 0.19759147753589626, "frac_reward_zero_std": 1.0, "grad_norm": 0.012527067214250565, "kl": 0.019857921521179378, "learning_rate": 9.604909680407597e-07, "loss": 0.0009, "num_tokens": 117536926.0, "reward": 0.10889280587434769, "reward_std": 0.0, "rewards/reward_func/mean": 0.10889280587434769, "rewards/reward_func/std": 0.0, "step": 4266, "step_time": 23.154936235398054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 144.8125, "completions/mean_terminated_length": 144.8125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.32744892686605453, "epoch": 0.19763779527559056, "frac_reward_zero_std": 1.0, "grad_norm": 0.013877227902412415, "kl": 0.007512084790505469, "learning_rate": 9.604817044928208e-07, "loss": 0.0004, "num_tokens": 117560475.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4267, "step_time": 19.869691032916307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 125.4375, "completions/mean_terminated_length": 125.4375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2384634166955948, "epoch": 0.19768411301528485, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032265728805214167, "kl": 0.0018207980028819293, "learning_rate": 9.60472440944882e-07, "loss": 0.0001, "num_tokens": 117580322.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4268, "step_time": 15.346847128123045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 149.1875, "completions/mean_terminated_length": 149.1875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3133619949221611, "epoch": 0.19773043075497915, "frac_reward_zero_std": 1.0, "grad_norm": 0.008078614249825478, "kl": 0.004485111043322831, "learning_rate": 9.60463177396943e-07, "loss": 0.0002, "num_tokens": 117602469.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4269, "step_time": 18.841124154627323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 120.3125, "completions/mean_terminated_length": 120.3125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.30543189495801926, "epoch": 0.19777674849467347, "frac_reward_zero_std": 1.0, "grad_norm": 0.002928100759163499, "kl": 0.0022029224201105535, "learning_rate": 9.604539138490042e-07, "loss": 0.0001, "num_tokens": 117637562.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4270, "step_time": 18.047686591744423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 120.6875, "completions/mean_terminated_length": 120.6875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.27932991087436676, "epoch": 0.19782306623436777, "frac_reward_zero_std": 1.0, "grad_norm": 0.005019314121454954, "kl": 0.0028968447586521506, "learning_rate": 9.604446503010653e-07, "loss": 0.0001, "num_tokens": 117658005.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4271, "step_time": 14.7598297894001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 178.25, "completions/mean_terminated_length": 178.25, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.35574671626091003, "epoch": 0.19786938397406206, "frac_reward_zero_std": 0.0, "grad_norm": 0.18242698907852173, "kl": 0.018063213676214218, "learning_rate": 9.604353867531264e-07, "loss": -0.0296, "num_tokens": 117681561.0, "reward": 0.5846918821334839, "reward_std": 0.46775349974632263, "rewards/reward_func/mean": 0.5846918821334839, "rewards/reward_func/std": 0.46775349974632263, "step": 4272, "step_time": 22.201333187520504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 104.5625, "completions/mean_terminated_length": 104.5625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.2888680547475815, "epoch": 0.19791570171375636, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035976667422801256, "kl": 0.0022441393230110407, "learning_rate": 9.604261232051875e-07, "loss": 0.0001, "num_tokens": 117704594.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4273, "step_time": 14.309563618153334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 157.6875, "completions/mean_terminated_length": 157.6875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.1722608432173729, "epoch": 0.19796201945345068, "frac_reward_zero_std": 0.0, "grad_norm": 0.21106410026550293, "kl": 0.05202815495431423, "learning_rate": 9.604168596572487e-07, "loss": -0.0167, "num_tokens": 117732525.0, "reward": 0.9616204500198364, "reward_std": 0.06865544617176056, "rewards/reward_func/mean": 0.9616204500198364, "rewards/reward_func/std": 0.06865545362234116, "step": 4274, "step_time": 21.714784532785416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.3125, "completions/mean_terminated_length": 234.3125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "entropy": 0.16529910638928413, "epoch": 0.19800833719314498, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026992466300725937, "kl": 0.002464011573465541, "learning_rate": 9.604075961093098e-07, "loss": 0.0001, "num_tokens": 117770098.0, "reward": 0.5576546788215637, "reward_std": 0.0, "rewards/reward_func/mean": 0.5576546788215637, "rewards/reward_func/std": 0.0, "step": 4275, "step_time": 29.23502964526415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 160.9375, "completions/mean_terminated_length": 160.9375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.28430066257715225, "epoch": 0.19805465493283927, "frac_reward_zero_std": 1.0, "grad_norm": 0.003889898071065545, "kl": 0.0031116321333684027, "learning_rate": 9.60398332561371e-07, "loss": 0.0002, "num_tokens": 117792001.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4276, "step_time": 21.740156807005405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 146.3125, "completions/mean_terminated_length": 146.3125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.25032076239585876, "epoch": 0.19810097267253357, "frac_reward_zero_std": 0.0, "grad_norm": 0.17101536691188812, "kl": 0.006241139606572688, "learning_rate": 9.60389069013432e-07, "loss": -0.1195, "num_tokens": 117814230.0, "reward": 0.1875, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.1875, "rewards/reward_func/std": 0.40311288833618164, "step": 4277, "step_time": 22.160698179155588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 183.6875, "completions/mean_terminated_length": 183.6875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.4354545697569847, "epoch": 0.1981472904122279, "frac_reward_zero_std": 1.0, "grad_norm": 0.02668669819831848, "kl": 0.01329098385758698, "learning_rate": 9.603798054654932e-07, "loss": 0.0007, "num_tokens": 117840369.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4278, "step_time": 25.36359355598688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 190.0625, "completions/mean_terminated_length": 190.0625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.3894534707069397, "epoch": 0.1981936081519222, "frac_reward_zero_std": 0.0, "grad_norm": 0.11786775290966034, "kl": 0.007994166575372219, "learning_rate": 9.603705419175545e-07, "loss": 0.0213, "num_tokens": 117863106.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 4279, "step_time": 23.085601836442947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 125.9375, "completions/mean_terminated_length": 125.9375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2934921160340309, "epoch": 0.1982399258916165, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019177637295797467, "kl": 0.0017948686436284333, "learning_rate": 9.603612783696156e-07, "loss": 0.0001, "num_tokens": 117884369.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4280, "step_time": 15.263262741267681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 157.125, "completions/mean_terminated_length": 157.125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.1332587655633688, "epoch": 0.19828624363131078, "frac_reward_zero_std": 0.0, "grad_norm": 0.25884559750556946, "kl": 0.036092991940677166, "learning_rate": 9.603520148216765e-07, "loss": -0.0136, "num_tokens": 117904963.0, "reward": 0.9548678994178772, "reward_std": 0.12332441657781601, "rewards/reward_func/mean": 0.9548678994178772, "rewards/reward_func/std": 0.1233244240283966, "step": 4281, "step_time": 17.021088305860758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 201.5, "completions/mean_terminated_length": 201.5, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.27391989156603813, "epoch": 0.1983325613710051, "frac_reward_zero_std": 0.0, "grad_norm": 0.11594119668006897, "kl": 0.014407145557925105, "learning_rate": 9.603427512737377e-07, "loss": 0.0807, "num_tokens": 117935355.0, "reward": 0.8145408630371094, "reward_std": 0.24217469990253448, "rewards/reward_func/mean": 0.8145408630371094, "rewards/reward_func/std": 0.24217469990253448, "step": 4282, "step_time": 29.77666798233986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 114.8125, "completions/mean_terminated_length": 114.8125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3361153304576874, "epoch": 0.1983788791106994, "frac_reward_zero_std": 1.0, "grad_norm": 0.00720619922503829, "kl": 0.003871683613397181, "learning_rate": 9.60333487725799e-07, "loss": 0.0002, "num_tokens": 117959832.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4283, "step_time": 15.830325540155172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 142.75, "completions/mean_terminated_length": 142.75, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.17089350149035454, "epoch": 0.1984251968503937, "frac_reward_zero_std": 0.0, "grad_norm": 0.19309671223163605, "kl": 0.005295194743666798, "learning_rate": 9.603242241778601e-07, "loss": -0.0036, "num_tokens": 117980164.0, "reward": 0.9444707632064819, "reward_std": 0.03311120346188545, "rewards/reward_func/mean": 0.9444707632064819, "rewards/reward_func/std": 0.03311121463775635, "step": 4284, "step_time": 17.01628965139389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 179.3125, "completions/mean_terminated_length": 179.3125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.19935547560453415, "epoch": 0.198471514590088, "frac_reward_zero_std": 0.0, "grad_norm": 0.1068725511431694, "kl": 0.004089641530299559, "learning_rate": 9.603149606299212e-07, "loss": -0.0525, "num_tokens": 118008345.0, "reward": 0.8930783271789551, "reward_std": 0.041308917105197906, "rewards/reward_func/mean": 0.8930783271789551, "rewards/reward_func/std": 0.04130890965461731, "step": 4285, "step_time": 22.150147527456284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 187.875, "completions/mean_terminated_length": 187.875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.3802003040909767, "epoch": 0.19851783232978232, "frac_reward_zero_std": 1.0, "grad_norm": 0.004996567498892546, "kl": 0.004184939316473901, "learning_rate": 9.603056970819824e-07, "loss": 0.0002, "num_tokens": 118042711.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4286, "step_time": 27.316251508891582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 150.75, "completions/mean_terminated_length": 150.75, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.22020679339766502, "epoch": 0.1985641500694766, "frac_reward_zero_std": 1.0, "grad_norm": 0.005396071821451187, "kl": 0.003406070638448, "learning_rate": 9.602964335340435e-07, "loss": 0.0002, "num_tokens": 118066323.0, "reward": 0.8307302594184875, "reward_std": 0.0, "rewards/reward_func/mean": 0.8307302594184875, "rewards/reward_func/std": 0.0, "step": 4287, "step_time": 19.11954002082348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 137.375, "completions/mean_terminated_length": 137.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.39390766620635986, "epoch": 0.1986104678091709, "frac_reward_zero_std": 1.0, "grad_norm": 0.003806658321991563, "kl": 0.0030758902430534363, "learning_rate": 9.602871699861046e-07, "loss": 0.0002, "num_tokens": 118088249.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4288, "step_time": 19.577602609992027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 171.0, "completions/mean_terminated_length": 171.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.13845118507742882, "epoch": 0.1986567855488652, "frac_reward_zero_std": 1.0, "grad_norm": 0.004882005508989096, "kl": 0.0030589258822146803, "learning_rate": 9.602779064381657e-07, "loss": 0.0002, "num_tokens": 118110521.0, "reward": 0.73319411277771, "reward_std": 0.0, "rewards/reward_func/mean": 0.73319411277771, "rewards/reward_func/std": 0.0, "step": 4289, "step_time": 22.829368107020855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 140.625, "completions/mean_terminated_length": 140.625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.42518218606710434, "epoch": 0.19870310328855953, "frac_reward_zero_std": 1.0, "grad_norm": 0.003994393162429333, "kl": 0.002797422173898667, "learning_rate": 9.602686428902269e-07, "loss": 0.0001, "num_tokens": 118133203.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4290, "step_time": 18.251724045723677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 174.6875, "completions/mean_terminated_length": 174.6875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.2062804326415062, "epoch": 0.19874942102825383, "frac_reward_zero_std": 1.0, "grad_norm": 0.008033553138375282, "kl": 0.006925027701072395, "learning_rate": 9.60259379342288e-07, "loss": 0.0003, "num_tokens": 118154910.0, "reward": 0.939104437828064, "reward_std": 0.0, "rewards/reward_func/mean": 0.939104437828064, "rewards/reward_func/std": 0.0, "step": 4291, "step_time": 20.61775129288435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 233.75, "completions/mean_terminated_length": 233.75, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.5162595435976982, "epoch": 0.19879573876794812, "frac_reward_zero_std": 0.0, "grad_norm": 0.11483185738325119, "kl": 0.00691199628636241, "learning_rate": 9.602501157943493e-07, "loss": 0.2097, "num_tokens": 118179930.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 4292, "step_time": 39.865657422691584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 177.5625, "completions/mean_terminated_length": 177.5625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.33295440673828125, "epoch": 0.19884205650764242, "frac_reward_zero_std": 1.0, "grad_norm": 0.012061888352036476, "kl": 0.01105513609945774, "learning_rate": 9.602408522464105e-07, "loss": 0.0006, "num_tokens": 118200195.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4293, "step_time": 22.23841020464897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 140.0, "completions/mean_terminated_length": 140.0, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.19111380353569984, "epoch": 0.19888837424733674, "frac_reward_zero_std": 1.0, "grad_norm": 0.004515485372394323, "kl": 0.0043804405722767115, "learning_rate": 9.602315886984714e-07, "loss": 0.0002, "num_tokens": 118233011.0, "reward": 0.7177659273147583, "reward_std": 0.0, "rewards/reward_func/mean": 0.7177659273147583, "rewards/reward_func/std": 0.0, "step": 4294, "step_time": 18.93245917931199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 177.5625, "completions/mean_terminated_length": 177.5625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.35970911383628845, "epoch": 0.19893469198703104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026350398547947407, "kl": 0.0023667019268032163, "learning_rate": 9.602223251505325e-07, "loss": 0.0001, "num_tokens": 118262076.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4295, "step_time": 27.931419048458338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 192.375, "completions/mean_terminated_length": 192.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.24056771770119667, "epoch": 0.19898100972672533, "frac_reward_zero_std": 1.0, "grad_norm": 0.006367878057062626, "kl": 0.005521641462109983, "learning_rate": 9.602130616025938e-07, "loss": 0.0003, "num_tokens": 118286562.0, "reward": 0.558035135269165, "reward_std": 0.0, "rewards/reward_func/mean": 0.558035135269165, "rewards/reward_func/std": 0.0, "step": 4296, "step_time": 21.315611243247986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 148.3125, "completions/mean_terminated_length": 148.3125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.261702336370945, "epoch": 0.19902732746641963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036970695946365595, "kl": 0.0020516463846433908, "learning_rate": 9.60203798054655e-07, "loss": 0.0001, "num_tokens": 118310279.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4297, "step_time": 20.674363385885954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 177.375, "completions/mean_terminated_length": 177.375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4102034866809845, "epoch": 0.19907364520611395, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024717217311263084, "kl": 0.002613342134281993, "learning_rate": 9.60194534506716e-07, "loss": 0.0001, "num_tokens": 118337693.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4298, "step_time": 22.854691732674837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 161.375, "completions/mean_terminated_length": 161.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3602997586131096, "epoch": 0.19911996294580825, "frac_reward_zero_std": 1.0, "grad_norm": 0.002583717694506049, "kl": 0.0021408379543572664, "learning_rate": 9.601852709587772e-07, "loss": 0.0001, "num_tokens": 118392483.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4299, "step_time": 27.734892047941685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 145.625, "completions/mean_terminated_length": 145.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3382541611790657, "epoch": 0.19916628068550254, "frac_reward_zero_std": 1.0, "grad_norm": 0.007224246393889189, "kl": 0.004065761575475335, "learning_rate": 9.601760074108383e-07, "loss": 0.0002, "num_tokens": 118414717.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4300, "step_time": 19.81400351598859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.16925596073269844, "epoch": 0.19921259842519684, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036267966497689486, "kl": 0.02466302504763007, "learning_rate": 9.601667438628995e-07, "loss": 0.0012, "num_tokens": 118438681.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4301, "step_time": 25.158739805221558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 150.0, "completions/mean_terminated_length": 150.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.27261241525411606, "epoch": 0.19925891616489116, "frac_reward_zero_std": 1.0, "grad_norm": 0.013136201538145542, "kl": 0.008726644096896052, "learning_rate": 9.601574803149606e-07, "loss": 0.0004, "num_tokens": 118460665.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4302, "step_time": 18.342788469046354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 117.0625, "completions/mean_terminated_length": 117.0625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2522359751164913, "epoch": 0.19930523390458546, "frac_reward_zero_std": 1.0, "grad_norm": 0.007597236428409815, "kl": 0.003091784135904163, "learning_rate": 9.601482167670217e-07, "loss": 0.0002, "num_tokens": 118480314.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4303, "step_time": 14.608827654272318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 134.8125, "completions/mean_terminated_length": 134.8125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.17270654812455177, "epoch": 0.19935155164427976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020094928331673145, "kl": 0.0017358693294227123, "learning_rate": 9.601389532190828e-07, "loss": 0.0001, "num_tokens": 118511879.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 4304, "step_time": 20.30519315227866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 137.75, "completions/mean_terminated_length": 137.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.2364879846572876, "epoch": 0.19939786938397405, "frac_reward_zero_std": 1.0, "grad_norm": 0.003375881351530552, "kl": 0.0024673426523804665, "learning_rate": 9.60129689671144e-07, "loss": 0.0001, "num_tokens": 118531699.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4305, "step_time": 15.670146342366934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 354.125, "completions/mean_terminated_length": 354.125, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "entropy": 0.25742775574326515, "epoch": 0.19944418712366838, "frac_reward_zero_std": 0.0, "grad_norm": 0.14799560606479645, "kl": 0.026925077196210623, "learning_rate": 9.60120426123205e-07, "loss": 0.0024, "num_tokens": 118561205.0, "reward": 0.9811591506004333, "reward_std": 0.013905213214457035, "rewards/reward_func/mean": 0.9811591506004333, "rewards/reward_func/std": 0.013905220665037632, "step": 4306, "step_time": 36.99573115259409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 186.1875, "completions/mean_terminated_length": 186.1875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.17282568290829659, "epoch": 0.19949050486336267, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029926139395684004, "kl": 0.004836944048292935, "learning_rate": 9.601111625752662e-07, "loss": 0.0002, "num_tokens": 118584632.0, "reward": 0.8503032922744751, "reward_std": 0.0, "rewards/reward_func/mean": 0.8503032922744751, "rewards/reward_func/std": 0.0, "step": 4307, "step_time": 22.417905122041702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 192.5625, "completions/mean_terminated_length": 192.5625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.38263826817274094, "epoch": 0.19953682260305697, "frac_reward_zero_std": 1.0, "grad_norm": 0.0086146779358387, "kl": 0.005638763657771051, "learning_rate": 9.601018990273273e-07, "loss": 0.0003, "num_tokens": 118608785.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4308, "step_time": 23.658910185098648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 163.375, "completions/mean_terminated_length": 163.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.2452923022210598, "epoch": 0.19958314034275126, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016304061282426119, "kl": 0.001445953967049718, "learning_rate": 9.600926354793887e-07, "loss": 0.0001, "num_tokens": 118636439.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 4309, "step_time": 21.38775284215808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 223.25, "completions/mean_terminated_length": 223.25, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.45355185866355896, "epoch": 0.1996294580824456, "frac_reward_zero_std": 0.0, "grad_norm": 0.008887724950909615, "kl": 0.00968296267092228, "learning_rate": 9.600833719314498e-07, "loss": 0.0003, "num_tokens": 118677355.0, "reward": 7.646224275958957e-08, "reward_std": 1.4227086353457707e-07, "rewards/reward_func/mean": 7.646224275958957e-08, "rewards/reward_func/std": 1.422708777454318e-07, "step": 4310, "step_time": 40.341128807514906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 176.9375, "completions/mean_terminated_length": 176.9375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.16954953595995903, "epoch": 0.19967577582213988, "frac_reward_zero_std": 1.0, "grad_norm": 0.003977600950747728, "kl": 0.004344969056546688, "learning_rate": 9.60074108383511e-07, "loss": 0.0002, "num_tokens": 118729914.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4311, "step_time": 27.7505673058331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 174.375, "completions/mean_terminated_length": 174.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.1930382288992405, "epoch": 0.19972209356183418, "frac_reward_zero_std": 0.0, "grad_norm": 0.1841389536857605, "kl": 0.02196095557883382, "learning_rate": 9.600648448355718e-07, "loss": 0.0097, "num_tokens": 118752560.0, "reward": 0.4496094584465027, "reward_std": 0.026650357991456985, "rewards/reward_func/mean": 0.4496094584465027, "rewards/reward_func/std": 0.026650357991456985, "step": 4312, "step_time": 20.353506673127413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 236.1875, "completions/mean_terminated_length": 236.1875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.3484003394842148, "epoch": 0.19976841130152848, "frac_reward_zero_std": 0.0, "grad_norm": 0.09551838785409927, "kl": 0.02796410722658038, "learning_rate": 9.600555812876332e-07, "loss": -0.061, "num_tokens": 118785635.0, "reward": 0.4152009189128876, "reward_std": 0.47523757815361023, "rewards/reward_func/mean": 0.4152009189128876, "rewards/reward_func/std": 0.47523754835128784, "step": 4313, "step_time": 30.679507791996002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 177.4375, "completions/mean_terminated_length": 177.4375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.14813685044646263, "epoch": 0.1998147290412228, "frac_reward_zero_std": 1.0, "grad_norm": 0.001104956492781639, "kl": 0.0009701547678560019, "learning_rate": 9.600463177396943e-07, "loss": 0.0, "num_tokens": 118814586.0, "reward": 0.8507331609725952, "reward_std": 0.0, "rewards/reward_func/mean": 0.8507331609725952, "rewards/reward_func/std": 0.0, "step": 4314, "step_time": 21.842204809188843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 119.6875, "completions/mean_terminated_length": 119.6875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2730370759963989, "epoch": 0.1998610467809171, "frac_reward_zero_std": 1.0, "grad_norm": 0.00297934515401721, "kl": 0.0021234832529444247, "learning_rate": 9.600370541917554e-07, "loss": 0.0001, "num_tokens": 118835605.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4315, "step_time": 14.282845091074705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 126.0625, "completions/mean_terminated_length": 126.0625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3452801778912544, "epoch": 0.1999073645206114, "frac_reward_zero_std": 1.0, "grad_norm": 0.00466137146577239, "kl": 0.0026993047795258462, "learning_rate": 9.600277906438165e-07, "loss": 0.0001, "num_tokens": 118856294.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4316, "step_time": 16.659909810870886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 220.3125, "completions/mean_terminated_length": 220.3125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.4068012908101082, "epoch": 0.1999536822603057, "frac_reward_zero_std": 0.0, "grad_norm": 0.10009792447090149, "kl": 0.009737064130604267, "learning_rate": 9.600185270958777e-07, "loss": -0.0476, "num_tokens": 118883243.0, "reward": 0.39948156476020813, "reward_std": 0.46782490611076355, "rewards/reward_func/mean": 0.39948156476020813, "rewards/reward_func/std": 0.46782493591308594, "step": 4317, "step_time": 27.1250514164567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 169.6875, "completions/mean_terminated_length": 169.6875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.3841549754142761, "epoch": 0.2, "frac_reward_zero_std": 0.0, "grad_norm": 0.15948259830474854, "kl": 0.012854799628257751, "learning_rate": 9.600092635479388e-07, "loss": 0.0265, "num_tokens": 118906550.0, "reward": 0.018789635971188545, "reward_std": 0.05163368210196495, "rewards/reward_func/mean": 0.018789635971188545, "rewards/reward_func/std": 0.05163368210196495, "step": 4318, "step_time": 20.691959884017706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 142.3125, "completions/mean_terminated_length": 142.3125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.33237239718437195, "epoch": 0.2000463177396943, "frac_reward_zero_std": 1.0, "grad_norm": 0.004691860172897577, "kl": 0.002636209363117814, "learning_rate": 9.6e-07, "loss": 0.0001, "num_tokens": 118933531.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4319, "step_time": 18.774765387177467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 193.125, "completions/mean_terminated_length": 193.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.18039793521165848, "epoch": 0.2000926354793886, "frac_reward_zero_std": 0.0, "grad_norm": 0.11167694628238678, "kl": 0.033629335928708315, "learning_rate": 9.59990736452061e-07, "loss": -0.0231, "num_tokens": 118955085.0, "reward": 0.7709895968437195, "reward_std": 0.19339685142040253, "rewards/reward_func/mean": 0.7709895968437195, "rewards/reward_func/std": 0.19339686632156372, "step": 4320, "step_time": 23.29052422195673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 174.0625, "completions/mean_terminated_length": 174.0625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.42348533123731613, "epoch": 0.2001389532190829, "frac_reward_zero_std": 1.0, "grad_norm": 0.005849068984389305, "kl": 0.004332504118792713, "learning_rate": 9.599814729041222e-07, "loss": 0.0002, "num_tokens": 118990238.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4321, "step_time": 25.806898567825556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 189.6875, "completions/mean_terminated_length": 189.6875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.18591520190238953, "epoch": 0.20018527095877722, "frac_reward_zero_std": 1.0, "grad_norm": 0.004633432254195213, "kl": 0.009485263377428055, "learning_rate": 9.599722093561833e-07, "loss": 0.0005, "num_tokens": 119023737.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4322, "step_time": 24.49805849790573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 122.5625, "completions/mean_terminated_length": 122.5625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2583474703133106, "epoch": 0.20023158869847152, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035095608327537775, "kl": 0.0019000690372195095, "learning_rate": 9.599629458082446e-07, "loss": 0.0001, "num_tokens": 119043394.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4323, "step_time": 16.022602926939726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 137.3125, "completions/mean_terminated_length": 137.3125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.35340263694524765, "epoch": 0.20027790643816581, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029389699921011925, "kl": 0.001835564646171406, "learning_rate": 9.599536822603058e-07, "loss": 0.0001, "num_tokens": 119079287.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4324, "step_time": 20.98586842417717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 158.0625, "completions/mean_terminated_length": 158.0625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.19614377617835999, "epoch": 0.2003242241778601, "frac_reward_zero_std": 0.0, "grad_norm": 0.17011243104934692, "kl": 0.007219140185043216, "learning_rate": 9.599444187123667e-07, "loss": 0.0188, "num_tokens": 119103032.0, "reward": 0.8365928530693054, "reward_std": 0.11378215998411179, "rewards/reward_func/mean": 0.8365928530693054, "rewards/reward_func/std": 0.11378216743469238, "step": 4325, "step_time": 20.398106019943953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 120.875, "completions/mean_terminated_length": 120.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.32871055603027344, "epoch": 0.20037054191755443, "frac_reward_zero_std": 1.0, "grad_norm": 0.004803883843123913, "kl": 0.0029782846686430275, "learning_rate": 9.59935155164428e-07, "loss": 0.0001, "num_tokens": 119123830.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4326, "step_time": 15.204549200832844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.3127734512090683, "epoch": 0.20041685965724873, "frac_reward_zero_std": 1.0, "grad_norm": 0.012301845476031303, "kl": 0.014960448257625103, "learning_rate": 9.599258916164891e-07, "loss": 0.0008, "num_tokens": 119147470.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4327, "step_time": 21.793963704258204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 148.375, "completions/mean_terminated_length": 148.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.34366950392723083, "epoch": 0.20046317739694303, "frac_reward_zero_std": 1.0, "grad_norm": 0.0045435307547450066, "kl": 0.002444214071147144, "learning_rate": 9.599166280685502e-07, "loss": 0.0001, "num_tokens": 119174916.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4328, "step_time": 20.995104853063822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.15307361632585526, "epoch": 0.20050949513663732, "frac_reward_zero_std": 0.0, "grad_norm": 0.15147282183170319, "kl": 0.0153071487438865, "learning_rate": 9.599073645206114e-07, "loss": -0.0684, "num_tokens": 119199324.0, "reward": 0.7627211809158325, "reward_std": 0.14264582097530365, "rewards/reward_func/mean": 0.7627211809158325, "rewards/reward_func/std": 0.14264583587646484, "step": 4329, "step_time": 21.754540774971247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 252.125, "completions/mean_terminated_length": 252.125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.2038777731359005, "epoch": 0.20055581287633165, "frac_reward_zero_std": 1.0, "grad_norm": 0.004737113602459431, "kl": 0.004034957790281624, "learning_rate": 9.598981009726725e-07, "loss": 0.0002, "num_tokens": 119223262.0, "reward": 0.8434853553771973, "reward_std": 0.0, "rewards/reward_func/mean": 0.8434853553771973, "rewards/reward_func/std": 0.0, "step": 4330, "step_time": 27.322276193648577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 239.25, "completions/mean_terminated_length": 239.25, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "entropy": 0.2365478202700615, "epoch": 0.20060213061602594, "frac_reward_zero_std": 1.0, "grad_norm": 0.002932611620053649, "kl": 0.0027366644062567502, "learning_rate": 9.598888374247336e-07, "loss": 0.0001, "num_tokens": 119250018.0, "reward": 0.7398260831832886, "reward_std": 0.0, "rewards/reward_func/mean": 0.7398260831832886, "rewards/reward_func/std": 0.0, "step": 4331, "step_time": 28.320982787758112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 175.1875, "completions/mean_terminated_length": 175.1875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.3801484704017639, "epoch": 0.20064844835572024, "frac_reward_zero_std": 0.0, "grad_norm": 0.1591929942369461, "kl": 0.0030830000760033727, "learning_rate": 9.598795738767947e-07, "loss": -0.1124, "num_tokens": 119276725.0, "reward": 0.11765605211257935, "reward_std": 0.3222125172615051, "rewards/reward_func/mean": 0.11765605211257935, "rewards/reward_func/std": 0.3222125172615051, "step": 4332, "step_time": 24.597198083996773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 152.9375, "completions/mean_terminated_length": 152.9375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2738218382000923, "epoch": 0.20069476609541453, "frac_reward_zero_std": 1.0, "grad_norm": 0.012202922254800797, "kl": 0.0170272181276232, "learning_rate": 9.598703103288559e-07, "loss": 0.0009, "num_tokens": 119297284.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 4333, "step_time": 19.582966059446335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 202.125, "completions/mean_terminated_length": 202.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.23936771973967552, "epoch": 0.20074108383510886, "frac_reward_zero_std": 0.0, "grad_norm": 0.10019609332084656, "kl": 0.0017584140587132424, "learning_rate": 9.59861046780917e-07, "loss": -0.0591, "num_tokens": 119330070.0, "reward": 0.4562724530696869, "reward_std": 0.49524882435798645, "rewards/reward_func/mean": 0.4562724530696869, "rewards/reward_func/std": 0.49524882435798645, "step": 4334, "step_time": 25.87820515036583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 222.375, "completions/mean_terminated_length": 222.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.256301824003458, "epoch": 0.20078740157480315, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033000006806105375, "kl": 0.002873494871892035, "learning_rate": 9.598517832329781e-07, "loss": 0.0001, "num_tokens": 119356956.0, "reward": 0.3219582736492157, "reward_std": 0.0, "rewards/reward_func/mean": 0.3219582736492157, "rewards/reward_func/std": 0.0, "step": 4335, "step_time": 26.77712243050337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 259.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.2784839943051338, "epoch": 0.20083371931449745, "frac_reward_zero_std": 0.0, "grad_norm": 0.06832694262266159, "kl": 0.02920991904102266, "learning_rate": 9.598425196850395e-07, "loss": 0.0009, "num_tokens": 119379798.0, "reward": 0.944659948348999, "reward_std": 0.22136029601097107, "rewards/reward_func/mean": 0.944659948348999, "rewards/reward_func/std": 0.22136031091213226, "step": 4336, "step_time": 30.97024843469262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 139.25, "completions/mean_terminated_length": 139.25, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.30177614092826843, "epoch": 0.20088003705419175, "frac_reward_zero_std": 1.0, "grad_norm": 0.003893825225532055, "kl": 0.0022443159541580826, "learning_rate": 9.598332561371004e-07, "loss": 0.0001, "num_tokens": 119403354.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4337, "step_time": 17.57817819342017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 159.5625, "completions/mean_terminated_length": 159.5625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.18841608986258507, "epoch": 0.20092635479388607, "frac_reward_zero_std": 1.0, "grad_norm": 0.003976137842983007, "kl": 0.002540547284297645, "learning_rate": 9.598239925891615e-07, "loss": 0.0001, "num_tokens": 119429459.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 4338, "step_time": 20.13024763390422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 194.125, "completions/mean_terminated_length": 194.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.37682662159204483, "epoch": 0.20097267253358037, "frac_reward_zero_std": 0.0, "grad_norm": 0.1389477550983429, "kl": 0.009646297781728208, "learning_rate": 9.598147290412228e-07, "loss": -0.0543, "num_tokens": 119460325.0, "reward": 0.12000277638435364, "reward_std": 0.32823479175567627, "rewards/reward_func/mean": 0.12000277638435364, "rewards/reward_func/std": 0.32823479175567627, "step": 4339, "step_time": 24.533297430723906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 134.25, "completions/mean_terminated_length": 134.25, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.24463994428515434, "epoch": 0.20101899027327466, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019469514954835176, "kl": 0.001599260256625712, "learning_rate": 9.59805465493284e-07, "loss": 0.0001, "num_tokens": 119482361.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4340, "step_time": 16.351763870567083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 192.3125, "completions/mean_terminated_length": 192.3125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.3754243478178978, "epoch": 0.20106530801296896, "frac_reward_zero_std": 1.0, "grad_norm": 0.005679195746779442, "kl": 0.004411848145537078, "learning_rate": 9.59796201945345e-07, "loss": 0.0002, "num_tokens": 119513550.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4341, "step_time": 23.29446402937174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 259.25, "completions/mean_terminated_length": 259.25, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.2867109067738056, "epoch": 0.20111162575266328, "frac_reward_zero_std": 0.0, "grad_norm": 0.12100539356470108, "kl": 0.01235551736317575, "learning_rate": 9.597869383974062e-07, "loss": -0.0355, "num_tokens": 119552994.0, "reward": 0.6384425759315491, "reward_std": 0.2656787037849426, "rewards/reward_func/mean": 0.6384425759315491, "rewards/reward_func/std": 0.2656787037849426, "step": 4342, "step_time": 35.454101640731096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 159.3125, "completions/mean_terminated_length": 159.3125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.25620218738913536, "epoch": 0.20115794349235758, "frac_reward_zero_std": 1.0, "grad_norm": 0.003614285262301564, "kl": 0.0031151602743193507, "learning_rate": 9.597776748494673e-07, "loss": 0.0002, "num_tokens": 119573831.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4343, "step_time": 18.78634275868535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 125.5625, "completions/mean_terminated_length": 125.5625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.2814929857850075, "epoch": 0.20120426123205187, "frac_reward_zero_std": 1.0, "grad_norm": 0.002326444024220109, "kl": 0.0017608084890525788, "learning_rate": 9.597684113015285e-07, "loss": 0.0001, "num_tokens": 119594688.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4344, "step_time": 15.133714221417904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 162.0625, "completions/mean_terminated_length": 162.0625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3477717563509941, "epoch": 0.20125057897174617, "frac_reward_zero_std": 0.0, "grad_norm": 0.13272856175899506, "kl": 0.011347837978973985, "learning_rate": 9.597591477535896e-07, "loss": -0.0993, "num_tokens": 119617457.0, "reward": 0.09390753507614136, "reward_std": 0.12521004676818848, "rewards/reward_func/mean": 0.09390753507614136, "rewards/reward_func/std": 0.12521004676818848, "step": 4345, "step_time": 23.084840770810843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 205.25, "completions/mean_terminated_length": 205.25, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.24499881267547607, "epoch": 0.2012968967114405, "frac_reward_zero_std": 0.0, "grad_norm": 0.1363125443458557, "kl": 0.008282593917101622, "learning_rate": 9.597498842056507e-07, "loss": -0.0143, "num_tokens": 119644821.0, "reward": 0.5969303250312805, "reward_std": 0.04848959296941757, "rewards/reward_func/mean": 0.5969303250312805, "rewards/reward_func/std": 0.04848960041999817, "step": 4346, "step_time": 24.36744337901473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 333.1875, "completions/mean_terminated_length": 333.1875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.3103319816291332, "epoch": 0.2013432144511348, "frac_reward_zero_std": 0.0, "grad_norm": 0.08856040984392166, "kl": 0.01259826822206378, "learning_rate": 9.597406206577118e-07, "loss": 0.24, "num_tokens": 119672584.0, "reward": 0.3125, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.3125, "rewards/reward_func/std": 0.4787135720252991, "step": 4347, "step_time": 51.124805852770805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 205.625, "completions/mean_terminated_length": 205.625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.19548982754349709, "epoch": 0.20138953219082908, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020060925744473934, "kl": 0.0017750268161762506, "learning_rate": 9.59731357109773e-07, "loss": 0.0001, "num_tokens": 119709186.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 4348, "step_time": 26.622994769364595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 128.125, "completions/mean_terminated_length": 128.125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.32591327279806137, "epoch": 0.20143584993052338, "frac_reward_zero_std": 1.0, "grad_norm": 0.001764351618476212, "kl": 0.0015986388607416302, "learning_rate": 9.59722093561834e-07, "loss": 0.0001, "num_tokens": 119731092.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4349, "step_time": 17.040066741406918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 159.0, "completions/mean_terminated_length": 159.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.26894331723451614, "epoch": 0.2014821676702177, "frac_reward_zero_std": 1.0, "grad_norm": 0.005290125031024218, "kl": 0.0035272493842057884, "learning_rate": 9.597128300138952e-07, "loss": 0.0002, "num_tokens": 119752404.0, "reward": 0.904837429523468, "reward_std": 0.0, "rewards/reward_func/mean": 0.904837429523468, "rewards/reward_func/std": 0.0, "step": 4350, "step_time": 19.064855866134167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 149.9375, "completions/mean_terminated_length": 149.9375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3027791827917099, "epoch": 0.201528485409912, "frac_reward_zero_std": 1.0, "grad_norm": 0.002835450926795602, "kl": 0.0028183605172671378, "learning_rate": 9.597035664659563e-07, "loss": 0.0001, "num_tokens": 119783843.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4351, "step_time": 20.90618012100458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 181.25, "completions/mean_terminated_length": 181.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.15881448611617088, "epoch": 0.2015748031496063, "frac_reward_zero_std": 0.0, "grad_norm": 0.168456569314003, "kl": 0.0038807641831226647, "learning_rate": 9.596943029180175e-07, "loss": -0.0727, "num_tokens": 119808903.0, "reward": 0.9300388693809509, "reward_std": 0.027310028672218323, "rewards/reward_func/mean": 0.9300388693809509, "rewards/reward_func/std": 0.027310030534863472, "step": 4352, "step_time": 24.106820344924927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 106.6875, "completions/mean_terminated_length": 106.6875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.28208357840776443, "epoch": 0.2016211208893006, "frac_reward_zero_std": 1.0, "grad_norm": 0.008142439648509026, "kl": 0.0031564083765260875, "learning_rate": 9.596850393700788e-07, "loss": 0.0002, "num_tokens": 119828738.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4353, "step_time": 15.13072595745325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 232.9375, "completions/mean_terminated_length": 232.9375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.387161947786808, "epoch": 0.20166743862899492, "frac_reward_zero_std": 0.0, "grad_norm": 0.08558937162160873, "kl": 0.012177588418126106, "learning_rate": 9.5967577582214e-07, "loss": 0.0151, "num_tokens": 119866625.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.44721361994743347, "step": 4354, "step_time": 30.575301326811314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 138.5, "completions/mean_terminated_length": 138.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.30372022092342377, "epoch": 0.2017137563686892, "frac_reward_zero_std": 1.0, "grad_norm": 0.007324842736124992, "kl": 0.004446553764864802, "learning_rate": 9.596665122742008e-07, "loss": 0.0002, "num_tokens": 119886409.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4355, "step_time": 17.15417054668069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 176.25, "completions/mean_terminated_length": 176.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3980333209037781, "epoch": 0.2017600741083835, "frac_reward_zero_std": 1.0, "grad_norm": 0.00380044081248343, "kl": 0.003151686047203839, "learning_rate": 9.596572487262622e-07, "loss": 0.0002, "num_tokens": 119926557.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4356, "step_time": 24.9662903547287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 120.0, "completions/mean_terminated_length": 120.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2784702852368355, "epoch": 0.2018063918480778, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031887576915323734, "kl": 0.0020782564824912697, "learning_rate": 9.596479851783233e-07, "loss": 0.0001, "num_tokens": 119946253.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4357, "step_time": 14.472920812666416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 291.6875, "completions/mean_terminated_length": 291.6875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 0.1992134526371956, "epoch": 0.20185270958777213, "frac_reward_zero_std": 1.0, "grad_norm": 0.00861821137368679, "kl": 0.009224783629179, "learning_rate": 9.596387216303844e-07, "loss": 0.0005, "num_tokens": 119971800.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4358, "step_time": 35.20111673697829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 184.1875, "completions/mean_terminated_length": 184.1875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.21076007932424545, "epoch": 0.20189902732746642, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014462699182331562, "kl": 0.0011636416893452406, "learning_rate": 9.596294580824455e-07, "loss": 0.0001, "num_tokens": 120006475.0, "reward": 0.11362193524837494, "reward_std": 0.0, "rewards/reward_func/mean": 0.11362193524837494, "rewards/reward_func/std": 0.0, "step": 4359, "step_time": 23.57875981926918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 180.875, "completions/mean_terminated_length": 180.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.26818672940135, "epoch": 0.20194534506716072, "frac_reward_zero_std": 1.0, "grad_norm": 0.006156525108963251, "kl": 0.0054788870038464665, "learning_rate": 9.596201945345067e-07, "loss": 0.0003, "num_tokens": 120028025.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4360, "step_time": 23.37025962397456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 149.6875, "completions/mean_terminated_length": 149.6875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.22371386364102364, "epoch": 0.20199166280685502, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031889823731034994, "kl": 0.0017451727471780032, "learning_rate": 9.596109309865678e-07, "loss": 0.0001, "num_tokens": 120048308.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4361, "step_time": 18.897894211113453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 165.75, "completions/mean_terminated_length": 165.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.1724872589111328, "epoch": 0.20203798054654934, "frac_reward_zero_std": 0.0, "grad_norm": 0.16300824284553528, "kl": 0.041608518455177546, "learning_rate": 9.59601667438629e-07, "loss": -0.0355, "num_tokens": 120069360.0, "reward": 0.6697071194648743, "reward_std": 0.2915422022342682, "rewards/reward_func/mean": 0.6697071194648743, "rewards/reward_func/std": 0.2915422320365906, "step": 4362, "step_time": 18.362178031355143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 178.75, "completions/mean_terminated_length": 178.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.1738715022802353, "epoch": 0.20208429828624364, "frac_reward_zero_std": 1.0, "grad_norm": 0.0047481004148721695, "kl": 0.013067428022623062, "learning_rate": 9.5959240389069e-07, "loss": 0.0007, "num_tokens": 120106428.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4363, "step_time": 23.938401725143194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 164.4375, "completions/mean_terminated_length": 164.4375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.16497112810611725, "epoch": 0.20213061602593793, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018727611750364304, "kl": 0.002182774478569627, "learning_rate": 9.595831403427512e-07, "loss": 0.0001, "num_tokens": 120127795.0, "reward": 0.9131007194519043, "reward_std": 0.0, "rewards/reward_func/mean": 0.9131007194519043, "rewards/reward_func/std": 0.0, "step": 4364, "step_time": 18.761870093643665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 136.875, "completions/mean_terminated_length": 136.875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2960905507206917, "epoch": 0.20217693376563223, "frac_reward_zero_std": 1.0, "grad_norm": 0.004074579104781151, "kl": 0.002672590548172593, "learning_rate": 9.595738767948123e-07, "loss": 0.0001, "num_tokens": 120149601.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4365, "step_time": 18.973962906748056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 269.6875, "completions/mean_terminated_length": 269.6875, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.29117827117443085, "epoch": 0.20222325150532655, "frac_reward_zero_std": 0.0, "grad_norm": 0.08026869595050812, "kl": 0.013203104259446263, "learning_rate": 9.595646132468736e-07, "loss": -0.0996, "num_tokens": 120182924.0, "reward": 0.42102187871932983, "reward_std": 0.41016829013824463, "rewards/reward_func/mean": 0.42102187871932983, "rewards/reward_func/std": 0.410168319940567, "step": 4366, "step_time": 36.51097435876727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 138.5625, "completions/mean_terminated_length": 138.5625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.32694901525974274, "epoch": 0.20226956924502085, "frac_reward_zero_std": 1.0, "grad_norm": 0.00745930103585124, "kl": 0.005590903921984136, "learning_rate": 9.595553496989348e-07, "loss": 0.0003, "num_tokens": 120203861.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4367, "step_time": 18.598782904446125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 274.4375, "completions/mean_terminated_length": 274.4375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.27239735424518585, "epoch": 0.20231588698471514, "frac_reward_zero_std": 0.0, "grad_norm": 0.07638388127088547, "kl": 0.011131863575428724, "learning_rate": 9.595460861509957e-07, "loss": 0.0218, "num_tokens": 120243100.0, "reward": 0.9975948929786682, "reward_std": 0.009620373137295246, "rewards/reward_func/mean": 0.9975948929786682, "rewards/reward_func/std": 0.009620368480682373, "step": 4368, "step_time": 34.29851580038667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 151.0625, "completions/mean_terminated_length": 151.0625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.2635408937931061, "epoch": 0.20236220472440944, "frac_reward_zero_std": 1.0, "grad_norm": 0.002637336030602455, "kl": 0.0023205436300486326, "learning_rate": 9.59536822603057e-07, "loss": 0.0001, "num_tokens": 120263405.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4369, "step_time": 17.772504441440105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 190.3125, "completions/mean_terminated_length": 190.3125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.4073800668120384, "epoch": 0.20240852246410376, "frac_reward_zero_std": 1.0, "grad_norm": 0.00430545536801219, "kl": 0.0041122655384242535, "learning_rate": 9.595275590551181e-07, "loss": 0.0002, "num_tokens": 120305474.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4370, "step_time": 27.753872349858284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 136.6875, "completions/mean_terminated_length": 136.6875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.21981431543827057, "epoch": 0.20245484020379806, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020968785975128412, "kl": 0.0012202737852931023, "learning_rate": 9.595182955071793e-07, "loss": 0.0001, "num_tokens": 120325773.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4371, "step_time": 18.225017122924328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 128.6875, "completions/mean_terminated_length": 128.6875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.25608067214488983, "epoch": 0.20250115794349235, "frac_reward_zero_std": 1.0, "grad_norm": 0.00358181307092309, "kl": 0.002328541479073465, "learning_rate": 9.595090319592404e-07, "loss": 0.0001, "num_tokens": 120348792.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4372, "step_time": 17.058714006096125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 186.9375, "completions/mean_terminated_length": 186.9375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3649733439087868, "epoch": 0.20254747568318665, "frac_reward_zero_std": 0.0, "grad_norm": 0.1176580861210823, "kl": 0.00996137852780521, "learning_rate": 9.594997684113015e-07, "loss": -0.033, "num_tokens": 120370519.0, "reward": 0.05787256732583046, "reward_std": 0.23149026930332184, "rewards/reward_func/mean": 0.05787256732583046, "rewards/reward_func/std": 0.23149026930332184, "step": 4373, "step_time": 22.45996080338955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 149.1875, "completions/mean_terminated_length": 149.1875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.4010501429438591, "epoch": 0.20259379342288097, "frac_reward_zero_std": 1.0, "grad_norm": 0.002867297036573291, "kl": 0.0027788946172222495, "learning_rate": 9.594905048633626e-07, "loss": 0.0001, "num_tokens": 120423290.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4374, "step_time": 25.786354899406433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 155.8125, "completions/mean_terminated_length": 155.8125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.30683937668800354, "epoch": 0.20264011116257527, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026754315476864576, "kl": 0.002054195530945435, "learning_rate": 9.594812413154238e-07, "loss": 0.0001, "num_tokens": 120453703.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4375, "step_time": 21.604269791394472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 197.25, "completions/mean_terminated_length": 197.25, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.20628655701875687, "epoch": 0.20268642890226957, "frac_reward_zero_std": 1.0, "grad_norm": 0.010255440138280392, "kl": 0.009761344874277711, "learning_rate": 9.594719777674849e-07, "loss": 0.0005, "num_tokens": 120475643.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4376, "step_time": 21.476807940751314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 131.5, "completions/mean_terminated_length": 131.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.23010990768671036, "epoch": 0.20273274664196386, "frac_reward_zero_std": 1.0, "grad_norm": 0.007033254019916058, "kl": 0.00605582888238132, "learning_rate": 9.59462714219546e-07, "loss": 0.0003, "num_tokens": 120503939.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4377, "step_time": 18.441113721579313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 199.0625, "completions/mean_terminated_length": 199.0625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.2417679913341999, "epoch": 0.2027790643816582, "frac_reward_zero_std": 1.0, "grad_norm": 0.003662134986370802, "kl": 0.00411995907779783, "learning_rate": 9.594534506716071e-07, "loss": 0.0002, "num_tokens": 120531300.0, "reward": 0.7577395439147949, "reward_std": 0.0, "rewards/reward_func/mean": 0.7577395439147949, "rewards/reward_func/std": 0.0, "step": 4378, "step_time": 26.69351141527295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 146.0, "completions/mean_terminated_length": 146.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.16164137795567513, "epoch": 0.20282538212135248, "frac_reward_zero_std": 1.0, "grad_norm": 0.006584150716662407, "kl": 0.003201333398465067, "learning_rate": 9.594441871236685e-07, "loss": 0.0002, "num_tokens": 120551668.0, "reward": 0.904837429523468, "reward_std": 0.0, "rewards/reward_func/mean": 0.904837429523468, "rewards/reward_func/std": 0.0, "step": 4379, "step_time": 18.956845924258232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 219.4375, "completions/mean_terminated_length": 219.4375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "entropy": 0.29696741700172424, "epoch": 0.20287169986104678, "frac_reward_zero_std": 1.0, "grad_norm": 0.014850565232336521, "kl": 0.017927177250385284, "learning_rate": 9.594349235757294e-07, "loss": 0.0009, "num_tokens": 120573419.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4380, "step_time": 23.3524604216218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 157.5, "completions/mean_terminated_length": 157.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.35661375522613525, "epoch": 0.20291801760074107, "frac_reward_zero_std": 1.0, "grad_norm": 0.014806061051785946, "kl": 0.013286509085446596, "learning_rate": 9.594256600277905e-07, "loss": 0.0007, "num_tokens": 120598387.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4381, "step_time": 19.596401181071997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 164.875, "completions/mean_terminated_length": 164.875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.33992859721183777, "epoch": 0.2029643353404354, "frac_reward_zero_std": 1.0, "grad_norm": 0.00979616492986679, "kl": 0.007574495393782854, "learning_rate": 9.594163964798516e-07, "loss": 0.0004, "num_tokens": 120620113.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4382, "step_time": 19.149428606033325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 128.5625, "completions/mean_terminated_length": 128.5625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2978258281946182, "epoch": 0.2030106530801297, "frac_reward_zero_std": 1.0, "grad_norm": 0.005817455239593983, "kl": 0.003650028840638697, "learning_rate": 9.59407132931913e-07, "loss": 0.0002, "num_tokens": 120641290.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4383, "step_time": 17.43656163290143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 114.8125, "completions/mean_terminated_length": 114.8125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.27465222030878067, "epoch": 0.203056970819824, "frac_reward_zero_std": 1.0, "grad_norm": 0.002690394874662161, "kl": 0.0017183998716063797, "learning_rate": 9.59397869383974e-07, "loss": 0.0001, "num_tokens": 120660855.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4384, "step_time": 14.342736564576626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 199.6875, "completions/mean_terminated_length": 199.6875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.2033468410372734, "epoch": 0.20310328855951829, "frac_reward_zero_std": 1.0, "grad_norm": 0.011636612005531788, "kl": 0.008831958984956145, "learning_rate": 9.593886058360352e-07, "loss": 0.0004, "num_tokens": 120682306.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4385, "step_time": 21.862970259040594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 156.5625, "completions/mean_terminated_length": 156.5625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.1484076865017414, "epoch": 0.2031496062992126, "frac_reward_zero_std": 1.0, "grad_norm": 0.004241873510181904, "kl": 0.0027250301791355014, "learning_rate": 9.593793422880963e-07, "loss": 0.0001, "num_tokens": 120704619.0, "reward": 0.951229453086853, "reward_std": 0.0, "rewards/reward_func/mean": 0.951229453086853, "rewards/reward_func/std": 0.0, "step": 4386, "step_time": 21.26169503480196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 136.6875, "completions/mean_terminated_length": 136.6875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2334960661828518, "epoch": 0.2031959240389069, "frac_reward_zero_std": 1.0, "grad_norm": 0.003352184547111392, "kl": 0.0019840349268633872, "learning_rate": 9.593700787401575e-07, "loss": 0.0001, "num_tokens": 120724310.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4387, "step_time": 16.824626177549362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 111.25, "completions/mean_terminated_length": 111.25, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2790839448571205, "epoch": 0.2032422417786012, "frac_reward_zero_std": 1.0, "grad_norm": 0.002758890390396118, "kl": 0.001983508700504899, "learning_rate": 9.593608151922186e-07, "loss": 0.0001, "num_tokens": 120744026.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4388, "step_time": 14.307689350098372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 218.9375, "completions/mean_terminated_length": 218.9375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.17713632434606552, "epoch": 0.2032885595182955, "frac_reward_zero_std": 1.0, "grad_norm": 0.003758077509701252, "kl": 0.003175067831762135, "learning_rate": 9.593515516442797e-07, "loss": 0.0002, "num_tokens": 120773817.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4389, "step_time": 25.04056917130947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 210.25, "completions/mean_terminated_length": 210.25, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.18416766449809074, "epoch": 0.20333487725798982, "frac_reward_zero_std": 1.0, "grad_norm": 0.02285883016884327, "kl": 0.0019181863754056394, "learning_rate": 9.593422880963408e-07, "loss": 0.0001, "num_tokens": 120809213.0, "reward": 0.951229453086853, "reward_std": 0.0, "rewards/reward_func/mean": 0.951229453086853, "rewards/reward_func/std": 0.0, "step": 4390, "step_time": 26.845209203660488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 130.6875, "completions/mean_terminated_length": 130.6875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.30053722113370895, "epoch": 0.20338119499768412, "frac_reward_zero_std": 1.0, "grad_norm": 0.003516019554808736, "kl": 0.002620826126076281, "learning_rate": 9.59333024548402e-07, "loss": 0.0001, "num_tokens": 120838696.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4391, "step_time": 18.624075073748827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 180.875, "completions/mean_terminated_length": 180.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.434813991189003, "epoch": 0.2034275127373784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037418357096612453, "kl": 0.00333327054977417, "learning_rate": 9.59323761000463e-07, "loss": 0.0002, "num_tokens": 120866854.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4392, "step_time": 22.500424940139055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 156.1875, "completions/mean_terminated_length": 156.1875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.22748274356126785, "epoch": 0.2034738304770727, "frac_reward_zero_std": 1.0, "grad_norm": 0.002320576226338744, "kl": 0.0017496794753242284, "learning_rate": 9.593144974525242e-07, "loss": 0.0001, "num_tokens": 120893721.0, "reward": 0.9131007194519043, "reward_std": 0.0, "rewards/reward_func/mean": 0.9131007194519043, "rewards/reward_func/std": 0.0, "step": 4393, "step_time": 18.4317070171237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 220.3125, "completions/mean_terminated_length": 220.3125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.24937263131141663, "epoch": 0.20352014821676703, "frac_reward_zero_std": 0.0, "grad_norm": 0.09236271679401398, "kl": 0.006573219550773501, "learning_rate": 9.593052339045853e-07, "loss": -0.0487, "num_tokens": 120917390.0, "reward": 0.2537243962287903, "reward_std": 0.065545953810215, "rewards/reward_func/mean": 0.2537243962287903, "rewards/reward_func/std": 0.065545953810215, "step": 4394, "step_time": 28.78863863646984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 209.125, "completions/mean_terminated_length": 209.125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.17656762152910233, "epoch": 0.20356646595646133, "frac_reward_zero_std": 1.0, "grad_norm": 0.0057213762775063515, "kl": 0.04755854792892933, "learning_rate": 9.592959703566465e-07, "loss": 0.0024, "num_tokens": 120955376.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4395, "step_time": 27.040158040821552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 116.5, "completions/mean_terminated_length": 116.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.2675146237015724, "epoch": 0.20361278369615562, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023586463648825884, "kl": 0.0017603501328267157, "learning_rate": 9.592867068087078e-07, "loss": 0.0001, "num_tokens": 120978520.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4396, "step_time": 16.30543616786599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 189.9375, "completions/mean_terminated_length": 189.9375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.18880921602249146, "epoch": 0.20365910143584992, "frac_reward_zero_std": 1.0, "grad_norm": 0.007340978365391493, "kl": 0.005150568729732186, "learning_rate": 9.59277443260769e-07, "loss": 0.0003, "num_tokens": 121006727.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4397, "step_time": 22.45252402871847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 218.3125, "completions/mean_terminated_length": 218.3125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.15946005657315254, "epoch": 0.20370541917554424, "frac_reward_zero_std": 0.0, "grad_norm": 0.1293276995420456, "kl": 0.03771508112549782, "learning_rate": 9.592681797128298e-07, "loss": -0.0428, "num_tokens": 121037068.0, "reward": 0.9918040037155151, "reward_std": 0.01466143038123846, "rewards/reward_func/mean": 0.9918040037155151, "rewards/reward_func/std": 0.014661417342722416, "step": 4398, "step_time": 26.476797968149185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.4311412423849106, "epoch": 0.20375173691523854, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017392414156347513, "kl": 0.002208363323006779, "learning_rate": 9.592589161648912e-07, "loss": 0.0001, "num_tokens": 121087188.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4399, "step_time": 25.522712852805853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 153.6875, "completions/mean_terminated_length": 153.6875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.15880463644862175, "epoch": 0.20379805465493284, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029165507294237614, "kl": 0.0038405574741773307, "learning_rate": 9.592496526169523e-07, "loss": 0.0002, "num_tokens": 121110943.0, "reward": 0.8337529301643372, "reward_std": 0.0, "rewards/reward_func/mean": 0.8337529301643372, "rewards/reward_func/std": 0.0, "step": 4400, "step_time": 18.827107544988394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 182.1875, "completions/mean_terminated_length": 182.1875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.1975412853062153, "epoch": 0.20384437239462713, "frac_reward_zero_std": 0.0, "grad_norm": 0.07105456292629242, "kl": 0.0011213233519811183, "learning_rate": 9.592403890690134e-07, "loss": 0.0113, "num_tokens": 121141874.0, "reward": 0.8560765981674194, "reward_std": 0.038379572331905365, "rewards/reward_func/mean": 0.8560765981674194, "rewards/reward_func/std": 0.03837956488132477, "step": 4401, "step_time": 23.41580117121339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 143.625, "completions/mean_terminated_length": 143.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.18176880106329918, "epoch": 0.20389069013432146, "frac_reward_zero_std": 0.0, "grad_norm": 0.13676771521568298, "kl": 0.0044325272319838405, "learning_rate": 9.592311255210746e-07, "loss": -0.0306, "num_tokens": 121163100.0, "reward": 0.884931206703186, "reward_std": 0.09087805449962616, "rewards/reward_func/mean": 0.884931206703186, "rewards/reward_func/std": 0.09087805449962616, "step": 4402, "step_time": 16.363803543150425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.449375756084919, "epoch": 0.20393700787401575, "frac_reward_zero_std": 0.0, "grad_norm": 0.11944127082824707, "kl": 0.0065049066906794906, "learning_rate": 9.592218619731357e-07, "loss": -0.1757, "num_tokens": 121196910.0, "reward": 0.002497798530384898, "reward_std": 0.005370105616748333, "rewards/reward_func/mean": 0.002497798530384898, "rewards/reward_func/std": 0.005370105616748333, "step": 4403, "step_time": 30.80251520872116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 146.4375, "completions/mean_terminated_length": 146.4375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3295118361711502, "epoch": 0.20398332561371005, "frac_reward_zero_std": 1.0, "grad_norm": 0.003453849582001567, "kl": 0.0028061120538040996, "learning_rate": 9.592125984251968e-07, "loss": 0.0001, "num_tokens": 121218069.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4404, "step_time": 19.674273550510406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 203.1875, "completions/mean_terminated_length": 203.1875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.44458911567926407, "epoch": 0.20402964335340434, "frac_reward_zero_std": 1.0, "grad_norm": 0.003419496351853013, "kl": 0.003455746453255415, "learning_rate": 9.59203334877258e-07, "loss": 0.0002, "num_tokens": 121241576.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4405, "step_time": 24.537865091115236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 180.0625, "completions/mean_terminated_length": 180.0625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.3620297908782959, "epoch": 0.20407596109309867, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036682465579360723, "kl": 0.0029127378948032856, "learning_rate": 9.59194071329319e-07, "loss": 0.0001, "num_tokens": 121263641.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4406, "step_time": 20.35015856102109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 291.875, "completions/mean_terminated_length": 291.875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "entropy": 0.1944119967520237, "epoch": 0.20412227883279296, "frac_reward_zero_std": 0.0, "grad_norm": 0.07078786939382553, "kl": 0.0031040938338264823, "learning_rate": 9.591848077813802e-07, "loss": -0.0229, "num_tokens": 121300679.0, "reward": 0.5942342281341553, "reward_std": 0.12828125059604645, "rewards/reward_func/mean": 0.5942342281341553, "rewards/reward_func/std": 0.12828125059604645, "step": 4407, "step_time": 33.402572583407164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 300.4375, "completions/mean_terminated_length": 300.4375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.41832730174064636, "epoch": 0.20416859657248726, "frac_reward_zero_std": 0.0, "grad_norm": 0.08045594394207001, "kl": 0.011987762525677681, "learning_rate": 9.591755442334413e-07, "loss": -0.2775, "num_tokens": 121341694.0, "reward": 0.42598751187324524, "reward_std": 0.4988654851913452, "rewards/reward_func/mean": 0.42598751187324524, "rewards/reward_func/std": 0.4988655149936676, "step": 4408, "step_time": 45.18706896901131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 158.5, "completions/mean_terminated_length": 158.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.17929061502218246, "epoch": 0.20421491431218156, "frac_reward_zero_std": 1.0, "grad_norm": 0.010240025818347931, "kl": 0.025862340815365314, "learning_rate": 9.591662806855026e-07, "loss": 0.0013, "num_tokens": 121363094.0, "reward": 0.894839346408844, "reward_std": 0.0, "rewards/reward_func/mean": 0.894839346408844, "rewards/reward_func/std": 0.0, "step": 4409, "step_time": 21.9126313701272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 226.1875, "completions/mean_terminated_length": 226.1875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.31503815203905106, "epoch": 0.20426123205187588, "frac_reward_zero_std": 0.0, "grad_norm": 0.11403704434633255, "kl": 0.005874601716641337, "learning_rate": 9.591570171375638e-07, "loss": 0.0202, "num_tokens": 121409081.0, "reward": 0.8776825666427612, "reward_std": 0.33423489332199097, "rewards/reward_func/mean": 0.8776825666427612, "rewards/reward_func/std": 0.33423489332199097, "step": 4410, "step_time": 29.934467788785696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 231.3125, "completions/mean_terminated_length": 231.3125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.4675322622060776, "epoch": 0.20430754979157018, "frac_reward_zero_std": 0.0, "grad_norm": 0.12299149483442307, "kl": 0.006061073509044945, "learning_rate": 9.591477535896247e-07, "loss": 0.1042, "num_tokens": 121433182.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 4411, "step_time": 28.184160079807043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 151.5, "completions/mean_terminated_length": 151.5, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3533034175634384, "epoch": 0.20435386753126447, "frac_reward_zero_std": 1.0, "grad_norm": 0.003036539303138852, "kl": 0.002608212234918028, "learning_rate": 9.591384900416858e-07, "loss": 0.0001, "num_tokens": 121469414.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4412, "step_time": 23.193986248224974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 194.3125, "completions/mean_terminated_length": 194.3125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.4515443667769432, "epoch": 0.20440018527095877, "frac_reward_zero_std": 1.0, "grad_norm": 0.011975867673754692, "kl": 0.00841202971059829, "learning_rate": 9.591292264937471e-07, "loss": 0.0004, "num_tokens": 121491451.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4413, "step_time": 22.457041680812836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 233.3125, "completions/mean_terminated_length": 233.3125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.3064805567264557, "epoch": 0.2044465030106531, "frac_reward_zero_std": 0.0, "grad_norm": 0.1058402955532074, "kl": 0.024243885651230812, "learning_rate": 9.591199629458083e-07, "loss": -0.0399, "num_tokens": 121515696.0, "reward": 0.5222058892250061, "reward_std": 0.363617479801178, "rewards/reward_func/mean": 0.5222058892250061, "rewards/reward_func/std": 0.36361750960350037, "step": 4414, "step_time": 26.59234295785427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 172.5, "completions/mean_terminated_length": 172.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4450515806674957, "epoch": 0.2044928207503474, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030375299975275993, "kl": 0.0031095953891053796, "learning_rate": 9.591106993978694e-07, "loss": 0.0002, "num_tokens": 121550088.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4415, "step_time": 25.035363737493753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.25684917718172073, "epoch": 0.20453913849004168, "frac_reward_zero_std": 1.0, "grad_norm": 0.001872918801382184, "kl": 0.0014846816484350711, "learning_rate": 9.591014358499305e-07, "loss": 0.0001, "num_tokens": 121571486.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4416, "step_time": 16.55630274116993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2973170876502991, "epoch": 0.20458545622973598, "frac_reward_zero_std": 1.0, "grad_norm": 0.0056549739092588425, "kl": 0.002468589402269572, "learning_rate": 9.590921723019916e-07, "loss": 0.0001, "num_tokens": 121594534.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4417, "step_time": 16.745481088757515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 232.5625, "completions/mean_terminated_length": 232.5625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "entropy": 0.22854237258434296, "epoch": 0.2046317739694303, "frac_reward_zero_std": 1.0, "grad_norm": 0.0073995441198349, "kl": 0.004934438620693982, "learning_rate": 9.590829087540528e-07, "loss": 0.0002, "num_tokens": 121633423.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4418, "step_time": 29.776594549417496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3942725211381912, "epoch": 0.2046780917091246, "frac_reward_zero_std": 1.0, "grad_norm": 0.015917306765913963, "kl": 0.008968879468739033, "learning_rate": 9.590736452061139e-07, "loss": 0.0005, "num_tokens": 121657847.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4419, "step_time": 21.063758365809917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 158.0625, "completions/mean_terminated_length": 158.0625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.19162816181778908, "epoch": 0.2047244094488189, "frac_reward_zero_std": 1.0, "grad_norm": 0.00394805520772934, "kl": 0.0029846797697246075, "learning_rate": 9.59064381658175e-07, "loss": 0.0001, "num_tokens": 121678472.0, "reward": 0.8702397346496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.8702397346496582, "rewards/reward_func/std": 0.0, "step": 4420, "step_time": 19.00661974772811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 174.4375, "completions/mean_terminated_length": 174.4375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.2606373205780983, "epoch": 0.2047707271885132, "frac_reward_zero_std": 1.0, "grad_norm": 0.004122025799006224, "kl": 0.0044886431423947215, "learning_rate": 9.590551181102361e-07, "loss": 0.0002, "num_tokens": 121701439.0, "reward": 0.31414684653282166, "reward_std": 0.0, "rewards/reward_func/mean": 0.31414684653282166, "rewards/reward_func/std": 0.0, "step": 4421, "step_time": 24.56281067058444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 179.5, "completions/mean_terminated_length": 179.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.20721253752708435, "epoch": 0.20481704492820751, "frac_reward_zero_std": 1.0, "grad_norm": 0.004045045934617519, "kl": 0.0024959520087577403, "learning_rate": 9.590458545622973e-07, "loss": 0.0001, "num_tokens": 121738759.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4422, "step_time": 24.778950180858374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 147.4375, "completions/mean_terminated_length": 147.4375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.2940957322716713, "epoch": 0.2048633626679018, "frac_reward_zero_std": 1.0, "grad_norm": 0.0042904941365122795, "kl": 0.002092456095851958, "learning_rate": 9.590365910143584e-07, "loss": 0.0001, "num_tokens": 121766334.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4423, "step_time": 19.694414857774973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 220.1875, "completions/mean_terminated_length": 220.1875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.38973531872034073, "epoch": 0.2049096804075961, "frac_reward_zero_std": 0.0, "grad_norm": 0.1366364061832428, "kl": 0.014282828662544489, "learning_rate": 9.590273274664195e-07, "loss": -0.0137, "num_tokens": 121788817.0, "reward": 0.15132294595241547, "reward_std": 0.3253345489501953, "rewards/reward_func/mean": 0.15132294595241547, "rewards/reward_func/std": 0.3253345489501953, "step": 4424, "step_time": 27.229420375078917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 142.25, "completions/mean_terminated_length": 142.25, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.18300453945994377, "epoch": 0.2049559981472904, "frac_reward_zero_std": 1.0, "grad_norm": 0.002770983148366213, "kl": 0.0026137664972338825, "learning_rate": 9.590180639184806e-07, "loss": 0.0001, "num_tokens": 121816197.0, "reward": 0.9428731203079224, "reward_std": 0.0, "rewards/reward_func/mean": 0.9428731203079224, "rewards/reward_func/std": 0.0, "step": 4425, "step_time": 20.217567820101976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 162.625, "completions/mean_terminated_length": 162.625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.3745696395635605, "epoch": 0.20500231588698473, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027576652355492115, "kl": 0.0023471819586120546, "learning_rate": 9.59008800370542e-07, "loss": 0.0001, "num_tokens": 121854255.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4426, "step_time": 22.342611964792013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 227.4375, "completions/mean_terminated_length": 227.4375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.37403204292058945, "epoch": 0.20504863362667902, "frac_reward_zero_std": 0.0, "grad_norm": 0.14391353726387024, "kl": 0.02081705490127206, "learning_rate": 9.58999536822603e-07, "loss": -0.1185, "num_tokens": 121881622.0, "reward": 0.4375, "reward_std": 0.5123475193977356, "rewards/reward_func/mean": 0.4375, "rewards/reward_func/std": 0.5123475790023804, "step": 4427, "step_time": 34.336918134242296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 176.6875, "completions/mean_terminated_length": 176.6875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.3281799256801605, "epoch": 0.20509495136637332, "frac_reward_zero_std": 0.0, "grad_norm": 0.13997496664524078, "kl": 0.0050591142498888075, "learning_rate": 9.589902732746642e-07, "loss": -0.04, "num_tokens": 121912241.0, "reward": 0.6342861652374268, "reward_std": 0.4144529700279236, "rewards/reward_func/mean": 0.6342861652374268, "rewards/reward_func/std": 0.41445299983024597, "step": 4428, "step_time": 24.334660917520523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 207.0625, "completions/mean_terminated_length": 207.0625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.40722689777612686, "epoch": 0.20514126910606761, "frac_reward_zero_std": 1.0, "grad_norm": 0.09576158970594406, "kl": 0.017346865322906524, "learning_rate": 9.589810097267253e-07, "loss": 0.0008, "num_tokens": 121942034.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4429, "step_time": 26.369808048009872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 183.5, "completions/mean_terminated_length": 183.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4024273753166199, "epoch": 0.20518758684576194, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029543309938162565, "kl": 0.0025500337942503393, "learning_rate": 9.589717461787865e-07, "loss": 0.0001, "num_tokens": 121966858.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4430, "step_time": 23.2380328476429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 177.8125, "completions/mean_terminated_length": 177.8125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.37463711202144623, "epoch": 0.20523390458545623, "frac_reward_zero_std": 1.0, "grad_norm": 0.014829622581601143, "kl": 0.008776698727160692, "learning_rate": 9.589624826308476e-07, "loss": 0.0004, "num_tokens": 121989991.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4431, "step_time": 19.534911889582872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 157.625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3643925115466118, "epoch": 0.20528022232515053, "frac_reward_zero_std": 1.0, "grad_norm": 0.006356745958328247, "kl": 0.004867341020144522, "learning_rate": 9.589532190829087e-07, "loss": 0.0002, "num_tokens": 122016609.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4432, "step_time": 20.41070855781436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 139.0, "completions/mean_terminated_length": 139.0, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2526129074394703, "epoch": 0.20532654006484483, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018666721880435944, "kl": 0.0015373661299236119, "learning_rate": 9.589439555349698e-07, "loss": 0.0001, "num_tokens": 122037409.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4433, "step_time": 17.788786247372627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 223.375, "completions/mean_terminated_length": 223.375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.4167468100786209, "epoch": 0.20537285780453915, "frac_reward_zero_std": 0.0, "grad_norm": 0.13366052508354187, "kl": 0.015270611038431525, "learning_rate": 9.58934691987031e-07, "loss": -0.0291, "num_tokens": 122060023.0, "reward": 0.12705472111701965, "reward_std": 0.15925611555576324, "rewards/reward_func/mean": 0.12705472111701965, "rewards/reward_func/std": 0.15925611555576324, "step": 4434, "step_time": 24.816624749451876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 120.0, "completions/mean_terminated_length": 120.0, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2214369997382164, "epoch": 0.20541917554423345, "frac_reward_zero_std": 1.0, "grad_norm": 0.005897964350879192, "kl": 0.002670010202564299, "learning_rate": 9.58925428439092e-07, "loss": 0.0001, "num_tokens": 122079367.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4435, "step_time": 14.795745197683573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 345.875, "completions/mean_terminated_length": 345.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.23983752354979515, "epoch": 0.20546549328392774, "frac_reward_zero_std": 0.0, "grad_norm": 0.08564194291830063, "kl": 0.006884301896207035, "learning_rate": 9.589161648911532e-07, "loss": -0.027, "num_tokens": 122120389.0, "reward": 0.8980263471603394, "reward_std": 0.10627995431423187, "rewards/reward_func/mean": 0.8980263471603394, "rewards/reward_func/std": 0.10627996176481247, "step": 4436, "step_time": 44.15879878401756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 207.25, "completions/mean_terminated_length": 207.25, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.1434022542089224, "epoch": 0.20551181102362204, "frac_reward_zero_std": 1.0, "grad_norm": 0.010433053597807884, "kl": 0.005398858338594437, "learning_rate": 9.589069013432143e-07, "loss": 0.0003, "num_tokens": 122144441.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4437, "step_time": 22.41115650907159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 135.125, "completions/mean_terminated_length": 135.125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2852722741663456, "epoch": 0.20555812876331636, "frac_reward_zero_std": 1.0, "grad_norm": 0.004220884758979082, "kl": 0.0028225001879036427, "learning_rate": 9.588976377952755e-07, "loss": 0.0001, "num_tokens": 122172059.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4438, "step_time": 17.71949626505375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 124.6875, "completions/mean_terminated_length": 124.6875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.25068046152591705, "epoch": 0.20560444650301066, "frac_reward_zero_std": 1.0, "grad_norm": 0.003623596392571926, "kl": 0.0022179295192472637, "learning_rate": 9.588883742473368e-07, "loss": 0.0001, "num_tokens": 122192678.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4439, "step_time": 14.5108128413558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.31101303547620773, "epoch": 0.20565076424270495, "frac_reward_zero_std": 0.0, "grad_norm": 0.10069181025028229, "kl": 0.02580390591174364, "learning_rate": 9.58879110699398e-07, "loss": -0.1711, "num_tokens": 122225856.0, "reward": 0.5459882020950317, "reward_std": 0.4171529710292816, "rewards/reward_func/mean": 0.5459882020950317, "rewards/reward_func/std": 0.417153000831604, "step": 4440, "step_time": 36.09346652403474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 131.9375, "completions/mean_terminated_length": 131.9375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3213750869035721, "epoch": 0.20569708198239925, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022330954670906067, "kl": 0.0018797389639075845, "learning_rate": 9.588698471514588e-07, "loss": 0.0001, "num_tokens": 122245503.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4441, "step_time": 16.572161994874477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 138.625, "completions/mean_terminated_length": 138.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3204464539885521, "epoch": 0.20574339972209357, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026541727129369974, "kl": 0.001986485061934218, "learning_rate": 9.5886058360352e-07, "loss": 0.0001, "num_tokens": 122266489.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4442, "step_time": 18.283427093178034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 315.25, "completions/mean_terminated_length": 315.25, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "entropy": 0.2674109488725662, "epoch": 0.20578971746178787, "frac_reward_zero_std": 1.0, "grad_norm": 0.00503187533468008, "kl": 0.016199012519791722, "learning_rate": 9.588513200555813e-07, "loss": 0.0008, "num_tokens": 122299581.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4443, "step_time": 35.6751859895885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 119.25, "completions/mean_terminated_length": 119.25, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.2758583277463913, "epoch": 0.20583603520148216, "frac_reward_zero_std": 1.0, "grad_norm": 0.003130985889583826, "kl": 0.001854931062553078, "learning_rate": 9.588420565076424e-07, "loss": 0.0001, "num_tokens": 122319953.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4444, "step_time": 15.612978029996157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 187.6875, "completions/mean_terminated_length": 187.6875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.40260956436395645, "epoch": 0.20588235294117646, "frac_reward_zero_std": 1.0, "grad_norm": 0.008119883015751839, "kl": 0.005284482031129301, "learning_rate": 9.588327929597036e-07, "loss": 0.0003, "num_tokens": 122347340.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4445, "step_time": 23.433291722089052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 149.1875, "completions/mean_terminated_length": 149.1875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.23468093946576118, "epoch": 0.20592867068087078, "frac_reward_zero_std": 0.0, "grad_norm": 0.19637228548526764, "kl": 0.013076600152999163, "learning_rate": 9.588235294117647e-07, "loss": 0.0845, "num_tokens": 122367967.0, "reward": 0.6438281536102295, "reward_std": 0.28493744134902954, "rewards/reward_func/mean": 0.6438281536102295, "rewards/reward_func/std": 0.28493744134902954, "step": 4446, "step_time": 18.772852525115013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 115.75, "completions/mean_terminated_length": 115.75, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.23649794608354568, "epoch": 0.20597498842056508, "frac_reward_zero_std": 1.0, "grad_norm": 0.009086528792977333, "kl": 0.003549997869413346, "learning_rate": 9.588142658638258e-07, "loss": 0.0002, "num_tokens": 122387083.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4447, "step_time": 13.733539011329412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 234.6875, "completions/mean_terminated_length": 234.6875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.3815370425581932, "epoch": 0.20602130616025938, "frac_reward_zero_std": 1.0, "grad_norm": 0.009284532628953457, "kl": 0.00850689155049622, "learning_rate": 9.58805002315887e-07, "loss": 0.0004, "num_tokens": 122408422.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4448, "step_time": 34.92842309176922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 162.25, "completions/mean_terminated_length": 162.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.16739467531442642, "epoch": 0.20606762389995367, "frac_reward_zero_std": 0.0, "grad_norm": 0.07969807088375092, "kl": 0.0027272370643913746, "learning_rate": 9.58795738767948e-07, "loss": 0.0051, "num_tokens": 122432554.0, "reward": 0.9014118909835815, "reward_std": 0.02629014663398266, "rewards/reward_func/mean": 0.9014118909835815, "rewards/reward_func/std": 0.026290163397789, "step": 4449, "step_time": 19.270759791135788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 168.375, "completions/mean_terminated_length": 168.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.17201659083366394, "epoch": 0.206113941639648, "frac_reward_zero_std": 1.0, "grad_norm": 0.005692894570529461, "kl": 0.0036805764539167285, "learning_rate": 9.587864752200092e-07, "loss": 0.0002, "num_tokens": 122464800.0, "reward": 0.964915931224823, "reward_std": 0.0, "rewards/reward_func/mean": 0.964915931224823, "rewards/reward_func/std": 0.0, "step": 4450, "step_time": 22.921984735876322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 130.5, "completions/mean_terminated_length": 130.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.32443325221538544, "epoch": 0.2061602593793423, "frac_reward_zero_std": 1.0, "grad_norm": 0.003495546290650964, "kl": 0.0022235424839891493, "learning_rate": 9.587772116720703e-07, "loss": 0.0001, "num_tokens": 122487272.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4451, "step_time": 16.174052443355322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 214.0625, "completions/mean_terminated_length": 214.0625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.38778331875801086, "epoch": 0.2062065771190366, "frac_reward_zero_std": 0.0, "grad_norm": 0.1335936337709427, "kl": 0.014234495349228382, "learning_rate": 9.587679481241314e-07, "loss": -0.0268, "num_tokens": 122520505.0, "reward": 0.03813140466809273, "reward_std": 0.09032773971557617, "rewards/reward_func/mean": 0.03813140466809273, "rewards/reward_func/std": 0.09032774716615677, "step": 4452, "step_time": 26.763722702860832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 154.1875, "completions/mean_terminated_length": 154.1875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.347697950899601, "epoch": 0.20625289485873088, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033534392714500427, "kl": 0.0034794610110111535, "learning_rate": 9.587586845761928e-07, "loss": 0.0002, "num_tokens": 122540764.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4453, "step_time": 19.237523522228003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 131.9375, "completions/mean_terminated_length": 131.9375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.2027127742767334, "epoch": 0.2062992125984252, "frac_reward_zero_std": 1.0, "grad_norm": 0.004297061823308468, "kl": 0.002365048392675817, "learning_rate": 9.587494210282537e-07, "loss": 0.0001, "num_tokens": 122560331.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4454, "step_time": 15.467692468315363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 178.4375, "completions/mean_terminated_length": 178.4375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.22610027343034744, "epoch": 0.2063455303381195, "frac_reward_zero_std": 1.0, "grad_norm": 0.012379267252981663, "kl": 0.009055770700797439, "learning_rate": 9.587401574803148e-07, "loss": 0.0005, "num_tokens": 122581730.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4455, "step_time": 19.87071193009615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 214.3125, "completions/mean_terminated_length": 214.3125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.33952800184488297, "epoch": 0.2063918480778138, "frac_reward_zero_std": 0.0, "grad_norm": 0.10389196127653122, "kl": 0.027087272610515356, "learning_rate": 9.587308939323761e-07, "loss": -0.0913, "num_tokens": 122605719.0, "reward": 0.4375, "reward_std": 0.5123475193977356, "rewards/reward_func/mean": 0.4375, "rewards/reward_func/std": 0.5123475790023804, "step": 4456, "step_time": 26.03425007686019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 211.9375, "completions/mean_terminated_length": 211.9375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.184910599142313, "epoch": 0.2064381658175081, "frac_reward_zero_std": 0.0, "grad_norm": 0.09489887207746506, "kl": 0.02374209789559245, "learning_rate": 9.587216303844373e-07, "loss": -0.0467, "num_tokens": 122630902.0, "reward": 0.7882111072540283, "reward_std": 0.16943113505840302, "rewards/reward_func/mean": 0.7882111072540283, "rewards/reward_func/std": 0.1694311648607254, "step": 4457, "step_time": 27.780203569680452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 124.5, "completions/mean_terminated_length": 124.5, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2856317386031151, "epoch": 0.20648448355720242, "frac_reward_zero_std": 1.0, "grad_norm": 0.004135873634368181, "kl": 0.002742825250606984, "learning_rate": 9.587123668364984e-07, "loss": 0.0001, "num_tokens": 122652542.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4458, "step_time": 15.610228545963764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 227.5, "completions/mean_terminated_length": 227.5, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.17769036814570427, "epoch": 0.20653080129689672, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034338689874857664, "kl": 0.0032599986298009753, "learning_rate": 9.587031032885595e-07, "loss": 0.0002, "num_tokens": 122681958.0, "reward": 0.9489824771881104, "reward_std": 0.0, "rewards/reward_func/mean": 0.9489824771881104, "rewards/reward_func/std": 0.0, "step": 4459, "step_time": 27.415871299803257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 129.8125, "completions/mean_terminated_length": 129.8125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.27154625207185745, "epoch": 0.206577119036591, "frac_reward_zero_std": 1.0, "grad_norm": 0.002972422167658806, "kl": 0.0022357859706971794, "learning_rate": 9.586938397406206e-07, "loss": 0.0001, "num_tokens": 122705427.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4460, "step_time": 17.25046358630061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 147.875, "completions/mean_terminated_length": 147.875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.24585303664207458, "epoch": 0.2066234367762853, "frac_reward_zero_std": 0.0, "grad_norm": 0.21688760817050934, "kl": 0.011696155939716846, "learning_rate": 9.586845761926818e-07, "loss": -0.0442, "num_tokens": 122725617.0, "reward": 0.7421827912330627, "reward_std": 0.06671953946352005, "rewards/reward_func/mean": 0.7421827912330627, "rewards/reward_func/std": 0.06671953201293945, "step": 4461, "step_time": 18.486302569508553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 241.8125, "completions/mean_terminated_length": 241.8125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.45322611927986145, "epoch": 0.20666975451597963, "frac_reward_zero_std": 0.0, "grad_norm": 0.11351599544286728, "kl": 0.00714661309029907, "learning_rate": 9.586753126447429e-07, "loss": 0.1097, "num_tokens": 122763326.0, "reward": 0.4375, "reward_std": 0.5123475193977356, "rewards/reward_func/mean": 0.4375, "rewards/reward_func/std": 0.5123475790023804, "step": 4462, "step_time": 34.19764931499958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 192.3125, "completions/mean_terminated_length": 192.3125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.44985658675432205, "epoch": 0.20671607225567393, "frac_reward_zero_std": 0.0, "grad_norm": 0.12578284740447998, "kl": 0.005065404460765421, "learning_rate": 9.58666049096804e-07, "loss": 0.0635, "num_tokens": 122797507.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 4463, "step_time": 25.71725757792592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 187.4375, "completions/mean_terminated_length": 187.4375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.2840280532836914, "epoch": 0.20676238999536822, "frac_reward_zero_std": 1.0, "grad_norm": 0.009209983982145786, "kl": 0.009957100730389357, "learning_rate": 9.586567855488651e-07, "loss": 0.0005, "num_tokens": 122818714.0, "reward": 0.9021315574645996, "reward_std": 0.0, "rewards/reward_func/mean": 0.9021315574645996, "rewards/reward_func/std": 0.0, "step": 4464, "step_time": 23.85198138281703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 240.1875, "completions/mean_terminated_length": 240.1875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "entropy": 0.23530149459838867, "epoch": 0.20680870773506252, "frac_reward_zero_std": 1.0, "grad_norm": 0.0069038900546729565, "kl": 0.00827450561337173, "learning_rate": 9.586475220009263e-07, "loss": 0.0004, "num_tokens": 122842813.0, "reward": 0.9775290489196777, "reward_std": 0.0, "rewards/reward_func/mean": 0.9775290489196777, "rewards/reward_func/std": 0.0, "step": 4465, "step_time": 25.949667435139418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 270.1875, "completions/mean_terminated_length": 270.1875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 0.22654571384191513, "epoch": 0.20685502547475684, "frac_reward_zero_std": 1.0, "grad_norm": 0.006940394174307585, "kl": 0.004943744745105505, "learning_rate": 9.586382584529874e-07, "loss": 0.0002, "num_tokens": 122866544.0, "reward": 0.686811089515686, "reward_std": 0.0, "rewards/reward_func/mean": 0.686811089515686, "rewards/reward_func/std": 0.0, "step": 4466, "step_time": 27.922365579754114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 139.75, "completions/mean_terminated_length": 139.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.32575297355651855, "epoch": 0.20690134321445114, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027997978031635284, "kl": 0.002147680992493406, "learning_rate": 9.586289949050485e-07, "loss": 0.0001, "num_tokens": 122890428.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4467, "step_time": 17.359182715415955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 254.125, "completions/mean_terminated_length": 254.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.20986905321478844, "epoch": 0.20694766095414543, "frac_reward_zero_std": 0.0, "grad_norm": 0.08784881979227066, "kl": 0.004571834113448858, "learning_rate": 9.586197313571096e-07, "loss": 0.0031, "num_tokens": 122914446.0, "reward": 0.7982771396636963, "reward_std": 0.031642720103263855, "rewards/reward_func/mean": 0.7982771396636963, "rewards/reward_func/std": 0.03164271265268326, "step": 4468, "step_time": 26.15432145446539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 150.875, "completions/mean_terminated_length": 150.875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3842119202017784, "epoch": 0.20699397869383973, "frac_reward_zero_std": 1.0, "grad_norm": 0.001620774739421904, "kl": 0.002083538507577032, "learning_rate": 9.58610467809171e-07, "loss": 0.0001, "num_tokens": 122972572.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4469, "step_time": 27.610013004392385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 157.3125, "completions/mean_terminated_length": 157.3125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.19669905677437782, "epoch": 0.20704029643353405, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016642799600958824, "kl": 0.0013919929624535143, "learning_rate": 9.58601204261232e-07, "loss": 0.0001, "num_tokens": 122993361.0, "reward": 0.9131007194519043, "reward_std": 0.0, "rewards/reward_func/mean": 0.9131007194519043, "rewards/reward_func/std": 0.0, "step": 4470, "step_time": 20.828556288033724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 313.0625, "completions/mean_terminated_length": 313.0625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 0.18001729249954224, "epoch": 0.20708661417322835, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019139184150844812, "kl": 0.002082289836835116, "learning_rate": 9.585919407132932e-07, "loss": 0.0001, "num_tokens": 123027490.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4471, "step_time": 35.17898824065924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 202.3125, "completions/mean_terminated_length": 202.3125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.31132566928863525, "epoch": 0.20713293191292265, "frac_reward_zero_std": 0.0, "grad_norm": 0.10304898768663406, "kl": 0.016367219388484955, "learning_rate": 9.585826771653541e-07, "loss": 0.0174, "num_tokens": 123049959.0, "reward": 0.969406247138977, "reward_std": 0.0835980772972107, "rewards/reward_func/mean": 0.969406247138977, "rewards/reward_func/std": 0.08359809219837189, "step": 4472, "step_time": 26.043080002069473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 121.0625, "completions/mean_terminated_length": 121.0625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.32665877044200897, "epoch": 0.20717924965261694, "frac_reward_zero_std": 1.0, "grad_norm": 0.003180100815370679, "kl": 0.0021426203893497586, "learning_rate": 9.585734136174155e-07, "loss": 0.0001, "num_tokens": 123078232.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4473, "step_time": 16.59187662974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 173.875, "completions/mean_terminated_length": 173.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.4231932908296585, "epoch": 0.20722556739231127, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037095234729349613, "kl": 0.0027809487655758858, "learning_rate": 9.585641500694766e-07, "loss": 0.0001, "num_tokens": 123120454.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4474, "step_time": 24.51600181683898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 180.125, "completions/mean_terminated_length": 180.125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.3709437772631645, "epoch": 0.20727188513200556, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024438004475086927, "kl": 0.002327405847609043, "learning_rate": 9.585548865215377e-07, "loss": 0.0001, "num_tokens": 123148056.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4475, "step_time": 23.199911538511515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 123.375, "completions/mean_terminated_length": 123.375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.3465873748064041, "epoch": 0.20731820287169986, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034997561015188694, "kl": 0.00299051619367674, "learning_rate": 9.585456229735989e-07, "loss": 0.0001, "num_tokens": 123172318.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4476, "step_time": 17.120753288269043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 171.8125, "completions/mean_terminated_length": 171.8125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.42620351910591125, "epoch": 0.20736452061139415, "frac_reward_zero_std": 1.0, "grad_norm": 0.006532012950628996, "kl": 0.0044926885748282075, "learning_rate": 9.5853635942566e-07, "loss": 0.0002, "num_tokens": 123209963.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4477, "step_time": 25.134059321135283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 135.4375, "completions/mean_terminated_length": 135.4375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.21477627009153366, "epoch": 0.20741083835108848, "frac_reward_zero_std": 1.0, "grad_norm": 0.002007042523473501, "kl": 0.00134053552756086, "learning_rate": 9.58527095877721e-07, "loss": 0.0001, "num_tokens": 123229570.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4478, "step_time": 16.767207119613886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 140.125, "completions/mean_terminated_length": 140.125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.27937763184309006, "epoch": 0.20745715609078277, "frac_reward_zero_std": 1.0, "grad_norm": 0.005686054937541485, "kl": 0.003374650375917554, "learning_rate": 9.585178323297822e-07, "loss": 0.0002, "num_tokens": 123249396.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4479, "step_time": 17.23304009065032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 132.0, "completions/mean_terminated_length": 132.0, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2886121869087219, "epoch": 0.20750347383047707, "frac_reward_zero_std": 1.0, "grad_norm": 0.003832530463114381, "kl": 0.0024614507565274835, "learning_rate": 9.585085687818434e-07, "loss": 0.0001, "num_tokens": 123269956.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4480, "step_time": 17.470990426838398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 204.625, "completions/mean_terminated_length": 204.625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.2175096571445465, "epoch": 0.20754979157017137, "frac_reward_zero_std": 0.0, "grad_norm": 0.11134587228298187, "kl": 0.027105043176561594, "learning_rate": 9.584993052339045e-07, "loss": -0.0259, "num_tokens": 123295918.0, "reward": 0.7903306484222412, "reward_std": 0.21662232279777527, "rewards/reward_func/mean": 0.7903306484222412, "rewards/reward_func/std": 0.21662232279777527, "step": 4481, "step_time": 23.024387542158365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 191.875, "completions/mean_terminated_length": 191.875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.39290447533130646, "epoch": 0.2075961093098657, "frac_reward_zero_std": 1.0, "grad_norm": 0.021612225100398064, "kl": 0.009134544990956783, "learning_rate": 9.584900416859656e-07, "loss": 0.0005, "num_tokens": 123331532.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4482, "step_time": 26.628085043281317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 204.3125, "completions/mean_terminated_length": 204.3125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.2346363514661789, "epoch": 0.20764242704955999, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025011177640408278, "kl": 0.0022415997518692166, "learning_rate": 9.58480778138027e-07, "loss": 0.0001, "num_tokens": 123369057.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 4483, "step_time": 27.2366630025208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 126.5, "completions/mean_terminated_length": 126.5, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.28307194262742996, "epoch": 0.20768874478925428, "frac_reward_zero_std": 1.0, "grad_norm": 0.002643357031047344, "kl": 0.0018845770973712206, "learning_rate": 9.58471514590088e-07, "loss": 0.0001, "num_tokens": 123395449.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4484, "step_time": 17.27096877992153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 170.625, "completions/mean_terminated_length": 170.625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.16094931587576866, "epoch": 0.20773506252894858, "frac_reward_zero_std": 1.0, "grad_norm": 0.005039875395596027, "kl": 0.004182444128673524, "learning_rate": 9.58462251042149e-07, "loss": 0.0002, "num_tokens": 123418963.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4485, "step_time": 19.453738171607256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 170.0625, "completions/mean_terminated_length": 170.0625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.18804769590497017, "epoch": 0.2077813802686429, "frac_reward_zero_std": 1.0, "grad_norm": 0.007854282855987549, "kl": 0.05850925948470831, "learning_rate": 9.584529874942103e-07, "loss": 0.0029, "num_tokens": 123441428.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4486, "step_time": 23.984460175037384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 139.0, "completions/max_terminated_length": 139.0, "completions/mean_length": 119.3125, "completions/mean_terminated_length": 119.3125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.29421278089284897, "epoch": 0.2078276980083372, "frac_reward_zero_std": 1.0, "grad_norm": 0.004592785611748695, "kl": 0.0025236682558897883, "learning_rate": 9.584437239462714e-07, "loss": 0.0001, "num_tokens": 123463369.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4487, "step_time": 16.386843316257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 365.6875, "completions/mean_terminated_length": 365.6875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "entropy": 0.16563784703612328, "epoch": 0.2078740157480315, "frac_reward_zero_std": 1.0, "grad_norm": 0.003584612160921097, "kl": 0.00325860851444304, "learning_rate": 9.584344603983326e-07, "loss": 0.0002, "num_tokens": 123492132.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4488, "step_time": 36.691088780760765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 148.1875, "completions/mean_terminated_length": 148.1875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.33117473870515823, "epoch": 0.2079203334877258, "frac_reward_zero_std": 0.0, "grad_norm": 0.15245525538921356, "kl": 0.01356105669401586, "learning_rate": 9.584251968503937e-07, "loss": -0.0058, "num_tokens": 123513639.0, "reward": 0.042260896414518356, "reward_std": 0.16904358565807343, "rewards/reward_func/mean": 0.042260896414518356, "rewards/reward_func/std": 0.16904358565807343, "step": 4489, "step_time": 17.898266334086657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 115.0, "completions/mean_terminated_length": 115.0, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.26029717922210693, "epoch": 0.2079666512274201, "frac_reward_zero_std": 1.0, "grad_norm": 0.014157130382955074, "kl": 0.006502093398012221, "learning_rate": 9.584159333024548e-07, "loss": 0.0003, "num_tokens": 123533623.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4490, "step_time": 14.306664638221264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 131.75, "completions/mean_terminated_length": 131.75, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.31367746740579605, "epoch": 0.2080129689671144, "frac_reward_zero_std": 1.0, "grad_norm": 0.00331858335994184, "kl": 0.002797766006551683, "learning_rate": 9.58406669754516e-07, "loss": 0.0001, "num_tokens": 123557107.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4491, "step_time": 17.110287327319384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 138.1875, "completions/mean_terminated_length": 138.1875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.1890847384929657, "epoch": 0.2080592867068087, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013698014663532376, "kl": 0.001124966685893014, "learning_rate": 9.58397406206577e-07, "loss": 0.0001, "num_tokens": 123576918.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4492, "step_time": 15.461940791457891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 161.5, "completions/mean_terminated_length": 161.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.2741554155945778, "epoch": 0.208105604446503, "frac_reward_zero_std": 1.0, "grad_norm": 0.004652024712413549, "kl": 0.0028163317474536598, "learning_rate": 9.583881426586382e-07, "loss": 0.0001, "num_tokens": 123599566.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4493, "step_time": 20.036991473287344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 160.6875, "completions/mean_terminated_length": 160.6875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.2065758854150772, "epoch": 0.20815192218619732, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038431889843195677, "kl": 0.0022339446586556733, "learning_rate": 9.583788791106993e-07, "loss": 0.0001, "num_tokens": 123636953.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4494, "step_time": 22.542246356606483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 154.625, "completions/mean_terminated_length": 154.625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.18167023360729218, "epoch": 0.20819823992589162, "frac_reward_zero_std": 0.0, "grad_norm": 0.12746275961399078, "kl": 0.02427427435759455, "learning_rate": 9.583696155627604e-07, "loss": 0.0071, "num_tokens": 123661843.0, "reward": 0.9681414365768433, "reward_std": 0.015806283801794052, "rewards/reward_func/mean": 0.9681414365768433, "rewards/reward_func/std": 0.015806281939148903, "step": 4495, "step_time": 18.49478606879711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 114.6875, "completions/mean_terminated_length": 114.6875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.24950814619660378, "epoch": 0.20824455766558592, "frac_reward_zero_std": 1.0, "grad_norm": 0.004354960285127163, "kl": 0.0026552329654805362, "learning_rate": 9.583603520148218e-07, "loss": 0.0001, "num_tokens": 123681374.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4496, "step_time": 15.170330747961998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 116.1875, "completions/mean_terminated_length": 116.1875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.31312163919210434, "epoch": 0.2082908754052802, "frac_reward_zero_std": 1.0, "grad_norm": 0.005197524558752775, "kl": 0.0026675930712372065, "learning_rate": 9.583510884668827e-07, "loss": 0.0001, "num_tokens": 123701777.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4497, "step_time": 16.79928321763873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 138.6875, "completions/mean_terminated_length": 138.6875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3549370616674423, "epoch": 0.20833719314497454, "frac_reward_zero_std": 1.0, "grad_norm": 0.003198597813025117, "kl": 0.002531467005610466, "learning_rate": 9.583418249189438e-07, "loss": 0.0001, "num_tokens": 123729708.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4498, "step_time": 17.87234526500106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 129.9375, "completions/mean_terminated_length": 129.9375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.32815539091825485, "epoch": 0.20838351088466883, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026750012766569853, "kl": 0.0021164837235119194, "learning_rate": 9.583325613710051e-07, "loss": 0.0001, "num_tokens": 123755035.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4499, "step_time": 16.812711495906115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 128.5, "completions/mean_terminated_length": 128.5, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3110589236021042, "epoch": 0.20842982862436313, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036182873882353306, "kl": 0.002530342200770974, "learning_rate": 9.583232978230663e-07, "loss": 0.0001, "num_tokens": 123780483.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4500, "step_time": 16.715143274515867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 125.0, "completions/mean_terminated_length": 125.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.308862529695034, "epoch": 0.20847614636405742, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020533078350126743, "kl": 0.00184511041152291, "learning_rate": 9.583140342751274e-07, "loss": 0.0001, "num_tokens": 123807427.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4501, "step_time": 17.075216569006443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 206.75, "completions/mean_terminated_length": 206.75, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.16802659630775452, "epoch": 0.20852246410375175, "frac_reward_zero_std": 1.0, "grad_norm": 0.00467786705121398, "kl": 0.0035754297859966755, "learning_rate": 9.583047707271885e-07, "loss": 0.0002, "num_tokens": 123834015.0, "reward": 0.6246347427368164, "reward_std": 0.0, "rewards/reward_func/mean": 0.6246347427368164, "rewards/reward_func/std": 0.0, "step": 4502, "step_time": 23.39074996113777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 189.3125, "completions/mean_terminated_length": 189.3125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.4275503605604172, "epoch": 0.20856878184344604, "frac_reward_zero_std": 0.0, "grad_norm": 0.12397279590368271, "kl": 0.017243289854377508, "learning_rate": 9.582955071792496e-07, "loss": -0.0971, "num_tokens": 123856804.0, "reward": 0.08718213438987732, "reward_std": 0.23822695016860962, "rewards/reward_func/mean": 0.08718213438987732, "rewards/reward_func/std": 0.23822695016860962, "step": 4503, "step_time": 23.68745656311512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 126.6875, "completions/mean_terminated_length": 126.6875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2466145195066929, "epoch": 0.20861509958314034, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025737672112882137, "kl": 0.002056895347777754, "learning_rate": 9.582862436313108e-07, "loss": 0.0001, "num_tokens": 123882271.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4504, "step_time": 16.518370371311903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 199.25, "completions/mean_terminated_length": 199.25, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.3941348195075989, "epoch": 0.20866141732283464, "frac_reward_zero_std": 1.0, "grad_norm": 0.003663417650386691, "kl": 0.003953707404434681, "learning_rate": 9.58276980083372e-07, "loss": 0.0002, "num_tokens": 123904419.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4505, "step_time": 23.864782005548477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 125.25, "completions/mean_terminated_length": 125.25, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.2710713595151901, "epoch": 0.20870773506252896, "frac_reward_zero_std": 1.0, "grad_norm": 0.00256364862434566, "kl": 0.001976566738449037, "learning_rate": 9.58267716535433e-07, "loss": 0.0001, "num_tokens": 123927511.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4506, "step_time": 16.101536702364683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 178.75, "completions/mean_terminated_length": 178.75, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.2687610983848572, "epoch": 0.20875405280222326, "frac_reward_zero_std": 0.0, "grad_norm": 0.13785500824451447, "kl": 0.029831380117684603, "learning_rate": 9.582584529874941e-07, "loss": -0.0317, "num_tokens": 123964499.0, "reward": 0.7785302996635437, "reward_std": 0.17717573046684265, "rewards/reward_func/mean": 0.7785302996635437, "rewards/reward_func/std": 0.17717573046684265, "step": 4507, "step_time": 25.48934479802847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.32828326523303986, "epoch": 0.20880037054191755, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029990829061716795, "kl": 0.0021405020961537957, "learning_rate": 9.582491894395553e-07, "loss": 0.0001, "num_tokens": 123992707.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4508, "step_time": 20.664042081683874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 204.5625, "completions/mean_terminated_length": 204.5625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.36476946622133255, "epoch": 0.20884668828161185, "frac_reward_zero_std": 1.0, "grad_norm": 0.011225164867937565, "kl": 0.0076727664563804865, "learning_rate": 9.582399258916164e-07, "loss": 0.0004, "num_tokens": 124025036.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4509, "step_time": 27.071549084037542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 118.0, "completions/mean_terminated_length": 118.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2992003932595253, "epoch": 0.20889300602130617, "frac_reward_zero_std": 1.0, "grad_norm": 0.0068072606809437275, "kl": 0.0032806770759634674, "learning_rate": 9.582306623436775e-07, "loss": 0.0002, "num_tokens": 124044988.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4510, "step_time": 15.329447764903307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 116.6875, "completions/mean_terminated_length": 116.6875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3145041763782501, "epoch": 0.20893932376100047, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037256877403706312, "kl": 0.0029680809238925576, "learning_rate": 9.582213987957386e-07, "loss": 0.0001, "num_tokens": 124067719.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4511, "step_time": 15.804791938513517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 135.125, "completions/mean_terminated_length": 135.125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.25249340385198593, "epoch": 0.20898564150069476, "frac_reward_zero_std": 1.0, "grad_norm": 0.004193861503154039, "kl": 0.0022256918309722096, "learning_rate": 9.582121352477998e-07, "loss": 0.0001, "num_tokens": 124088521.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4512, "step_time": 16.926363054662943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 149.625, "completions/mean_terminated_length": 149.625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.29557929188013077, "epoch": 0.20903195924038906, "frac_reward_zero_std": 1.0, "grad_norm": 0.004239782691001892, "kl": 0.03148888412397355, "learning_rate": 9.582028716998611e-07, "loss": 0.0015, "num_tokens": 124121779.0, "reward": 0.26359713077545166, "reward_std": 0.0, "rewards/reward_func/mean": 0.26359713077545166, "rewards/reward_func/std": 0.0, "step": 4513, "step_time": 20.87648806348443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 212.0, "completions/mean_terminated_length": 212.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.3210453614592552, "epoch": 0.20907827698008338, "frac_reward_zero_std": 0.0, "grad_norm": 0.11319698393344879, "kl": 0.022904083132743835, "learning_rate": 9.581936081519222e-07, "loss": -0.0202, "num_tokens": 124143859.0, "reward": 0.9328369498252869, "reward_std": 0.24945172667503357, "rewards/reward_func/mean": 0.9328369498252869, "rewards/reward_func/std": 0.24945174157619476, "step": 4514, "step_time": 34.970582257956266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 187.8125, "completions/mean_terminated_length": 187.8125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.2327510304749012, "epoch": 0.20912459471977768, "frac_reward_zero_std": 1.0, "grad_norm": 0.006550724618136883, "kl": 0.006306188879534602, "learning_rate": 9.581843446039831e-07, "loss": 0.0003, "num_tokens": 124171360.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4515, "step_time": 22.35580413416028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 126.8125, "completions/mean_terminated_length": 126.8125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.29604586213827133, "epoch": 0.20917091245947197, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019563401583582163, "kl": 0.0016671387711539865, "learning_rate": 9.581750810560445e-07, "loss": 0.0001, "num_tokens": 124194749.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4516, "step_time": 16.582060985267162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 175.3125, "completions/mean_terminated_length": 175.3125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.2425302192568779, "epoch": 0.20921723019916627, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023137491662055254, "kl": 0.0021871365315746516, "learning_rate": 9.581658175081056e-07, "loss": 0.0001, "num_tokens": 124230434.0, "reward": 0.11362193524837494, "reward_std": 0.0, "rewards/reward_func/mean": 0.11362193524837494, "rewards/reward_func/std": 0.0, "step": 4517, "step_time": 24.648380033671856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 197.9375, "completions/mean_terminated_length": 197.9375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.1759553737938404, "epoch": 0.2092635479388606, "frac_reward_zero_std": 1.0, "grad_norm": 0.005160289816558361, "kl": 0.003777960315346718, "learning_rate": 9.581565539601667e-07, "loss": 0.0002, "num_tokens": 124259857.0, "reward": 0.8131037354469299, "reward_std": 0.0, "rewards/reward_func/mean": 0.8131037354469299, "rewards/reward_func/std": 0.0, "step": 4518, "step_time": 23.335392862558365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 173.125, "completions/mean_terminated_length": 173.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.40470677614212036, "epoch": 0.2093098656785549, "frac_reward_zero_std": 1.0, "grad_norm": 0.006334818433970213, "kl": 0.005239688558503985, "learning_rate": 9.581472904122279e-07, "loss": 0.0003, "num_tokens": 124281507.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4519, "step_time": 20.79850087314844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 174.0, "completions/mean_terminated_length": 174.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.19005835056304932, "epoch": 0.2093561834182492, "frac_reward_zero_std": 0.0, "grad_norm": 0.14783187210559845, "kl": 0.032623309176415205, "learning_rate": 9.58138026864289e-07, "loss": -0.0332, "num_tokens": 124305155.0, "reward": 0.6320604085922241, "reward_std": 0.22408708930015564, "rewards/reward_func/mean": 0.6320604085922241, "rewards/reward_func/std": 0.22408710420131683, "step": 4520, "step_time": 22.775241017341614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 200.5, "completions/mean_terminated_length": 200.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.39643463492393494, "epoch": 0.20940250115794348, "frac_reward_zero_std": 0.0, "grad_norm": 0.09006068110466003, "kl": 0.004479829920455813, "learning_rate": 9.581287633163501e-07, "loss": 0.0988, "num_tokens": 124327819.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 4521, "step_time": 26.601082772016525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 195.75, "completions/mean_terminated_length": 195.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4853120595216751, "epoch": 0.2094488188976378, "frac_reward_zero_std": 0.0, "grad_norm": 0.1422317773103714, "kl": 0.009126980789005756, "learning_rate": 9.581194997684112e-07, "loss": 0.1064, "num_tokens": 124353111.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 4522, "step_time": 26.04064156487584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 194.1875, "completions/mean_terminated_length": 194.1875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.40737374126911163, "epoch": 0.2094951366373321, "frac_reward_zero_std": 1.0, "grad_norm": 0.006968447007238865, "kl": 0.005220564664341509, "learning_rate": 9.581102362204724e-07, "loss": 0.0003, "num_tokens": 124376570.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4523, "step_time": 23.655402705073357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 147.8125, "completions/mean_terminated_length": 147.8125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.21773340553045273, "epoch": 0.2095414543770264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028009614907205105, "kl": 0.0018515900010243058, "learning_rate": 9.581009726725335e-07, "loss": 0.0001, "num_tokens": 124396759.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4524, "step_time": 17.065115593373775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 139.4375, "completions/mean_terminated_length": 139.4375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.29620032012462616, "epoch": 0.2095877721167207, "frac_reward_zero_std": 1.0, "grad_norm": 0.002556238789111376, "kl": 0.001933709834702313, "learning_rate": 9.580917091245946e-07, "loss": 0.0001, "num_tokens": 124432750.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4525, "step_time": 20.982255693525076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 154.8125, "completions/mean_terminated_length": 154.8125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.24403830617666245, "epoch": 0.20963408985641502, "frac_reward_zero_std": 1.0, "grad_norm": 0.006410935427993536, "kl": 0.005382670904509723, "learning_rate": 9.58082445576656e-07, "loss": 0.0003, "num_tokens": 124459291.0, "reward": 0.3384654223918915, "reward_std": 0.0, "rewards/reward_func/mean": 0.3384654223918915, "rewards/reward_func/std": 0.0, "step": 4526, "step_time": 20.046632915735245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 171.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.37928277254104614, "epoch": 0.20968040759610931, "frac_reward_zero_std": 1.0, "grad_norm": 0.021629396826028824, "kl": 0.004303328867536038, "learning_rate": 9.58073182028717e-07, "loss": 0.0002, "num_tokens": 124494271.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4527, "step_time": 22.399007219821215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 125.375, "completions/mean_terminated_length": 125.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.28371690958738327, "epoch": 0.2097267253358036, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023662883322685957, "kl": 0.0022507273824885488, "learning_rate": 9.58063918480778e-07, "loss": 0.0001, "num_tokens": 124515365.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4528, "step_time": 17.098990455269814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 154.6875, "completions/mean_terminated_length": 154.6875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4401158168911934, "epoch": 0.2097730430754979, "frac_reward_zero_std": 1.0, "grad_norm": 0.002236809115856886, "kl": 0.0022928886755835265, "learning_rate": 9.580546549328393e-07, "loss": 0.0001, "num_tokens": 124559936.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4529, "step_time": 24.50910273194313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 191.75, "completions/mean_terminated_length": 191.75, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.24911126121878624, "epoch": 0.20981936081519223, "frac_reward_zero_std": 0.0, "grad_norm": 0.07972320914268494, "kl": 0.0025148409185931087, "learning_rate": 9.580453913849004e-07, "loss": -0.0235, "num_tokens": 124586892.0, "reward": 0.9431997537612915, "reward_std": 0.015146732330322266, "rewards/reward_func/mean": 0.9431997537612915, "rewards/reward_func/std": 0.015146732330322266, "step": 4530, "step_time": 26.23358115181327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 129.3125, "completions/mean_terminated_length": 129.3125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3201339915394783, "epoch": 0.20986567855488653, "frac_reward_zero_std": 1.0, "grad_norm": 0.006610604003071785, "kl": 0.0034712717751972377, "learning_rate": 9.580361278369616e-07, "loss": 0.0002, "num_tokens": 124606897.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4531, "step_time": 15.311695747077465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 125.5625, "completions/mean_terminated_length": 125.5625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.1992301195859909, "epoch": 0.20991199629458082, "frac_reward_zero_std": 1.0, "grad_norm": 0.003081638365983963, "kl": 0.0017424341931473464, "learning_rate": 9.580268642890227e-07, "loss": 0.0001, "num_tokens": 124626314.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4532, "step_time": 16.176436487585306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 188.0625, "completions/mean_terminated_length": 188.0625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.3342147395014763, "epoch": 0.20995831403427512, "frac_reward_zero_std": 1.0, "grad_norm": 0.0051155127584934235, "kl": 0.003918955335393548, "learning_rate": 9.580176007410838e-07, "loss": 0.0002, "num_tokens": 124655371.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4533, "step_time": 24.33262361958623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 164.5625, "completions/mean_terminated_length": 164.5625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.4239857494831085, "epoch": 0.21000463177396944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022273610811680555, "kl": 0.0020079879905097187, "learning_rate": 9.58008337193145e-07, "loss": 0.0001, "num_tokens": 124695780.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4534, "step_time": 23.14905248582363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 135.5, "completions/mean_terminated_length": 135.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.35153312236070633, "epoch": 0.21005094951366374, "frac_reward_zero_std": 1.0, "grad_norm": 0.003029546467587352, "kl": 0.002299476764164865, "learning_rate": 9.57999073645206e-07, "loss": 0.0001, "num_tokens": 124730828.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4535, "step_time": 19.38253891095519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.37292734533548355, "epoch": 0.21009726725335803, "frac_reward_zero_std": 1.0, "grad_norm": 0.004241833463311195, "kl": 0.0028279765392653644, "learning_rate": 9.579898100972672e-07, "loss": 0.0001, "num_tokens": 124766836.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4536, "step_time": 22.115824546664953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 155.625, "completions/mean_terminated_length": 155.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.2844870761036873, "epoch": 0.21014358499305233, "frac_reward_zero_std": 0.0, "grad_norm": 0.1563979685306549, "kl": 0.009537339094094932, "learning_rate": 9.579805465493283e-07, "loss": 0.0557, "num_tokens": 124787806.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 4537, "step_time": 22.45786213129759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 133.0, "completions/mean_terminated_length": 133.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.26435963809490204, "epoch": 0.21018990273274665, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027738655917346478, "kl": 0.0017964460421353579, "learning_rate": 9.579712830013894e-07, "loss": 0.0001, "num_tokens": 124808974.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4538, "step_time": 16.81413860619068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 134.875, "completions/mean_terminated_length": 134.875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.30298901349306107, "epoch": 0.21023622047244095, "frac_reward_zero_std": 1.0, "grad_norm": 0.004321299027651548, "kl": 0.0024628351093269885, "learning_rate": 9.579620194534508e-07, "loss": 0.0001, "num_tokens": 124831916.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4539, "step_time": 16.509110625833273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 145.1875, "completions/mean_terminated_length": 145.1875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.45492929965257645, "epoch": 0.21028253821213524, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024500072468072176, "kl": 0.002628411049954593, "learning_rate": 9.579527559055117e-07, "loss": 0.0001, "num_tokens": 124873775.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4540, "step_time": 23.426053818315268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 222.3125, "completions/mean_terminated_length": 222.3125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.45737026631832123, "epoch": 0.21032885595182954, "frac_reward_zero_std": 0.0, "grad_norm": 0.08502742648124695, "kl": 0.005424272269010544, "learning_rate": 9.579434923575728e-07, "loss": 0.0314, "num_tokens": 124895828.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 4541, "step_time": 25.104504711925983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 135.8125, "completions/mean_terminated_length": 135.8125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.22275033220648766, "epoch": 0.21037517369152386, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028084134683012962, "kl": 0.0016665546572767198, "learning_rate": 9.57934228809634e-07, "loss": 0.0001, "num_tokens": 124915505.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4542, "step_time": 17.30308758467436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 137.625, "completions/mean_terminated_length": 137.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.34233835339546204, "epoch": 0.21042149143121816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034828847274184227, "kl": 0.0029158778488636017, "learning_rate": 9.579249652616953e-07, "loss": 0.0001, "num_tokens": 124951627.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4543, "step_time": 21.10429633408785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 196.375, "completions/mean_terminated_length": 196.375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.2279246486723423, "epoch": 0.21046780917091246, "frac_reward_zero_std": 0.0, "grad_norm": 0.1677601933479309, "kl": 0.03046353254467249, "learning_rate": 9.579157017137564e-07, "loss": -0.0139, "num_tokens": 124976257.0, "reward": 0.8965252637863159, "reward_std": 0.1090937927365303, "rewards/reward_func/mean": 0.8965252637863159, "rewards/reward_func/std": 0.1090938076376915, "step": 4544, "step_time": 22.578668504953384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 230.5, "completions/mean_terminated_length": 230.5, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.3844281882047653, "epoch": 0.21051412691060675, "frac_reward_zero_std": 0.0, "grad_norm": 0.09906401485204697, "kl": 0.009228316484950483, "learning_rate": 9.579064381658175e-07, "loss": 0.0533, "num_tokens": 125005721.0, "reward": 0.23867067694664001, "reward_std": 0.42886680364608765, "rewards/reward_func/mean": 0.23867067694664001, "rewards/reward_func/std": 0.42886680364608765, "step": 4545, "step_time": 32.811641447246075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 150.0, "completions/mean_terminated_length": 150.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.1901368945837021, "epoch": 0.21056044465030108, "frac_reward_zero_std": 1.0, "grad_norm": 0.007037638686597347, "kl": 0.005093779705930501, "learning_rate": 9.578971746178787e-07, "loss": 0.0003, "num_tokens": 125027913.0, "reward": 0.3441537916660309, "reward_std": 0.0, "rewards/reward_func/mean": 0.3441537916660309, "rewards/reward_func/std": 0.0, "step": 4546, "step_time": 20.51408862695098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 190.375, "completions/mean_terminated_length": 190.375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3855598568916321, "epoch": 0.21060676238999537, "frac_reward_zero_std": 1.0, "grad_norm": 0.007795747369527817, "kl": 0.006330879288725555, "learning_rate": 9.578879110699398e-07, "loss": 0.0003, "num_tokens": 125051711.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4547, "step_time": 22.817515335977077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 178.3125, "completions/mean_terminated_length": 178.3125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.42228440940380096, "epoch": 0.21065308012968967, "frac_reward_zero_std": 1.0, "grad_norm": 0.006360540632158518, "kl": 0.004638633807189763, "learning_rate": 9.57878647522001e-07, "loss": 0.0002, "num_tokens": 125077860.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4548, "step_time": 22.673727177083492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 130.25, "completions/mean_terminated_length": 130.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3095518723130226, "epoch": 0.21069939786938396, "frac_reward_zero_std": 1.0, "grad_norm": 0.0042094760574400425, "kl": 0.002994225302245468, "learning_rate": 9.57869383974062e-07, "loss": 0.0001, "num_tokens": 125100072.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4549, "step_time": 16.402844041585922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 126.6875, "completions/mean_terminated_length": 126.6875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.29624032229185104, "epoch": 0.2107457156090783, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017073858762159944, "kl": 0.001342095754807815, "learning_rate": 9.578601204261232e-07, "loss": 0.0001, "num_tokens": 125121091.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4550, "step_time": 15.18632896989584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 143.1875, "completions/mean_terminated_length": 143.1875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.33111632615327835, "epoch": 0.21079203334877258, "frac_reward_zero_std": 1.0, "grad_norm": 0.001679004286415875, "kl": 0.0017345676897093654, "learning_rate": 9.578508568781843e-07, "loss": 0.0001, "num_tokens": 125143910.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4551, "step_time": 19.42225407063961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 167.25, "completions/mean_terminated_length": 167.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.2126971296966076, "epoch": 0.21083835108846688, "frac_reward_zero_std": 0.0, "grad_norm": 0.10637318342924118, "kl": 0.005246924585662782, "learning_rate": 9.578415933302454e-07, "loss": 0.0236, "num_tokens": 125165690.0, "reward": 0.9966224431991577, "reward_std": 0.013510131277143955, "rewards/reward_func/mean": 0.9966224431991577, "rewards/reward_func/std": 0.013510138727724552, "step": 4552, "step_time": 18.213520001620054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 116.6875, "completions/mean_terminated_length": 116.6875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.29982779920101166, "epoch": 0.21088466882816118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036096482072025537, "kl": 0.00243820488685742, "learning_rate": 9.578323297823065e-07, "loss": 0.0001, "num_tokens": 125186293.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4553, "step_time": 16.36350916698575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 295.0625, "completions/mean_terminated_length": 295.0625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.18046902120113373, "epoch": 0.2109309865678555, "frac_reward_zero_std": 0.0, "grad_norm": 0.0981452465057373, "kl": 0.021709760883823037, "learning_rate": 9.578230662343677e-07, "loss": -0.0837, "num_tokens": 125226966.0, "reward": 0.834690272808075, "reward_std": 0.1847260296344757, "rewards/reward_func/mean": 0.834690272808075, "rewards/reward_func/std": 0.1847260296344757, "step": 4554, "step_time": 36.709081299602985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 200.5, "completions/mean_terminated_length": 200.5, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.2706534266471863, "epoch": 0.2109773043075498, "frac_reward_zero_std": 1.0, "grad_norm": 0.003649881575256586, "kl": 0.011807393282651901, "learning_rate": 9.578138026864288e-07, "loss": 0.0006, "num_tokens": 125261310.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4555, "step_time": 24.966287799179554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 138.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.26266178488731384, "epoch": 0.2110236220472441, "frac_reward_zero_std": 1.0, "grad_norm": 0.003660748712718487, "kl": 0.0018841018609236926, "learning_rate": 9.578045391384901e-07, "loss": 0.0001, "num_tokens": 125282730.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4556, "step_time": 17.017644729465246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 170.4375, "completions/mean_terminated_length": 170.4375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.25271132960915565, "epoch": 0.2110699397869384, "frac_reward_zero_std": 0.0, "grad_norm": 0.14928926527500153, "kl": 0.021795076318085194, "learning_rate": 9.577952755905512e-07, "loss": 0.0029, "num_tokens": 125321665.0, "reward": 0.7160109281539917, "reward_std": 0.16633981466293335, "rewards/reward_func/mean": 0.7160109281539917, "rewards/reward_func/std": 0.16633982956409454, "step": 4557, "step_time": 24.311111871153116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 173.375, "completions/mean_terminated_length": 173.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3092428520321846, "epoch": 0.2111162575266327, "frac_reward_zero_std": 1.0, "grad_norm": 0.0042096651159226894, "kl": 0.003150644537527114, "learning_rate": 9.577860120426122e-07, "loss": 0.0002, "num_tokens": 125344439.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4558, "step_time": 20.087486639618874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 130.5, "completions/mean_terminated_length": 130.5, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2757374122738838, "epoch": 0.211162575266327, "frac_reward_zero_std": 1.0, "grad_norm": 0.008420100435614586, "kl": 0.003156241960823536, "learning_rate": 9.577767484946733e-07, "loss": 0.0002, "num_tokens": 125364255.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4559, "step_time": 16.498020764440298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 250.8125, "completions/mean_terminated_length": 250.8125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.24240896478295326, "epoch": 0.2112088930060213, "frac_reward_zero_std": 0.0, "grad_norm": 0.07986567914485931, "kl": 0.00367005035514012, "learning_rate": 9.577674849467346e-07, "loss": 0.025, "num_tokens": 125397404.0, "reward": 0.9970567226409912, "reward_std": 0.008042472414672375, "rewards/reward_func/mean": 0.9970567226409912, "rewards/reward_func/std": 0.008042463101446629, "step": 4560, "step_time": 29.679964408278465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 183.6875, "completions/mean_terminated_length": 183.6875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.4318304732441902, "epoch": 0.2112552107457156, "frac_reward_zero_std": 0.0, "grad_norm": 0.024634553119540215, "kl": 0.010949882445856929, "learning_rate": 9.577582213987957e-07, "loss": 0.0024, "num_tokens": 125419463.0, "reward": 1.0134880540135782e-05, "reward_std": 4.053952216054313e-05, "rewards/reward_func/mean": 1.0134880540135782e-05, "rewards/reward_func/std": 4.053952216054313e-05, "step": 4561, "step_time": 22.271343171596527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 169.0, "completions/mean_terminated_length": 169.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.3239048570394516, "epoch": 0.21130152848540992, "frac_reward_zero_std": 0.0, "grad_norm": 0.11310670524835587, "kl": 0.008481680473778397, "learning_rate": 9.577489578508569e-07, "loss": 0.1158, "num_tokens": 125449159.0, "reward": 0.8728713393211365, "reward_std": 0.23276567459106445, "rewards/reward_func/mean": 0.8728713393211365, "rewards/reward_func/std": 0.23276568949222565, "step": 4562, "step_time": 26.040478244423866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 125.3125, "completions/mean_terminated_length": 125.3125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.3014897257089615, "epoch": 0.21134784622510422, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033178734593093395, "kl": 0.002205540135037154, "learning_rate": 9.57739694302918e-07, "loss": 0.0001, "num_tokens": 125471180.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4563, "step_time": 15.17513533681631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 215.75, "completions/mean_terminated_length": 215.75, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.3299860283732414, "epoch": 0.21139416396479851, "frac_reward_zero_std": 0.0, "grad_norm": 0.1173308938741684, "kl": 0.005431530880741775, "learning_rate": 9.577304307549791e-07, "loss": 0.0216, "num_tokens": 125508904.0, "reward": 0.8586323261260986, "reward_std": 0.14511004090309143, "rewards/reward_func/mean": 0.8586323261260986, "rewards/reward_func/std": 0.14511004090309143, "step": 4564, "step_time": 29.23145243152976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 145.4375, "completions/mean_terminated_length": 145.4375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.29369380325078964, "epoch": 0.2114404817044928, "frac_reward_zero_std": 1.0, "grad_norm": 0.00887998379766941, "kl": 0.0034651908790692687, "learning_rate": 9.577211672070402e-07, "loss": 0.0002, "num_tokens": 125529839.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4565, "step_time": 18.286695763468742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 195.5625, "completions/mean_terminated_length": 195.5625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.4388611167669296, "epoch": 0.21148679944418713, "frac_reward_zero_std": 1.0, "grad_norm": 0.007069175597280264, "kl": 0.005334179208148271, "learning_rate": 9.577119036591014e-07, "loss": 0.0003, "num_tokens": 125555224.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4566, "step_time": 24.03802677616477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 174.75, "completions/mean_terminated_length": 174.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.2579498067498207, "epoch": 0.21153311718388143, "frac_reward_zero_std": 0.0, "grad_norm": 0.10790275037288666, "kl": 0.006988507346250117, "learning_rate": 9.577026401111625e-07, "loss": 0.0027, "num_tokens": 125576084.0, "reward": 0.75, "reward_std": 0.4472135901451111, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.44721361994743347, "step": 4567, "step_time": 20.670855939388275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 125.125, "completions/mean_terminated_length": 125.125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.29111990332603455, "epoch": 0.21157943492357573, "frac_reward_zero_std": 1.0, "grad_norm": 0.004740697797387838, "kl": 0.002562293957453221, "learning_rate": 9.576933765632236e-07, "loss": 0.0001, "num_tokens": 125595654.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4568, "step_time": 16.81375139206648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 223.5625, "completions/mean_terminated_length": 223.5625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "entropy": 0.19886551052331924, "epoch": 0.21162575266327002, "frac_reward_zero_std": 0.0, "grad_norm": 0.08791195601224899, "kl": 0.005757181206718087, "learning_rate": 9.57684113015285e-07, "loss": 0.0125, "num_tokens": 125621247.0, "reward": 0.05890907347202301, "reward_std": 0.012393509037792683, "rewards/reward_func/mean": 0.05890907347202301, "rewards/reward_func/std": 0.012393510900437832, "step": 4569, "step_time": 25.73760474100709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 148.625, "completions/mean_terminated_length": 148.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.2894188240170479, "epoch": 0.21167207040296435, "frac_reward_zero_std": 1.0, "grad_norm": 0.005633847322314978, "kl": 0.0026751953992061317, "learning_rate": 9.57674849467346e-07, "loss": 0.0001, "num_tokens": 125642233.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4570, "step_time": 21.330371465533972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 185.5625, "completions/mean_terminated_length": 185.5625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.19079741090536118, "epoch": 0.21171838814265864, "frac_reward_zero_std": 1.0, "grad_norm": 0.007866875268518925, "kl": 0.005902003264054656, "learning_rate": 9.57665585919407e-07, "loss": 0.0003, "num_tokens": 125664850.0, "reward": 0.9181891679763794, "reward_std": 0.0, "rewards/reward_func/mean": 0.9181891679763794, "rewards/reward_func/std": 0.0, "step": 4571, "step_time": 20.46226677671075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 139.25, "completions/mean_terminated_length": 139.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.29646675288677216, "epoch": 0.21176470588235294, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030221971683204174, "kl": 0.002487851830665022, "learning_rate": 9.576563223714681e-07, "loss": 0.0001, "num_tokens": 125690518.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4572, "step_time": 17.496889680624008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 141.5, "completions/mean_terminated_length": 141.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.32203683257102966, "epoch": 0.21181102362204723, "frac_reward_zero_std": 1.0, "grad_norm": 0.003484180895611644, "kl": 0.0019459775066934526, "learning_rate": 9.576470588235294e-07, "loss": 0.0001, "num_tokens": 125711294.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4573, "step_time": 19.43079449236393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 130.75, "completions/mean_terminated_length": 130.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.2513924464583397, "epoch": 0.21185734136174156, "frac_reward_zero_std": 1.0, "grad_norm": 0.004076133482158184, "kl": 0.0025015312421601266, "learning_rate": 9.576377952755906e-07, "loss": 0.0001, "num_tokens": 125730970.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4574, "step_time": 15.779032353311777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 195.875, "completions/mean_terminated_length": 195.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.23622899502515793, "epoch": 0.21190365910143585, "frac_reward_zero_std": 1.0, "grad_norm": 0.003442511660978198, "kl": 0.012230751803144813, "learning_rate": 9.576285317276517e-07, "loss": 0.0006, "num_tokens": 125763224.0, "reward": 0.3448947072029114, "reward_std": 0.0, "rewards/reward_func/mean": 0.3448947072029114, "rewards/reward_func/std": 0.0, "step": 4575, "step_time": 24.85686208307743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 188.9375, "completions/mean_terminated_length": 188.9375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.2207338623702526, "epoch": 0.21194997684113015, "frac_reward_zero_std": 0.0, "grad_norm": 0.16800262033939362, "kl": 0.008518489601556212, "learning_rate": 9.576192681797128e-07, "loss": -0.0413, "num_tokens": 125786407.0, "reward": 0.6571069955825806, "reward_std": 0.1239239051938057, "rewards/reward_func/mean": 0.6571069955825806, "rewards/reward_func/std": 0.1239239051938057, "step": 4576, "step_time": 24.301697324961424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 181.0, "completions/mean_terminated_length": 181.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.2521881014108658, "epoch": 0.21199629458082445, "frac_reward_zero_std": 0.0, "grad_norm": 0.10636038333177567, "kl": 0.011740713787730783, "learning_rate": 9.57610004631774e-07, "loss": 0.0301, "num_tokens": 125808263.0, "reward": 0.973493218421936, "reward_std": 0.031041564419865608, "rewards/reward_func/mean": 0.973493218421936, "rewards/reward_func/std": 0.031041564419865608, "step": 4577, "step_time": 22.679786428809166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 224.875, "completions/mean_terminated_length": 224.875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.2899409234523773, "epoch": 0.21204261232051877, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031824936158955097, "kl": 0.0034957891330122948, "learning_rate": 9.57600741083835e-07, "loss": 0.0002, "num_tokens": 125833605.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4578, "step_time": 30.98334626108408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.28880978375673294, "epoch": 0.21208893006021307, "frac_reward_zero_std": 0.0, "grad_norm": 0.10978582501411438, "kl": 0.006959647638723254, "learning_rate": 9.575914775358962e-07, "loss": -0.0325, "num_tokens": 125859501.0, "reward": 0.46855640411376953, "reward_std": 0.48227450251579285, "rewards/reward_func/mean": 0.46855640411376953, "rewards/reward_func/std": 0.48227453231811523, "step": 4579, "step_time": 28.84536913409829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 133.8125, "completions/mean_terminated_length": 133.8125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2924625501036644, "epoch": 0.21213524779990736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031335516832768917, "kl": 0.0021028506162110716, "learning_rate": 9.575822139879573e-07, "loss": 0.0001, "num_tokens": 125880154.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4580, "step_time": 18.896043598651886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 124.1875, "completions/mean_terminated_length": 124.1875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.31091052293777466, "epoch": 0.21218156553960166, "frac_reward_zero_std": 1.0, "grad_norm": 0.004532175604254007, "kl": 0.002397934324108064, "learning_rate": 9.575729504400184e-07, "loss": 0.0001, "num_tokens": 125900573.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4581, "step_time": 16.21372254192829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 175.125, "completions/mean_terminated_length": 175.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.18246766552329063, "epoch": 0.21222788327929598, "frac_reward_zero_std": 1.0, "grad_norm": 0.004709724336862564, "kl": 0.0029257500427775085, "learning_rate": 9.575636868920796e-07, "loss": 0.0001, "num_tokens": 125934751.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4582, "step_time": 22.58020205423236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 118.875, "completions/mean_terminated_length": 118.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2522609978914261, "epoch": 0.21227420101899028, "frac_reward_zero_std": 1.0, "grad_norm": 0.001804339000955224, "kl": 0.0015569084498565644, "learning_rate": 9.575544233441407e-07, "loss": 0.0001, "num_tokens": 125954877.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4583, "step_time": 14.784600455313921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 182.9375, "completions/mean_terminated_length": 182.9375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.27114830538630486, "epoch": 0.21232051875868457, "frac_reward_zero_std": 1.0, "grad_norm": 0.00425915839150548, "kl": 0.0043210614239797, "learning_rate": 9.575451597962018e-07, "loss": 0.0002, "num_tokens": 125981980.0, "reward": 0.9036020040512085, "reward_std": 0.0, "rewards/reward_func/mean": 0.9036020040512085, "rewards/reward_func/std": 0.0, "step": 4584, "step_time": 29.392589770257473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.23679695650935173, "epoch": 0.21236683649837887, "frac_reward_zero_std": 0.0, "grad_norm": 0.21038685739040375, "kl": 0.013930513057857752, "learning_rate": 9.57535896248263e-07, "loss": -0.0353, "num_tokens": 126018424.0, "reward": 0.82862389087677, "reward_std": 0.10218896716833115, "rewards/reward_func/mean": 0.82862389087677, "rewards/reward_func/std": 0.10218898206949234, "step": 4585, "step_time": 22.847710091620684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 133.3125, "completions/mean_terminated_length": 133.3125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.2652202919125557, "epoch": 0.2124131542380732, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024052548687905073, "kl": 0.0017215238767676055, "learning_rate": 9.575266327003243e-07, "loss": 0.0001, "num_tokens": 126040077.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4586, "step_time": 16.063202656805515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 430.625, "completions/mean_terminated_length": 430.625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "entropy": 0.3443702310323715, "epoch": 0.2124594719777675, "frac_reward_zero_std": 0.0, "grad_norm": 0.06466368585824966, "kl": 0.009476348757743835, "learning_rate": 9.575173691523854e-07, "loss": -0.147, "num_tokens": 126073319.0, "reward": 0.17105086147785187, "reward_std": 0.14411544799804688, "rewards/reward_func/mean": 0.17105086147785187, "rewards/reward_func/std": 0.14411544799804688, "step": 4587, "step_time": 58.89421058818698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 193.4375, "completions/mean_terminated_length": 193.4375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.1631155014038086, "epoch": 0.21250578971746178, "frac_reward_zero_std": 1.0, "grad_norm": 0.004590533673763275, "kl": 0.0763822328299284, "learning_rate": 9.575081056044465e-07, "loss": 0.0038, "num_tokens": 126102574.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4588, "step_time": 23.851110119372606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 177.5625, "completions/mean_terminated_length": 177.5625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.428097628057003, "epoch": 0.21255210745715608, "frac_reward_zero_std": 1.0, "grad_norm": 0.0042982702143490314, "kl": 0.003557933960109949, "learning_rate": 9.574988420565074e-07, "loss": 0.0002, "num_tokens": 126148007.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4589, "step_time": 28.871601589024067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 122.5, "completions/mean_terminated_length": 122.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3085201680660248, "epoch": 0.2125984251968504, "frac_reward_zero_std": 1.0, "grad_norm": 0.005929566919803619, "kl": 0.003112640930339694, "learning_rate": 9.574895785085688e-07, "loss": 0.0002, "num_tokens": 126170511.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4590, "step_time": 16.47818062081933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 208.4375, "completions/mean_terminated_length": 208.4375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.21477292105555534, "epoch": 0.2126447429365447, "frac_reward_zero_std": 1.0, "grad_norm": 0.004304870497435331, "kl": 0.003596308291889727, "learning_rate": 9.5748031496063e-07, "loss": 0.0002, "num_tokens": 126195174.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4591, "step_time": 23.432234924286604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 201.75, "completions/mean_terminated_length": 201.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.3422112464904785, "epoch": 0.212691060676239, "frac_reward_zero_std": 0.0, "grad_norm": 0.14413142204284668, "kl": 0.017712951870635152, "learning_rate": 9.57471051412691e-07, "loss": -0.0043, "num_tokens": 126222178.0, "reward": 0.0016898601315915585, "reward_std": 0.006743177305907011, "rewards/reward_func/mean": 0.0016898601315915585, "rewards/reward_func/std": 0.006743177771568298, "step": 4592, "step_time": 26.71819446235895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 160.875, "completions/mean_terminated_length": 160.875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.16070354357361794, "epoch": 0.2127373784159333, "frac_reward_zero_std": 1.0, "grad_norm": 0.023666655644774437, "kl": 0.001973198464838788, "learning_rate": 9.574617878647522e-07, "loss": 0.0001, "num_tokens": 126245424.0, "reward": 0.9310627579689026, "reward_std": 0.0, "rewards/reward_func/mean": 0.9310627579689026, "rewards/reward_func/std": 0.0, "step": 4593, "step_time": 19.66918433830142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 183.9375, "completions/mean_terminated_length": 183.9375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.3029061332345009, "epoch": 0.21278369615562762, "frac_reward_zero_std": 0.0, "grad_norm": 0.10588514804840088, "kl": 0.023241990245878696, "learning_rate": 9.574525243168133e-07, "loss": 0.0331, "num_tokens": 126268447.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 4594, "step_time": 22.20823608711362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 157.625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2079700380563736, "epoch": 0.2128300138953219, "frac_reward_zero_std": 0.0, "grad_norm": 0.19576317071914673, "kl": 0.02368050836957991, "learning_rate": 9.574432607688744e-07, "loss": -0.2295, "num_tokens": 126290633.0, "reward": 0.3210454285144806, "reward_std": 0.3790600001811981, "rewards/reward_func/mean": 0.3210454285144806, "rewards/reward_func/std": 0.3790600001811981, "step": 4595, "step_time": 23.530725929886103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 141.125, "completions/mean_terminated_length": 141.125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.33004576712846756, "epoch": 0.2128763316350162, "frac_reward_zero_std": 1.0, "grad_norm": 0.00828706193715334, "kl": 0.004713651491329074, "learning_rate": 9.574339972209355e-07, "loss": 0.0002, "num_tokens": 126312571.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4596, "step_time": 19.159527648240328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 115.875, "completions/mean_terminated_length": 115.875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.23844431713223457, "epoch": 0.2129226493747105, "frac_reward_zero_std": 1.0, "grad_norm": 0.01080168504267931, "kl": 0.004030712181702256, "learning_rate": 9.574247336729967e-07, "loss": 0.0002, "num_tokens": 126331897.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4597, "step_time": 14.468946900218725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 170.5, "completions/mean_terminated_length": 170.5, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.1602429263293743, "epoch": 0.21296896711440483, "frac_reward_zero_std": 0.0, "grad_norm": 0.12220467627048492, "kl": 0.00193205161485821, "learning_rate": 9.574154701250578e-07, "loss": -0.0131, "num_tokens": 126369793.0, "reward": 0.8918383121490479, "reward_std": 0.0536632239818573, "rewards/reward_func/mean": 0.8918383121490479, "rewards/reward_func/std": 0.0536632314324379, "step": 4598, "step_time": 22.584174297749996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 154.3125, "completions/mean_terminated_length": 154.3125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.4211675226688385, "epoch": 0.21301528485409912, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018207606626674533, "kl": 0.0016961352666839957, "learning_rate": 9.574062065771191e-07, "loss": 0.0001, "num_tokens": 126401718.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4599, "step_time": 20.368704564869404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 187.9375, "completions/mean_terminated_length": 187.9375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.2625105753540993, "epoch": 0.21306160259379342, "frac_reward_zero_std": 0.0, "grad_norm": 0.11870481818914413, "kl": 0.008358007995411754, "learning_rate": 9.573969430291802e-07, "loss": -0.0239, "num_tokens": 126424341.0, "reward": 0.9573257565498352, "reward_std": 0.01665830798447132, "rewards/reward_func/mean": 0.9573257565498352, "rewards/reward_func/std": 0.01665831357240677, "step": 4600, "step_time": 21.436405293643475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 185.9375, "completions/mean_terminated_length": 185.9375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.2466711439192295, "epoch": 0.21310792033348772, "frac_reward_zero_std": 1.0, "grad_norm": 0.007192135322839022, "kl": 0.029116731602698565, "learning_rate": 9.573876794812412e-07, "loss": 0.0014, "num_tokens": 126453316.0, "reward": 0.9682132601737976, "reward_std": 0.0, "rewards/reward_func/mean": 0.9682132601737976, "rewards/reward_func/std": 0.0, "step": 4601, "step_time": 22.567537255585194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 165.4375, "completions/mean_terminated_length": 165.4375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.25666791945695877, "epoch": 0.21315423807318204, "frac_reward_zero_std": 0.0, "grad_norm": 0.32522323727607727, "kl": 0.06802494544535875, "learning_rate": 9.573784159333023e-07, "loss": -0.0251, "num_tokens": 126474011.0, "reward": 0.8512284755706787, "reward_std": 0.06417026370763779, "rewards/reward_func/mean": 0.8512284755706787, "rewards/reward_func/std": 0.06417026370763779, "step": 4602, "step_time": 19.077897660434246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.49531012773513794, "epoch": 0.21320055581287634, "frac_reward_zero_std": 1.0, "grad_norm": 0.0043403939343988895, "kl": 0.003245527681428939, "learning_rate": 9.573691523853636e-07, "loss": 0.0002, "num_tokens": 126509193.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4603, "step_time": 23.65907971560955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 292.25, "completions/mean_terminated_length": 292.25, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.21409835293889046, "epoch": 0.21324687355257063, "frac_reward_zero_std": 0.0, "grad_norm": 0.09618958830833435, "kl": 0.003828092012554407, "learning_rate": 9.573598888374247e-07, "loss": -0.0234, "num_tokens": 126537741.0, "reward": 0.9188828468322754, "reward_std": 0.08377746492624283, "rewards/reward_func/mean": 0.9188828468322754, "rewards/reward_func/std": 0.08377746492624283, "step": 4604, "step_time": 29.859783098101616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 135.125, "completions/mean_terminated_length": 135.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3226679340004921, "epoch": 0.21329319129226493, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025867170188575983, "kl": 0.002313228149432689, "learning_rate": 9.573506252894859e-07, "loss": 0.0001, "num_tokens": 126558447.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4605, "step_time": 16.92801634594798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 163.5, "completions/mean_terminated_length": 163.5, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.21732071042060852, "epoch": 0.21333950903195925, "frac_reward_zero_std": 1.0, "grad_norm": 0.003922209143638611, "kl": 0.004118205222766846, "learning_rate": 9.57341361741547e-07, "loss": 0.0002, "num_tokens": 126579351.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4606, "step_time": 18.701394371688366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 140.25, "completions/mean_terminated_length": 140.25, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.27046389877796173, "epoch": 0.21338582677165355, "frac_reward_zero_std": 1.0, "grad_norm": 0.003340133000165224, "kl": 0.002406741550657898, "learning_rate": 9.573320981936081e-07, "loss": 0.0001, "num_tokens": 126599611.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4607, "step_time": 17.629219640046358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 124.125, "completions/mean_terminated_length": 124.125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.3118182420730591, "epoch": 0.21343214451134784, "frac_reward_zero_std": 1.0, "grad_norm": 0.012819971889257431, "kl": 0.003876145579852164, "learning_rate": 9.573228346456692e-07, "loss": 0.0002, "num_tokens": 126622557.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4608, "step_time": 16.826832067221403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 176.25, "completions/mean_terminated_length": 176.25, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.32553625851869583, "epoch": 0.21347846225104214, "frac_reward_zero_std": 1.0, "grad_norm": 0.018847228959202766, "kl": 0.01652515958994627, "learning_rate": 9.573135710977304e-07, "loss": 0.0008, "num_tokens": 126643489.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4609, "step_time": 24.18261268734932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 200.1875, "completions/mean_terminated_length": 200.1875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.40819302946329117, "epoch": 0.21352477999073646, "frac_reward_zero_std": 0.0, "grad_norm": 0.13739241659641266, "kl": 0.015501508256420493, "learning_rate": 9.573043075497915e-07, "loss": -0.0506, "num_tokens": 126667332.0, "reward": 0.0253753662109375, "reward_std": 0.054555393755435944, "rewards/reward_func/mean": 0.0253753662109375, "rewards/reward_func/std": 0.05455539748072624, "step": 4610, "step_time": 29.842195238918066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 157.625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.4177410379052162, "epoch": 0.21357109773043076, "frac_reward_zero_std": 0.0, "grad_norm": 0.15692751109600067, "kl": 0.00395078951260075, "learning_rate": 9.572950440018526e-07, "loss": -0.135, "num_tokens": 126702350.0, "reward": 0.016619674861431122, "reward_std": 0.06647869944572449, "rewards/reward_func/mean": 0.016619674861431122, "rewards/reward_func/std": 0.06647869944572449, "step": 4611, "step_time": 27.532116916030645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 212.9375, "completions/mean_terminated_length": 212.9375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.3895299881696701, "epoch": 0.21361741547012505, "frac_reward_zero_std": 0.0, "grad_norm": 0.12125103175640106, "kl": 0.005163967609405518, "learning_rate": 9.572857804539137e-07, "loss": 0.032, "num_tokens": 126725389.0, "reward": 0.06860418617725372, "reward_std": 0.23119214177131653, "rewards/reward_func/mean": 0.06860418617725372, "rewards/reward_func/std": 0.23119214177131653, "step": 4612, "step_time": 23.18581724539399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 114.625, "completions/mean_terminated_length": 114.625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.2978924736380577, "epoch": 0.21366373320981935, "frac_reward_zero_std": 1.0, "grad_norm": 0.004663385450839996, "kl": 0.0027721599326469004, "learning_rate": 9.57276516905975e-07, "loss": 0.0001, "num_tokens": 126746151.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4613, "step_time": 16.64656350389123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.3374093174934387, "epoch": 0.21371005094951367, "frac_reward_zero_std": 1.0, "grad_norm": 0.0039483108557760715, "kl": 0.003512556490022689, "learning_rate": 9.57267253358036e-07, "loss": 0.0002, "num_tokens": 126771493.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4614, "step_time": 22.854474045336246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 141.1875, "completions/mean_terminated_length": 141.1875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3065594434738159, "epoch": 0.21375636868920797, "frac_reward_zero_std": 1.0, "grad_norm": 0.005497485864907503, "kl": 0.003522971353959292, "learning_rate": 9.572579898100971e-07, "loss": 0.0002, "num_tokens": 126795496.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4615, "step_time": 17.771773859858513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 132.4375, "completions/mean_terminated_length": 132.4375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3171450048685074, "epoch": 0.21380268642890227, "frac_reward_zero_std": 1.0, "grad_norm": 0.020709557458758354, "kl": 0.005141422443557531, "learning_rate": 9.572487262621585e-07, "loss": 0.0003, "num_tokens": 126815903.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4616, "step_time": 16.291943036019802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 194.875, "completions/mean_terminated_length": 194.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.15820416063070297, "epoch": 0.21384900416859656, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011281813494861126, "kl": 0.053559258580207825, "learning_rate": 9.572394627142196e-07, "loss": 0.0027, "num_tokens": 126837197.0, "reward": 0.5706745982170105, "reward_std": 0.0, "rewards/reward_func/mean": 0.5706745982170105, "rewards/reward_func/std": 0.0, "step": 4617, "step_time": 22.558176815509796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 139.25, "completions/mean_terminated_length": 139.25, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.3524722456932068, "epoch": 0.2138953219082909, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023935355711728334, "kl": 0.0020296122529543936, "learning_rate": 9.572301991662807e-07, "loss": 0.0001, "num_tokens": 126858001.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4618, "step_time": 16.64563300460577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 138.5, "completions/mean_terminated_length": 138.5, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.30146465450525284, "epoch": 0.21394163964798518, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034549932461231947, "kl": 0.00310306076426059, "learning_rate": 9.572209356183418e-07, "loss": 0.0002, "num_tokens": 126878137.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4619, "step_time": 17.105186745524406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 162.5, "completions/mean_terminated_length": 162.5, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.313643679022789, "epoch": 0.21398795738767948, "frac_reward_zero_std": 1.0, "grad_norm": 0.004588900599628687, "kl": 0.0025320181739516556, "learning_rate": 9.57211672070403e-07, "loss": 0.0001, "num_tokens": 126904289.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4620, "step_time": 19.462565012276173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 129.8125, "completions/mean_terminated_length": 129.8125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.26016219705343246, "epoch": 0.21403427512737377, "frac_reward_zero_std": 1.0, "grad_norm": 0.006031656637787819, "kl": 0.0028295708762016147, "learning_rate": 9.57202408522464e-07, "loss": 0.0001, "num_tokens": 126928798.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4621, "step_time": 16.489441718906164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 137.625, "completions/mean_terminated_length": 137.625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.2527787424623966, "epoch": 0.2140805928670681, "frac_reward_zero_std": 0.0, "grad_norm": 0.27769744396209717, "kl": 0.15016119182109833, "learning_rate": 9.571931449745252e-07, "loss": -0.0194, "num_tokens": 126951032.0, "reward": 0.9095566272735596, "reward_std": 0.24713833630084991, "rewards/reward_func/mean": 0.9095566272735596, "rewards/reward_func/std": 0.24713833630084991, "step": 4622, "step_time": 16.10928536951542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 210.8125, "completions/mean_terminated_length": 210.8125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.47990524023771286, "epoch": 0.2141269106067624, "frac_reward_zero_std": 0.0, "grad_norm": 0.11754059791564941, "kl": 0.010775170288980007, "learning_rate": 9.571838814265863e-07, "loss": 0.1185, "num_tokens": 126972773.0, "reward": 0.5, "reward_std": 0.5163977742195129, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5163977742195129, "step": 4623, "step_time": 28.969821341335773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.45073506236076355, "epoch": 0.2141732283464567, "frac_reward_zero_std": 1.0, "grad_norm": 0.002356166485697031, "kl": 0.002657567209098488, "learning_rate": 9.571746178786475e-07, "loss": 0.0001, "num_tokens": 127023063.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4624, "step_time": 30.01600630953908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 172.8125, "completions/mean_terminated_length": 172.8125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.36030296981334686, "epoch": 0.21421954608615099, "frac_reward_zero_std": 1.0, "grad_norm": 0.004975039511919022, "kl": 0.004276565974578261, "learning_rate": 9.571653543307086e-07, "loss": 0.0002, "num_tokens": 127045972.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4625, "step_time": 22.17604398727417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.22017846629023552, "epoch": 0.2142658638258453, "frac_reward_zero_std": 0.0, "grad_norm": 0.13473278284072876, "kl": 0.004622761334758252, "learning_rate": 9.571560907827697e-07, "loss": 0.0022, "num_tokens": 127071798.0, "reward": 0.8826128244400024, "reward_std": 0.04670385643839836, "rewards/reward_func/mean": 0.8826128244400024, "rewards/reward_func/std": 0.04670385643839836, "step": 4626, "step_time": 20.95901571586728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 243.25, "completions/mean_terminated_length": 243.25, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "entropy": 0.21980761364102364, "epoch": 0.2143121815655396, "frac_reward_zero_std": 0.0, "grad_norm": 0.1065199077129364, "kl": 0.026685321470722556, "learning_rate": 9.571468272348308e-07, "loss": -0.0192, "num_tokens": 127101306.0, "reward": 0.45331358909606934, "reward_std": 0.08640240132808685, "rewards/reward_func/mean": 0.45331358909606934, "rewards/reward_func/std": 0.08640240877866745, "step": 4627, "step_time": 28.487680412828922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 153.8125, "completions/mean_terminated_length": 153.8125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.42449887096881866, "epoch": 0.2143584993052339, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018880803836509585, "kl": 0.002178079157602042, "learning_rate": 9.57137563686892e-07, "loss": 0.0001, "num_tokens": 127128359.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4628, "step_time": 18.94478588551283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 350.125, "completions/mean_terminated_length": 350.125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "entropy": 0.17183669283986092, "epoch": 0.2144048170449282, "frac_reward_zero_std": 0.0, "grad_norm": 0.14275139570236206, "kl": 0.002390524576185271, "learning_rate": 9.57128300138953e-07, "loss": 0.0254, "num_tokens": 127160553.0, "reward": 0.9952442646026611, "reward_std": 0.012995131313800812, "rewards/reward_func/mean": 0.9952442646026611, "rewards/reward_func/std": 0.01299512293189764, "step": 4629, "step_time": 39.509769801050425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 167.9375, "completions/mean_terminated_length": 167.9375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4044266939163208, "epoch": 0.21445113478462252, "frac_reward_zero_std": 1.0, "grad_norm": 0.012995369732379913, "kl": 0.011043486651033163, "learning_rate": 9.571190365910144e-07, "loss": 0.0005, "num_tokens": 127181640.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4630, "step_time": 20.03908582776785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 141.75, "completions/mean_terminated_length": 141.75, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.12327790260314941, "epoch": 0.21449745252431682, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023728630039840937, "kl": 0.0017822127847466618, "learning_rate": 9.571097730430755e-07, "loss": 0.0001, "num_tokens": 127203524.0, "reward": 0.9459594488143921, "reward_std": 0.0, "rewards/reward_func/mean": 0.9459594488143921, "rewards/reward_func/std": 0.0, "step": 4631, "step_time": 19.283959042280912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 188.4375, "completions/mean_terminated_length": 188.4375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.26789214089512825, "epoch": 0.2145437702640111, "frac_reward_zero_std": 0.0, "grad_norm": 0.11501730978488922, "kl": 0.010447415814269334, "learning_rate": 9.571005094951365e-07, "loss": -0.0176, "num_tokens": 127224379.0, "reward": 0.9259670972824097, "reward_std": 0.0509251244366169, "rewards/reward_func/mean": 0.9259670972824097, "rewards/reward_func/std": 0.050925128161907196, "step": 4632, "step_time": 21.821676589548588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.3288490027189255, "epoch": 0.2145900880037054, "frac_reward_zero_std": 0.0, "grad_norm": 0.07984766364097595, "kl": 0.014920821180567145, "learning_rate": 9.570912459471978e-07, "loss": -0.1445, "num_tokens": 127250673.0, "reward": 0.5475016236305237, "reward_std": 0.3812312185764313, "rewards/reward_func/mean": 0.5475016236305237, "rewards/reward_func/std": 0.3812311887741089, "step": 4633, "step_time": 32.65467547252774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 134.4375, "completions/mean_terminated_length": 134.4375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.3110058158636093, "epoch": 0.21463640574339973, "frac_reward_zero_std": 1.0, "grad_norm": 0.003320001531392336, "kl": 0.0024003706639632583, "learning_rate": 9.57081982399259e-07, "loss": 0.0001, "num_tokens": 127272712.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4634, "step_time": 16.119121961295605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 153.1875, "completions/mean_terminated_length": 153.1875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.39740072190761566, "epoch": 0.21468272348309403, "frac_reward_zero_std": 1.0, "grad_norm": 0.002264913870021701, "kl": 0.0023553372593596578, "learning_rate": 9.5707271885132e-07, "loss": 0.0001, "num_tokens": 127306059.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4635, "step_time": 21.171541515737772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 167.375, "completions/mean_terminated_length": 167.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4089849814772606, "epoch": 0.21472904122278832, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030034270603209734, "kl": 0.002874799771234393, "learning_rate": 9.570634553033812e-07, "loss": 0.0001, "num_tokens": 127360193.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4636, "step_time": 28.686766408383846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 140.125, "completions/mean_terminated_length": 140.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.29447612166404724, "epoch": 0.21477535896248262, "frac_reward_zero_std": 1.0, "grad_norm": 0.00416575139388442, "kl": 0.002283772628288716, "learning_rate": 9.570541917554423e-07, "loss": 0.0001, "num_tokens": 127390147.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4637, "step_time": 20.987193293869495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 146.4375, "completions/mean_terminated_length": 146.4375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.29804428666830063, "epoch": 0.21482167670217694, "frac_reward_zero_std": 1.0, "grad_norm": 0.004655946046113968, "kl": 0.003044213750399649, "learning_rate": 9.570449282075034e-07, "loss": 0.0002, "num_tokens": 127417242.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4638, "step_time": 19.60076381638646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 226.8125, "completions/mean_terminated_length": 226.8125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.32897651195526123, "epoch": 0.21486799444187124, "frac_reward_zero_std": 0.0, "grad_norm": 0.9647154211997986, "kl": 0.021434172056615353, "learning_rate": 9.570356646595645e-07, "loss": -0.0003, "num_tokens": 127449847.0, "reward": 0.6860184073448181, "reward_std": 0.40906235575675964, "rewards/reward_func/mean": 0.6860184073448181, "rewards/reward_func/std": 0.40906238555908203, "step": 4639, "step_time": 26.903940606862307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 186.625, "completions/mean_terminated_length": 186.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.40722519904375076, "epoch": 0.21491431218156554, "frac_reward_zero_std": 1.0, "grad_norm": 0.006639092694967985, "kl": 0.005273929215036333, "learning_rate": 9.570264011116257e-07, "loss": 0.0003, "num_tokens": 127474737.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4640, "step_time": 23.84171436727047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 174.3125, "completions/mean_terminated_length": 174.3125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.18666213005781174, "epoch": 0.21496062992125983, "frac_reward_zero_std": 0.0, "grad_norm": 0.11597719043493271, "kl": 0.004080721584614366, "learning_rate": 9.570171375636868e-07, "loss": 0.0008, "num_tokens": 127497670.0, "reward": 0.7124775648117065, "reward_std": 0.07721612602472305, "rewards/reward_func/mean": 0.7124775648117065, "rewards/reward_func/std": 0.07721611857414246, "step": 4641, "step_time": 19.928462456911802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 183.9375, "completions/mean_terminated_length": 183.9375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.20664877444505692, "epoch": 0.21500694766095416, "frac_reward_zero_std": 0.0, "grad_norm": 0.10467500984668732, "kl": 0.016906562726944685, "learning_rate": 9.57007874015748e-07, "loss": -0.0046, "num_tokens": 127519061.0, "reward": 0.9768627882003784, "reward_std": 0.035443443804979324, "rewards/reward_func/mean": 0.9768627882003784, "rewards/reward_func/std": 0.03544343635439873, "step": 4642, "step_time": 21.659313656389713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 131.4375, "completions/mean_terminated_length": 131.4375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.23793626576662064, "epoch": 0.21505326540064845, "frac_reward_zero_std": 1.0, "grad_norm": 0.002327620517462492, "kl": 0.0018894985259976238, "learning_rate": 9.569986104678092e-07, "loss": 0.0001, "num_tokens": 127541212.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4643, "step_time": 16.452261183410883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 170.0, "completions/mean_terminated_length": 170.0, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.37210172414779663, "epoch": 0.21509958314034275, "frac_reward_zero_std": 1.0, "grad_norm": 0.006618363782763481, "kl": 0.0028220415115356445, "learning_rate": 9.569893469198704e-07, "loss": 0.0001, "num_tokens": 127570380.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4644, "step_time": 21.53874461352825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 149.3125, "completions/mean_terminated_length": 149.3125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.308118037879467, "epoch": 0.21514590088003704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026913676410913467, "kl": 0.002246640477096662, "learning_rate": 9.569800833719313e-07, "loss": 0.0001, "num_tokens": 127597985.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4645, "step_time": 18.83286164328456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 127.3125, "completions/mean_terminated_length": 127.3125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2763524055480957, "epoch": 0.21519221861973137, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024202989879995584, "kl": 0.0019912614952772856, "learning_rate": 9.569708198239926e-07, "loss": 0.0001, "num_tokens": 127622374.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4646, "step_time": 17.542009364813566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 178.4375, "completions/mean_terminated_length": 178.4375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.42642128467559814, "epoch": 0.21523853635942566, "frac_reward_zero_std": 1.0, "grad_norm": 0.009753278456628323, "kl": 0.006714976276271045, "learning_rate": 9.569615562760537e-07, "loss": 0.0003, "num_tokens": 127651565.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4647, "step_time": 23.6161276884377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 174.5625, "completions/mean_terminated_length": 174.5625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.4136885926127434, "epoch": 0.21528485409911996, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028628241270780563, "kl": 0.0025501035270281136, "learning_rate": 9.569522927281149e-07, "loss": 0.0001, "num_tokens": 127694294.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4648, "step_time": 27.321494657546282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 224.4375, "completions/mean_terminated_length": 224.4375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.4577714130282402, "epoch": 0.21533117183881426, "frac_reward_zero_std": 1.0, "grad_norm": 0.012676808051764965, "kl": 0.011420159135013819, "learning_rate": 9.56943029180176e-07, "loss": 0.0006, "num_tokens": 127721149.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4649, "step_time": 33.25904209911823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 158.3125, "completions/mean_terminated_length": 158.3125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.14692023023962975, "epoch": 0.21537748957850858, "frac_reward_zero_std": 1.0, "grad_norm": 0.001845229766331613, "kl": 0.02969865733757615, "learning_rate": 9.569337656322371e-07, "loss": 0.0015, "num_tokens": 127742626.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4650, "step_time": 18.386997617781162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 125.9375, "completions/mean_terminated_length": 125.9375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2768767401576042, "epoch": 0.21542380731820288, "frac_reward_zero_std": 1.0, "grad_norm": 0.006206747610121965, "kl": 0.002629841648740694, "learning_rate": 9.569245020842982e-07, "loss": 0.0001, "num_tokens": 127764305.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4651, "step_time": 17.05055009201169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 164.1875, "completions/mean_terminated_length": 164.1875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.34208963066339493, "epoch": 0.21547012505789717, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035113864578306675, "kl": 0.0023904606932774186, "learning_rate": 9.569152385363594e-07, "loss": 0.0001, "num_tokens": 127791476.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4652, "step_time": 21.758243769407272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 256.0, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "entropy": 0.24853120744228363, "epoch": 0.21551644279759147, "frac_reward_zero_std": 0.0, "grad_norm": 0.17795726656913757, "kl": 0.03794926730915904, "learning_rate": 9.569059749884205e-07, "loss": 0.0062, "num_tokens": 127827076.0, "reward": 0.9874951839447021, "reward_std": 0.01667311228811741, "rewards/reward_func/mean": 0.9874951839447021, "rewards/reward_func/std": 0.016673119738698006, "step": 4653, "step_time": 30.484981279820204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 213.375, "completions/mean_terminated_length": 213.375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.3825131356716156, "epoch": 0.2155627605372858, "frac_reward_zero_std": 1.0, "grad_norm": 0.009327673353254795, "kl": 0.008753576781600714, "learning_rate": 9.568967114404816e-07, "loss": 0.0004, "num_tokens": 127868458.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4654, "step_time": 28.964972458779812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 152.25, "completions/mean_terminated_length": 152.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.35802191495895386, "epoch": 0.2156090782769801, "frac_reward_zero_std": 1.0, "grad_norm": 0.005376492626965046, "kl": 0.0033036007080227137, "learning_rate": 9.568874478925427e-07, "loss": 0.0002, "num_tokens": 127899630.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4655, "step_time": 21.17877870053053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 305.4375, "completions/mean_terminated_length": 305.4375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.26090652868151665, "epoch": 0.21565539601667438, "frac_reward_zero_std": 0.0, "grad_norm": 0.08595925569534302, "kl": 0.015238664811477065, "learning_rate": 9.56878184344604e-07, "loss": -0.1317, "num_tokens": 127931541.0, "reward": 0.8587601184844971, "reward_std": 0.3353709280490875, "rewards/reward_func/mean": 0.8587601184844971, "rewards/reward_func/std": 0.3353709578514099, "step": 4656, "step_time": 35.50366682559252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 223.8125, "completions/mean_terminated_length": 223.8125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.362893283367157, "epoch": 0.21570171375636868, "frac_reward_zero_std": 1.0, "grad_norm": 0.003356839530169964, "kl": 0.003706058021634817, "learning_rate": 9.56868920796665e-07, "loss": 0.0002, "num_tokens": 127974210.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4657, "step_time": 32.763096299022436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 138.9375, "completions/mean_terminated_length": 138.9375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.33154158294200897, "epoch": 0.215748031496063, "frac_reward_zero_std": 1.0, "grad_norm": 0.0057208905927836895, "kl": 0.0037041493924334645, "learning_rate": 9.568596572487261e-07, "loss": 0.0002, "num_tokens": 127996785.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4658, "step_time": 17.63434297591448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 119.0625, "completions/mean_terminated_length": 119.0625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.27619320899248123, "epoch": 0.2157943492357573, "frac_reward_zero_std": 1.0, "grad_norm": 0.006155842915177345, "kl": 0.005472356686368585, "learning_rate": 9.568503937007872e-07, "loss": 0.0003, "num_tokens": 128016338.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4659, "step_time": 16.408082224428654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 129.6875, "completions/mean_terminated_length": 129.6875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2596445754170418, "epoch": 0.2158406669754516, "frac_reward_zero_std": 1.0, "grad_norm": 0.004416171461343765, "kl": 0.002276734303450212, "learning_rate": 9.568411301528486e-07, "loss": 0.0001, "num_tokens": 128037053.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4660, "step_time": 17.764114674180746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 161.3125, "completions/mean_terminated_length": 161.3125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.3849439322948456, "epoch": 0.2158869847151459, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018807444721460342, "kl": 0.001989488024264574, "learning_rate": 9.568318666049097e-07, "loss": 0.0001, "num_tokens": 128082946.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4661, "step_time": 27.11596456170082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.2866143584251404, "epoch": 0.21593330245484021, "frac_reward_zero_std": 1.0, "grad_norm": 0.002465636469423771, "kl": 0.002389605826465413, "learning_rate": 9.568226030569708e-07, "loss": 0.0001, "num_tokens": 128104934.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4662, "step_time": 16.766618456691504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 147.625, "completions/mean_terminated_length": 147.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3677098676562309, "epoch": 0.2159796201945345, "frac_reward_zero_std": 1.0, "grad_norm": 0.005699924658983946, "kl": 0.003419053158722818, "learning_rate": 9.56813339509032e-07, "loss": 0.0002, "num_tokens": 128141856.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4663, "step_time": 21.845496512949467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.38255808502435684, "epoch": 0.2160259379342288, "frac_reward_zero_std": 1.0, "grad_norm": 0.021539025008678436, "kl": 0.012102874228730798, "learning_rate": 9.56804075961093e-07, "loss": 0.0006, "num_tokens": 128163946.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4664, "step_time": 20.500273644924164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 130.625, "completions/mean_terminated_length": 130.625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.38598378747701645, "epoch": 0.2160722556739231, "frac_reward_zero_std": 1.0, "grad_norm": 0.003358134301379323, "kl": 0.002884236047975719, "learning_rate": 9.567948124131542e-07, "loss": 0.0001, "num_tokens": 128184692.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4665, "step_time": 16.254790641367435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 207.8125, "completions/mean_terminated_length": 207.8125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.3718961030244827, "epoch": 0.21611857341361743, "frac_reward_zero_std": 0.0, "grad_norm": 0.1306808590888977, "kl": 0.007270547677762806, "learning_rate": 9.567855488652153e-07, "loss": 0.0207, "num_tokens": 128212289.0, "reward": 0.34135520458221436, "reward_std": 0.45313286781311035, "rewards/reward_func/mean": 0.34135520458221436, "rewards/reward_func/std": 0.45313286781311035, "step": 4666, "step_time": 29.924130588769913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 276.125, "completions/mean_terminated_length": 276.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.17390716075897217, "epoch": 0.21616489115331172, "frac_reward_zero_std": 1.0, "grad_norm": 0.003358956892043352, "kl": 0.029748273082077503, "learning_rate": 9.567762853172765e-07, "loss": 0.0015, "num_tokens": 128237859.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 4667, "step_time": 28.623353756964207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 171.625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.1873151920735836, "epoch": 0.21621120889300602, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037431532982736826, "kl": 0.004099401994608343, "learning_rate": 9.567670217693376e-07, "loss": 0.0002, "num_tokens": 128260973.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4668, "step_time": 19.78599815070629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 180.625, "completions/mean_terminated_length": 180.625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.18856868147850037, "epoch": 0.21625752663270031, "frac_reward_zero_std": 1.0, "grad_norm": 0.004834700841456652, "kl": 0.004626446811016649, "learning_rate": 9.567577582213987e-07, "loss": 0.0002, "num_tokens": 128292263.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4669, "step_time": 24.34179699793458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 138.25, "completions/mean_terminated_length": 138.25, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.3532930091023445, "epoch": 0.21630384437239464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026294998824596405, "kl": 0.0024022431462071836, "learning_rate": 9.567484946734598e-07, "loss": 0.0001, "num_tokens": 128312539.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4670, "step_time": 17.001453924924135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 152.875, "completions/mean_terminated_length": 152.875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.30836206674575806, "epoch": 0.21635016211208893, "frac_reward_zero_std": 0.0, "grad_norm": 0.14689210057258606, "kl": 0.004777474270667881, "learning_rate": 9.56739231125521e-07, "loss": 0.0891, "num_tokens": 128333065.0, "reward": 0.7721847891807556, "reward_std": 0.3014300763607025, "rewards/reward_func/mean": 0.7721847891807556, "rewards/reward_func/std": 0.3014300763607025, "step": 4671, "step_time": 20.48503626137972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 117.875, "completions/mean_terminated_length": 117.875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2691037245094776, "epoch": 0.21639647985178323, "frac_reward_zero_std": 1.0, "grad_norm": 0.002530246740207076, "kl": 0.00207692536059767, "learning_rate": 9.56729967577582e-07, "loss": 0.0001, "num_tokens": 128353751.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4672, "step_time": 17.23697617277503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 162.8125, "completions/mean_terminated_length": 162.8125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4040471464395523, "epoch": 0.21644279759147753, "frac_reward_zero_std": 1.0, "grad_norm": 0.004170588217675686, "kl": 0.0028006028151139617, "learning_rate": 9.567207040296434e-07, "loss": 0.0001, "num_tokens": 128391508.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4673, "step_time": 24.134961064904928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 176.875, "completions/mean_terminated_length": 176.875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.41561467945575714, "epoch": 0.21648911533117185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0052163079380989075, "kl": 0.005093224346637726, "learning_rate": 9.567114404817045e-07, "loss": 0.0003, "num_tokens": 128429346.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4674, "step_time": 24.104380950331688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 203.3125, "completions/mean_terminated_length": 203.3125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.45514459162950516, "epoch": 0.21653543307086615, "frac_reward_zero_std": 1.0, "grad_norm": 0.008697996847331524, "kl": 0.006443565711379051, "learning_rate": 9.567021769337655e-07, "loss": 0.0003, "num_tokens": 128451703.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4675, "step_time": 25.268934823572636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 140.4375, "completions/mean_terminated_length": 140.4375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.17368808761239052, "epoch": 0.21658175081056044, "frac_reward_zero_std": 0.0, "grad_norm": 0.2149912714958191, "kl": 0.006797351874411106, "learning_rate": 9.566929133858268e-07, "loss": -0.0114, "num_tokens": 128472350.0, "reward": 0.9025907516479492, "reward_std": 0.04832848533987999, "rewards/reward_func/mean": 0.9025907516479492, "rewards/reward_func/std": 0.048328500241041183, "step": 4676, "step_time": 16.623182900249958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 196.125, "completions/mean_terminated_length": 196.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.21596404165029526, "epoch": 0.21662806855025474, "frac_reward_zero_std": 1.0, "grad_norm": 0.00794797483831644, "kl": 0.009435880696401, "learning_rate": 9.56683649837888e-07, "loss": 0.0005, "num_tokens": 128495216.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4677, "step_time": 23.499925259500742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 138.125, "completions/mean_terminated_length": 138.125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.294400118291378, "epoch": 0.21667438628994906, "frac_reward_zero_std": 1.0, "grad_norm": 0.0044252327643334866, "kl": 0.002887769020162523, "learning_rate": 9.56674386289949e-07, "loss": 0.0001, "num_tokens": 128516962.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4678, "step_time": 16.759018633514643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 392.6875, "completions/mean_terminated_length": 392.6875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "entropy": 0.16387999802827835, "epoch": 0.21672070402964336, "frac_reward_zero_std": 0.0, "grad_norm": 0.058388832956552505, "kl": 0.011111302766948938, "learning_rate": 9.566651227420102e-07, "loss": -0.0277, "num_tokens": 128553629.0, "reward": 0.9573274254798889, "reward_std": 0.05677105858922005, "rewards/reward_func/mean": 0.9573274254798889, "rewards/reward_func/std": 0.05677107349038124, "step": 4679, "step_time": 42.23031213134527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 340.8125, "completions/mean_terminated_length": 340.8125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.23604802042245865, "epoch": 0.21676702176933765, "frac_reward_zero_std": 0.0, "grad_norm": 0.07029406726360321, "kl": 0.007450093748047948, "learning_rate": 9.566558591940713e-07, "loss": -0.0874, "num_tokens": 128587290.0, "reward": 0.8636986613273621, "reward_std": 0.33853599429130554, "rewards/reward_func/mean": 0.8636986613273621, "rewards/reward_func/std": 0.33853599429130554, "step": 4680, "step_time": 44.18964160978794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 279.375, "completions/mean_terminated_length": 279.375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.39158811420202255, "epoch": 0.21681333950903195, "frac_reward_zero_std": 0.0, "grad_norm": 0.1147027537226677, "kl": 0.020142321474850178, "learning_rate": 9.566465956461324e-07, "loss": -0.1632, "num_tokens": 128624624.0, "reward": 0.5518031120300293, "reward_std": 0.5043283104896545, "rewards/reward_func/mean": 0.5518031120300293, "rewards/reward_func/std": 0.5043283104896545, "step": 4681, "step_time": 37.18413728475571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 124.375, "completions/mean_terminated_length": 124.375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3182864338159561, "epoch": 0.21685965724872627, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038417920004576445, "kl": 0.0027310732402838767, "learning_rate": 9.566373320981935e-07, "loss": 0.0001, "num_tokens": 128645590.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4682, "step_time": 15.642555423080921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 163.6875, "completions/mean_terminated_length": 163.6875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.21090726926922798, "epoch": 0.21690597498842057, "frac_reward_zero_std": 1.0, "grad_norm": 0.004298835527151823, "kl": 0.0026743889320641756, "learning_rate": 9.566280685502547e-07, "loss": 0.0001, "num_tokens": 128671601.0, "reward": 0.9459594488143921, "reward_std": 0.0, "rewards/reward_func/mean": 0.9459594488143921, "rewards/reward_func/std": 0.0, "step": 4683, "step_time": 21.579754520207644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 206.625, "completions/mean_terminated_length": 206.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.21174703538417816, "epoch": 0.21695229272811486, "frac_reward_zero_std": 1.0, "grad_norm": 0.009199468418955803, "kl": 0.006716641131788492, "learning_rate": 9.566188050023158e-07, "loss": 0.0003, "num_tokens": 128696443.0, "reward": 0.7044320702552795, "reward_std": 0.0, "rewards/reward_func/mean": 0.7044320702552795, "rewards/reward_func/std": 0.0, "step": 4684, "step_time": 24.04336714744568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 112.9375, "completions/mean_terminated_length": 112.9375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.27595193684101105, "epoch": 0.21699861046780916, "frac_reward_zero_std": 1.0, "grad_norm": 0.004088678862899542, "kl": 0.002328214410226792, "learning_rate": 9.56609541454377e-07, "loss": 0.0001, "num_tokens": 128716426.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4685, "step_time": 14.581633433699608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 123.875, "completions/mean_terminated_length": 123.875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2818344533443451, "epoch": 0.21704492820750348, "frac_reward_zero_std": 1.0, "grad_norm": 0.008714600466191769, "kl": 0.0047147831646725535, "learning_rate": 9.566002779064383e-07, "loss": 0.0002, "num_tokens": 128737080.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4686, "step_time": 15.111976612359285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 241.375, "completions/mean_terminated_length": 241.375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.22171951085329056, "epoch": 0.21709124594719778, "frac_reward_zero_std": 0.0, "grad_norm": 0.09821074455976486, "kl": 0.011160548659972847, "learning_rate": 9.565910143584994e-07, "loss": 0.0552, "num_tokens": 128764126.0, "reward": 0.9346067905426025, "reward_std": 0.04231547191739082, "rewards/reward_func/mean": 0.9346067905426025, "rewards/reward_func/std": 0.04231548309326172, "step": 4687, "step_time": 28.17325323075056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 152.0625, "completions/mean_terminated_length": 152.0625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.32022595405578613, "epoch": 0.21713756368689208, "frac_reward_zero_std": 1.0, "grad_norm": 0.004113008733838797, "kl": 0.0030579391168430448, "learning_rate": 9.565817508105603e-07, "loss": 0.0002, "num_tokens": 128785695.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4688, "step_time": 18.334290079772472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 145.5, "completions/mean_terminated_length": 145.5, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.41011738777160645, "epoch": 0.21718388142658637, "frac_reward_zero_std": 1.0, "grad_norm": 0.00345940375700593, "kl": 0.0037335798260755837, "learning_rate": 9.565724872626214e-07, "loss": 0.0002, "num_tokens": 128809175.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4689, "step_time": 20.805212043225765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 136.8125, "completions/mean_terminated_length": 136.8125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.24847476184368134, "epoch": 0.2172301991662807, "frac_reward_zero_std": 1.0, "grad_norm": 0.014227625913918018, "kl": 0.008940773201175034, "learning_rate": 9.565632237146828e-07, "loss": 0.0005, "num_tokens": 128832100.0, "reward": 0.0006008694763295352, "reward_std": 0.0, "rewards/reward_func/mean": 0.0006008694763295352, "rewards/reward_func/std": 0.0, "step": 4690, "step_time": 17.96142216026783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 209.0, "completions/mean_terminated_length": 209.0, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.15977605804800987, "epoch": 0.217276516905975, "frac_reward_zero_std": 1.0, "grad_norm": 0.0043493714183568954, "kl": 0.010925154201686382, "learning_rate": 9.565539601667439e-07, "loss": 0.0005, "num_tokens": 128854196.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4691, "step_time": 23.239343974739313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 215.3125, "completions/mean_terminated_length": 215.3125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.44224586337804794, "epoch": 0.2173228346456693, "frac_reward_zero_std": 0.0, "grad_norm": 0.11012236773967743, "kl": 0.005374442785978317, "learning_rate": 9.56544696618805e-07, "loss": 0.0092, "num_tokens": 128877033.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 4692, "step_time": 27.61267638579011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 155.5, "completions/mean_terminated_length": 155.5, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3069450259208679, "epoch": 0.21736915238536358, "frac_reward_zero_std": 1.0, "grad_norm": 0.002614505821838975, "kl": 0.002206849807407707, "learning_rate": 9.565354330708661e-07, "loss": 0.0001, "num_tokens": 128908865.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4693, "step_time": 21.37566214054823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 176.875, "completions/mean_terminated_length": 176.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.4276970997452736, "epoch": 0.2174154701250579, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021815637592226267, "kl": 0.002318186656339094, "learning_rate": 9.565261695229273e-07, "loss": 0.0001, "num_tokens": 128954271.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4694, "step_time": 26.854221165180206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 187.0, "completions/mean_terminated_length": 187.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.403320737183094, "epoch": 0.2174617878647522, "frac_reward_zero_std": 1.0, "grad_norm": 0.007326861843466759, "kl": 0.006120213191024959, "learning_rate": 9.565169059749884e-07, "loss": 0.0003, "num_tokens": 128979487.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4695, "step_time": 21.921999126672745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 110.9375, "completions/mean_terminated_length": 110.9375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.27059856057167053, "epoch": 0.2175081056044465, "frac_reward_zero_std": 1.0, "grad_norm": 0.009745532646775246, "kl": 0.0035272304667159915, "learning_rate": 9.565076424270495e-07, "loss": 0.0002, "num_tokens": 128999790.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4696, "step_time": 13.533142525702715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 123.0625, "completions/mean_terminated_length": 123.0625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2352730892598629, "epoch": 0.2175544233441408, "frac_reward_zero_std": 1.0, "grad_norm": 0.002535650972276926, "kl": 0.0015813313657417893, "learning_rate": 9.564983788791106e-07, "loss": 0.0001, "num_tokens": 129019199.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4697, "step_time": 15.000992849469185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 195.6875, "completions/mean_terminated_length": 195.6875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.22550852969288826, "epoch": 0.21760074108383512, "frac_reward_zero_std": 0.0, "grad_norm": 0.10005852580070496, "kl": 0.05339275160804391, "learning_rate": 9.564891153311718e-07, "loss": 0.0397, "num_tokens": 129040570.0, "reward": 0.7305104732513428, "reward_std": 0.12194602191448212, "rewards/reward_func/mean": 0.7305104732513428, "rewards/reward_func/std": 0.12194604426622391, "step": 4698, "step_time": 22.246045541018248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 207.0625, "completions/mean_terminated_length": 207.0625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.4338148683309555, "epoch": 0.21764705882352942, "frac_reward_zero_std": 1.0, "grad_norm": 0.00788185466080904, "kl": 0.005995885643642396, "learning_rate": 9.564798517832329e-07, "loss": 0.0003, "num_tokens": 129072107.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4699, "step_time": 27.115179523825645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.16094813868403435, "epoch": 0.2176933765632237, "frac_reward_zero_std": 0.0, "grad_norm": 0.12027033418416977, "kl": 0.003907462581992149, "learning_rate": 9.56470588235294e-07, "loss": 0.0094, "num_tokens": 129102935.0, "reward": 0.9616204500198364, "reward_std": 0.06865544617176056, "rewards/reward_func/mean": 0.9616204500198364, "rewards/reward_func/std": 0.06865545362234116, "step": 4700, "step_time": 22.509795740246773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 170.375, "completions/mean_terminated_length": 170.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4316684827208519, "epoch": 0.217739694302918, "frac_reward_zero_std": 1.0, "grad_norm": 0.002886262722313404, "kl": 0.002785950870020315, "learning_rate": 9.564613246873551e-07, "loss": 0.0001, "num_tokens": 129137389.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4701, "step_time": 22.80204911902547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 110.6875, "completions/mean_terminated_length": 110.6875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.264005608856678, "epoch": 0.21778601204261233, "frac_reward_zero_std": 1.0, "grad_norm": 0.0042200335301458836, "kl": 0.0020503849955275655, "learning_rate": 9.564520611394163e-07, "loss": 0.0001, "num_tokens": 129159000.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4702, "step_time": 14.584453262388706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 205.875, "completions/mean_terminated_length": 205.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.19852978736162186, "epoch": 0.21783232978230663, "frac_reward_zero_std": 0.0, "grad_norm": 0.09294604510068893, "kl": 0.012052624253556132, "learning_rate": 9.564427975914776e-07, "loss": -0.0466, "num_tokens": 129181542.0, "reward": 0.9442729949951172, "reward_std": 0.22290799021720886, "rewards/reward_func/mean": 0.9442729949951172, "rewards/reward_func/std": 0.22290800511837006, "step": 4703, "step_time": 24.002835299819708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 112.0, "completions/mean_terminated_length": 112.0, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.28455759584903717, "epoch": 0.21787864752200092, "frac_reward_zero_std": 1.0, "grad_norm": 0.004972612485289574, "kl": 0.0028877685545012355, "learning_rate": 9.564335340435387e-07, "loss": 0.0001, "num_tokens": 129202326.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4704, "step_time": 16.196747165173292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 160.0625, "completions/mean_terminated_length": 160.0625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.38754066824913025, "epoch": 0.21792496526169522, "frac_reward_zero_std": 1.0, "grad_norm": 0.002032277174293995, "kl": 0.0025206418649759144, "learning_rate": 9.564242704955998e-07, "loss": 0.0001, "num_tokens": 129252519.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4705, "step_time": 25.311698351055384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 150.0, "completions/max_terminated_length": 150.0, "completions/mean_length": 123.0625, "completions/mean_terminated_length": 123.0625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.30276161432266235, "epoch": 0.21797128300138954, "frac_reward_zero_std": 1.0, "grad_norm": 0.004128539469093084, "kl": 0.0028701364062726498, "learning_rate": 9.56415006947661e-07, "loss": 0.0001, "num_tokens": 129276728.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4706, "step_time": 16.46551175415516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 166.9375, "completions/mean_terminated_length": 166.9375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.41504184901714325, "epoch": 0.21801760074108384, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027445501182228327, "kl": 0.002175698842620477, "learning_rate": 9.56405743399722e-07, "loss": 0.0001, "num_tokens": 129328999.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4707, "step_time": 27.024520061910152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 203.625, "completions/mean_terminated_length": 203.625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.4600290209054947, "epoch": 0.21806391848077814, "frac_reward_zero_std": 1.0, "grad_norm": 0.006384314503520727, "kl": 0.008535626577213407, "learning_rate": 9.563964798517832e-07, "loss": 0.0004, "num_tokens": 129362049.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4708, "step_time": 28.051227170974016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 133.1875, "completions/mean_terminated_length": 133.1875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.3612709268927574, "epoch": 0.21811023622047243, "frac_reward_zero_std": 1.0, "grad_norm": 0.002812664955854416, "kl": 0.0027132914983667433, "learning_rate": 9.563872163038443e-07, "loss": 0.0001, "num_tokens": 129384420.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4709, "step_time": 16.85134883597493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 205.9375, "completions/mean_terminated_length": 205.9375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.18895790353417397, "epoch": 0.21815655396016675, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013839430175721645, "kl": 0.002374694449827075, "learning_rate": 9.563779527559055e-07, "loss": 0.0001, "num_tokens": 129407699.0, "reward": 0.8668779134750366, "reward_std": 0.0, "rewards/reward_func/mean": 0.8668779134750366, "rewards/reward_func/std": 0.0, "step": 4710, "step_time": 23.17830441892147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 134.375, "completions/mean_terminated_length": 134.375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2579244151711464, "epoch": 0.21820287169986105, "frac_reward_zero_std": 1.0, "grad_norm": 0.005870082415640354, "kl": 0.0031024435884319246, "learning_rate": 9.563686892079666e-07, "loss": 0.0002, "num_tokens": 129427433.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4711, "step_time": 16.59689372777939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 115.8125, "completions/mean_terminated_length": 115.8125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.28431785851716995, "epoch": 0.21824918943955535, "frac_reward_zero_std": 1.0, "grad_norm": 0.003787873312830925, "kl": 0.0024733886239118874, "learning_rate": 9.563594256600277e-07, "loss": 0.0001, "num_tokens": 129447542.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4712, "step_time": 14.652054857462645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 198.0, "completions/mean_terminated_length": 198.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.40155162662267685, "epoch": 0.21829550717924964, "frac_reward_zero_std": 0.0, "grad_norm": 0.16008706390857697, "kl": 0.00958009366877377, "learning_rate": 9.563501621120888e-07, "loss": 0.0182, "num_tokens": 129471782.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 4713, "step_time": 25.589435674250126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 171.6875, "completions/mean_terminated_length": 171.6875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.39728278666734695, "epoch": 0.21834182491894397, "frac_reward_zero_std": 1.0, "grad_norm": 0.005508288275450468, "kl": 0.004464473749976605, "learning_rate": 9.5634089856415e-07, "loss": 0.0002, "num_tokens": 129500737.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4714, "step_time": 22.905235670506954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 184.5625, "completions/mean_terminated_length": 184.5625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.34112362563610077, "epoch": 0.21838814265863826, "frac_reward_zero_std": 0.0, "grad_norm": 0.12135380506515503, "kl": 0.016973228193819523, "learning_rate": 9.56331635016211e-07, "loss": -0.0496, "num_tokens": 129523018.0, "reward": 0.7375502586364746, "reward_std": 0.3552190959453583, "rewards/reward_func/mean": 0.7375502586364746, "rewards/reward_func/std": 0.3552190959453583, "step": 4715, "step_time": 22.028970792889595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 131.1875, "completions/mean_terminated_length": 131.1875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.31367379426956177, "epoch": 0.21843446039833256, "frac_reward_zero_std": 1.0, "grad_norm": 0.005986443720757961, "kl": 0.0048665719805285335, "learning_rate": 9.563223714682724e-07, "loss": 0.0002, "num_tokens": 129543005.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4716, "step_time": 15.876225415617228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 232.8125, "completions/mean_terminated_length": 232.8125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.5032502710819244, "epoch": 0.21848077813802685, "frac_reward_zero_std": 0.0, "grad_norm": 0.13312603533267975, "kl": 0.007286880747415125, "learning_rate": 9.563131079203336e-07, "loss": 0.076, "num_tokens": 129566202.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 4717, "step_time": 30.825655966997147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 136.375, "completions/mean_terminated_length": 136.375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.2676912546157837, "epoch": 0.21852709587772118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0040293484926223755, "kl": 0.0029119584360159934, "learning_rate": 9.563038443723945e-07, "loss": 0.0001, "num_tokens": 129588176.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4718, "step_time": 16.30700771510601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 160.8125, "completions/mean_terminated_length": 160.8125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.15503466874361038, "epoch": 0.21857341361741547, "frac_reward_zero_std": 1.0, "grad_norm": 0.002281376626342535, "kl": 0.0025667650625109673, "learning_rate": 9.562945808244556e-07, "loss": 0.0001, "num_tokens": 129612781.0, "reward": 0.1353352814912796, "reward_std": 0.0, "rewards/reward_func/mean": 0.1353352814912796, "rewards/reward_func/std": 0.0, "step": 4719, "step_time": 18.35327873378992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 129.0, "completions/max_terminated_length": 129.0, "completions/mean_length": 107.875, "completions/mean_terminated_length": 107.875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.24697128683328629, "epoch": 0.21861973135710977, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030700985807925463, "kl": 0.002165102807339281, "learning_rate": 9.56285317276517e-07, "loss": 0.0001, "num_tokens": 129632123.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4720, "step_time": 13.440684881061316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 162.6875, "completions/mean_terminated_length": 162.6875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.38421468436717987, "epoch": 0.21866604909680407, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033809831365942955, "kl": 0.002879140607547015, "learning_rate": 9.56276053728578e-07, "loss": 0.0001, "num_tokens": 129692486.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4721, "step_time": 30.075780019164085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 117.375, "completions/mean_terminated_length": 117.375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.26094746217131615, "epoch": 0.2187123668364984, "frac_reward_zero_std": 1.0, "grad_norm": 0.005386655684560537, "kl": 0.003322462609503418, "learning_rate": 9.562667901806392e-07, "loss": 0.0002, "num_tokens": 129714252.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4722, "step_time": 15.187982801347971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 231.5, "completions/mean_terminated_length": 231.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.2107616625726223, "epoch": 0.21875868457619269, "frac_reward_zero_std": 1.0, "grad_norm": 0.0058716400526463985, "kl": 0.00631087610963732, "learning_rate": 9.562575266327003e-07, "loss": 0.0003, "num_tokens": 129752628.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4723, "step_time": 32.07692110911012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 144.3125, "completions/mean_terminated_length": 144.3125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.28818415850400925, "epoch": 0.21880500231588698, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030706559773534536, "kl": 0.0022004097991157323, "learning_rate": 9.562482630847614e-07, "loss": 0.0001, "num_tokens": 129773161.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4724, "step_time": 17.998664502054453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 163.3125, "completions/mean_terminated_length": 163.3125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.4172137379646301, "epoch": 0.21885132005558128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025425860658288, "kl": 0.0025123218656517565, "learning_rate": 9.562389995368225e-07, "loss": 0.0001, "num_tokens": 129824510.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4725, "step_time": 27.130378130823374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 120.1875, "completions/mean_terminated_length": 120.1875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2462136559188366, "epoch": 0.2188976377952756, "frac_reward_zero_std": 1.0, "grad_norm": 0.003374190768226981, "kl": 0.0016968742129392922, "learning_rate": 9.562297359888837e-07, "loss": 0.0001, "num_tokens": 129844497.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4726, "step_time": 16.39966733008623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 160.5625, "completions/mean_terminated_length": 160.5625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.20630601793527603, "epoch": 0.2189439555349699, "frac_reward_zero_std": 1.0, "grad_norm": 0.005442303605377674, "kl": 0.003976957174018025, "learning_rate": 9.562204724409448e-07, "loss": 0.0002, "num_tokens": 129865194.0, "reward": 0.904837429523468, "reward_std": 0.0, "rewards/reward_func/mean": 0.904837429523468, "rewards/reward_func/std": 0.0, "step": 4727, "step_time": 19.599647972732782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 132.0, "completions/max_terminated_length": 132.0, "completions/mean_length": 127.875, "completions/mean_terminated_length": 127.875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.1109622921794653, "epoch": 0.2189902732746642, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014203027822077274, "kl": 0.0011672762630041689, "learning_rate": 9.56211208893006e-07, "loss": 0.0001, "num_tokens": 129887160.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 4728, "step_time": 15.035693380981684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 123.375, "completions/mean_terminated_length": 123.375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.26559437066316605, "epoch": 0.2190365910143585, "frac_reward_zero_std": 1.0, "grad_norm": 0.002877059392631054, "kl": 0.0023826081887818873, "learning_rate": 9.56201945345067e-07, "loss": 0.0001, "num_tokens": 129910318.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4729, "step_time": 15.939006235450506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 156.625, "completions/mean_terminated_length": 156.625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.24844319373369217, "epoch": 0.2190829087540528, "frac_reward_zero_std": 1.0, "grad_norm": 0.022856680676341057, "kl": 0.014928898774087429, "learning_rate": 9.561926817971284e-07, "loss": 0.0007, "num_tokens": 129947080.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4730, "step_time": 25.66942211985588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 164.625, "completions/mean_terminated_length": 164.625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.35775817930698395, "epoch": 0.2191292264937471, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023563746362924576, "kl": 0.002303392451722175, "learning_rate": 9.561834182491893e-07, "loss": 0.0001, "num_tokens": 129977858.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4731, "step_time": 20.681682847440243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 125.3125, "completions/mean_terminated_length": 125.3125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.23523488640785217, "epoch": 0.2191755442334414, "frac_reward_zero_std": 1.0, "grad_norm": 0.005492073483765125, "kl": 0.003430097654927522, "learning_rate": 9.561741547012504e-07, "loss": 0.0002, "num_tokens": 129997815.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4732, "step_time": 15.587875299155712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 183.9375, "completions/mean_terminated_length": 183.9375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.2596520856022835, "epoch": 0.2192218619731357, "frac_reward_zero_std": 1.0, "grad_norm": 0.001385444076731801, "kl": 0.0012672853044932708, "learning_rate": 9.561648911533118e-07, "loss": 0.0001, "num_tokens": 130052310.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4733, "step_time": 28.772629994899035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 223.1875, "completions/mean_terminated_length": 223.1875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "entropy": 0.256424181163311, "epoch": 0.21926817971283002, "frac_reward_zero_std": 0.0, "grad_norm": 0.12284564971923828, "kl": 0.011806728085502982, "learning_rate": 9.561556276053729e-07, "loss": 0.0019, "num_tokens": 130075881.0, "reward": 0.9938837289810181, "reward_std": 0.016712799668312073, "rewards/reward_func/mean": 0.9938837289810181, "rewards/reward_func/std": 0.016712794080376625, "step": 4734, "step_time": 24.939057677984238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 191.25, "completions/mean_terminated_length": 191.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.23606186732649803, "epoch": 0.21931449745252432, "frac_reward_zero_std": 0.0, "grad_norm": 0.11030472069978714, "kl": 0.00840791326481849, "learning_rate": 9.56146364057434e-07, "loss": -0.0696, "num_tokens": 130103517.0, "reward": 0.9772799015045166, "reward_std": 0.03029346466064453, "rewards/reward_func/mean": 0.9772799015045166, "rewards/reward_func/std": 0.03029346466064453, "step": 4735, "step_time": 23.88426313176751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 128.375, "completions/mean_terminated_length": 128.375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.13496162928640842, "epoch": 0.21936081519221862, "frac_reward_zero_std": 1.0, "grad_norm": 0.005121160298585892, "kl": 0.003617467125877738, "learning_rate": 9.561371005094951e-07, "loss": 0.0002, "num_tokens": 130128099.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4736, "step_time": 16.25461396574974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 155.4375, "completions/mean_terminated_length": 155.4375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3519606217741966, "epoch": 0.2194071329319129, "frac_reward_zero_std": 1.0, "grad_norm": 0.004885755944997072, "kl": 0.0037717887898907065, "learning_rate": 9.561278369615563e-07, "loss": 0.0002, "num_tokens": 130156138.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4737, "step_time": 20.14709546044469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 198.0, "completions/mean_terminated_length": 198.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4937119781970978, "epoch": 0.21945345067160724, "frac_reward_zero_std": 1.0, "grad_norm": 0.007549159228801727, "kl": 0.0056273117661476135, "learning_rate": 9.561185734136174e-07, "loss": 0.0003, "num_tokens": 130183738.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4738, "step_time": 24.763157915323973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 180.9375, "completions/mean_terminated_length": 180.9375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.4217841774225235, "epoch": 0.21949976841130153, "frac_reward_zero_std": 1.0, "grad_norm": 0.002368347719311714, "kl": 0.002506342134438455, "learning_rate": 9.561093098656785e-07, "loss": 0.0001, "num_tokens": 130224489.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4739, "step_time": 26.37431424111128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 178.0625, "completions/mean_terminated_length": 178.0625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.361618809401989, "epoch": 0.21954608615099583, "frac_reward_zero_std": 1.0, "grad_norm": 0.009282790124416351, "kl": 0.008149751694872975, "learning_rate": 9.561000463177396e-07, "loss": 0.0004, "num_tokens": 130249818.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4740, "step_time": 25.666003689169884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 133.0625, "completions/mean_terminated_length": 133.0625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.23081785067915916, "epoch": 0.21959240389069012, "frac_reward_zero_std": 1.0, "grad_norm": 0.004358035046607256, "kl": 0.0024257359909825027, "learning_rate": 9.560907827698008e-07, "loss": 0.0001, "num_tokens": 130269371.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4741, "step_time": 17.47803793102503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 124.4375, "completions/mean_terminated_length": 124.4375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.2911137491464615, "epoch": 0.21963872163038445, "frac_reward_zero_std": 1.0, "grad_norm": 0.01487666554749012, "kl": 0.006362575339153409, "learning_rate": 9.560815192218619e-07, "loss": 0.0003, "num_tokens": 130292002.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4742, "step_time": 17.08535874634981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 161.375, "completions/mean_terminated_length": 161.375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.38935375958681107, "epoch": 0.21968503937007874, "frac_reward_zero_std": 1.0, "grad_norm": 0.002340888138860464, "kl": 0.0022539437632076442, "learning_rate": 9.56072255673923e-07, "loss": 0.0001, "num_tokens": 130342776.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4743, "step_time": 27.16944271698594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 145.5, "completions/mean_terminated_length": 145.5, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.327069990336895, "epoch": 0.21973135710977304, "frac_reward_zero_std": 1.0, "grad_norm": 0.003524728585034609, "kl": 0.0026839629281312227, "learning_rate": 9.560629921259841e-07, "loss": 0.0001, "num_tokens": 130362880.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4744, "step_time": 16.383930154144764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 149.625, "completions/mean_terminated_length": 149.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.39243703335523605, "epoch": 0.21977767484946734, "frac_reward_zero_std": 1.0, "grad_norm": 0.003960326313972473, "kl": 0.0033166881185024977, "learning_rate": 9.560537285780453e-07, "loss": 0.0002, "num_tokens": 130391338.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4745, "step_time": 20.206228274852037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 205.125, "completions/mean_terminated_length": 205.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.4340764209628105, "epoch": 0.21982399258916166, "frac_reward_zero_std": 0.0, "grad_norm": 0.10383335500955582, "kl": 0.011215093079954386, "learning_rate": 9.560444650301066e-07, "loss": -0.1166, "num_tokens": 130419948.0, "reward": 0.4018140435218811, "reward_std": 0.4711059331893921, "rewards/reward_func/mean": 0.4018140435218811, "rewards/reward_func/std": 0.47110599279403687, "step": 4746, "step_time": 29.4142703153193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 108.3125, "completions/mean_terminated_length": 108.3125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.2552882730960846, "epoch": 0.21987031032885596, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025769576895982027, "kl": 0.0020312996639404446, "learning_rate": 9.560352014821677e-07, "loss": 0.0001, "num_tokens": 130440225.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4747, "step_time": 13.208584818989038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 180.0625, "completions/mean_terminated_length": 180.0625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.3985041454434395, "epoch": 0.21991662806855025, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022527764085680246, "kl": 0.0023291026591323316, "learning_rate": 9.560259379342288e-07, "loss": 0.0001, "num_tokens": 130489458.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4748, "step_time": 24.394183471798897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 136.3125, "completions/mean_terminated_length": 136.3125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.27034758776426315, "epoch": 0.21996294580824455, "frac_reward_zero_std": 1.0, "grad_norm": 0.005365348886698484, "kl": 0.003320941119454801, "learning_rate": 9.560166743862898e-07, "loss": 0.0002, "num_tokens": 130525575.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4749, "step_time": 18.391599718481302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 125.4375, "completions/mean_terminated_length": 125.4375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.33369433879852295, "epoch": 0.22000926354793887, "frac_reward_zero_std": 1.0, "grad_norm": 0.0039688884280622005, "kl": 0.0027712912997230887, "learning_rate": 9.56007410838351e-07, "loss": 0.0001, "num_tokens": 130552526.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4750, "step_time": 15.59014567732811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 146.9375, "completions/mean_terminated_length": 146.9375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.28914664685726166, "epoch": 0.22005558128763317, "frac_reward_zero_std": 1.0, "grad_norm": 0.00288494979031384, "kl": 0.0020578180556185544, "learning_rate": 9.559981472904122e-07, "loss": 0.0001, "num_tokens": 130588653.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4751, "step_time": 20.300949413329363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 244.0625, "completions/mean_terminated_length": 244.0625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.31658507138490677, "epoch": 0.22010189902732746, "frac_reward_zero_std": 0.0, "grad_norm": 0.0900760367512703, "kl": 0.02131354482844472, "learning_rate": 9.559888837424733e-07, "loss": -0.1653, "num_tokens": 130626654.0, "reward": 0.5, "reward_std": 0.5163977742195129, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5163977742195129, "step": 4752, "step_time": 32.26984718814492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 186.0625, "completions/mean_terminated_length": 186.0625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3751211538910866, "epoch": 0.22014821676702176, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029134678188711405, "kl": 0.0027626954251900315, "learning_rate": 9.559796201945345e-07, "loss": 0.0001, "num_tokens": 130680063.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4753, "step_time": 26.41571592912078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 226.875, "completions/mean_terminated_length": 226.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.3934357687830925, "epoch": 0.22019453450671608, "frac_reward_zero_std": 0.0, "grad_norm": 0.09501536935567856, "kl": 0.016132954973727465, "learning_rate": 9.559703566465956e-07, "loss": 0.0564, "num_tokens": 130704269.0, "reward": 0.5964052677154541, "reward_std": 0.43766361474990845, "rewards/reward_func/mean": 0.5964052677154541, "rewards/reward_func/std": 0.43766364455223083, "step": 4754, "step_time": 25.017730347812176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 126.125, "completions/mean_terminated_length": 126.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3438038304448128, "epoch": 0.22024085224641038, "frac_reward_zero_std": 1.0, "grad_norm": 0.003388928947970271, "kl": 0.002402106241788715, "learning_rate": 9.559610930986567e-07, "loss": 0.0001, "num_tokens": 130725583.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4755, "step_time": 13.71153750270605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 190.9375, "completions/mean_terminated_length": 190.9375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.2335614301264286, "epoch": 0.22028716998610468, "frac_reward_zero_std": 1.0, "grad_norm": 0.005704137030988932, "kl": 0.03125150501728058, "learning_rate": 9.559518295507178e-07, "loss": 0.0016, "num_tokens": 130750430.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4756, "step_time": 18.895425386726856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 177.9375, "completions/mean_terminated_length": 177.9375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3949665054678917, "epoch": 0.22033348772579897, "frac_reward_zero_std": 1.0, "grad_norm": 0.005754759069532156, "kl": 0.004841367830522358, "learning_rate": 9.55942566002779e-07, "loss": 0.0002, "num_tokens": 130774621.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4757, "step_time": 19.492938246577978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 187.5, "completions/mean_terminated_length": 187.5, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.4030352756381035, "epoch": 0.2203798054654933, "frac_reward_zero_std": 1.0, "grad_norm": 0.005534633528441191, "kl": 0.0049605792155489326, "learning_rate": 9.5593330245484e-07, "loss": 0.0002, "num_tokens": 130797157.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4758, "step_time": 23.59635440632701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 140.0, "completions/mean_terminated_length": 140.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.31023237109184265, "epoch": 0.2204261232051876, "frac_reward_zero_std": 1.0, "grad_norm": 0.002594081684947014, "kl": 0.002225446922238916, "learning_rate": 9.559240389069012e-07, "loss": 0.0001, "num_tokens": 130822741.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4759, "step_time": 16.06799967214465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 143.0, "completions/mean_terminated_length": 143.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.34480416774749756, "epoch": 0.2204724409448819, "frac_reward_zero_std": 1.0, "grad_norm": 0.004018284380435944, "kl": 0.00251045823097229, "learning_rate": 9.559147753589626e-07, "loss": 0.0001, "num_tokens": 130845301.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4760, "step_time": 15.60106673464179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 178.0, "completions/mean_terminated_length": 178.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.18607999756932259, "epoch": 0.22051875868457618, "frac_reward_zero_std": 0.0, "grad_norm": 0.11738462001085281, "kl": 0.003470874042250216, "learning_rate": 9.559055118110235e-07, "loss": 0.0343, "num_tokens": 130891829.0, "reward": 0.9041609764099121, "reward_std": 0.0012101116590201855, "rewards/reward_func/mean": 0.9041609764099121, "rewards/reward_func/std": 0.0012101028114557266, "step": 4761, "step_time": 25.300740618258715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 189.1875, "completions/mean_terminated_length": 189.1875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.312733419239521, "epoch": 0.2205650764242705, "frac_reward_zero_std": 0.0, "grad_norm": 0.12185503542423248, "kl": 0.01029410946648568, "learning_rate": 9.558962482630846e-07, "loss": -0.1181, "num_tokens": 130913176.0, "reward": 0.6875, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.6875, "rewards/reward_func/std": 0.4787135720252991, "step": 4762, "step_time": 22.558964394032955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 127.5, "completions/mean_terminated_length": 127.5, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.3007870614528656, "epoch": 0.2206113941639648, "frac_reward_zero_std": 1.0, "grad_norm": 0.004395459312945604, "kl": 0.00244244173518382, "learning_rate": 9.55886984715146e-07, "loss": 0.0001, "num_tokens": 130932896.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4763, "step_time": 16.356438282877207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 182.375, "completions/mean_terminated_length": 182.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3589734584093094, "epoch": 0.2206577119036591, "frac_reward_zero_std": 1.0, "grad_norm": 0.00452509056776762, "kl": 0.00330969673814252, "learning_rate": 9.55877721167207e-07, "loss": 0.0002, "num_tokens": 130959110.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4764, "step_time": 20.84293807297945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 224.4375, "completions/mean_terminated_length": 224.4375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.4439801424741745, "epoch": 0.2207040296433534, "frac_reward_zero_std": 0.0, "grad_norm": 0.10928815603256226, "kl": 0.0056369376834481955, "learning_rate": 9.558684576192682e-07, "loss": 0.227, "num_tokens": 130981293.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.875, "rewards/reward_func/std": 0.3415650427341461, "step": 4765, "step_time": 35.4761412255466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 113.875, "completions/mean_terminated_length": 113.875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.30517885088920593, "epoch": 0.22075034738304772, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023938727099448442, "kl": 0.001815505907870829, "learning_rate": 9.558591940713293e-07, "loss": 0.0001, "num_tokens": 131002379.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4766, "step_time": 14.039244782179594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 201.25, "completions/mean_terminated_length": 201.25, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.22106963396072388, "epoch": 0.22079666512274201, "frac_reward_zero_std": 0.0, "grad_norm": 0.17539939284324646, "kl": 0.011657172464765608, "learning_rate": 9.558499305233904e-07, "loss": -0.0224, "num_tokens": 131026671.0, "reward": 0.9669851064682007, "reward_std": 0.012887682765722275, "rewards/reward_func/mean": 0.9669851064682007, "rewards/reward_func/std": 0.012887690216302872, "step": 4767, "step_time": 20.48035392165184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 132.125, "completions/mean_terminated_length": 132.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2446160390973091, "epoch": 0.2208429828624363, "frac_reward_zero_std": 1.0, "grad_norm": 0.002266390947625041, "kl": 0.0019007090595550835, "learning_rate": 9.558406669754516e-07, "loss": 0.0001, "num_tokens": 131046177.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4768, "step_time": 14.428436130285263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 179.0, "completions/mean_terminated_length": 179.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.372202567756176, "epoch": 0.2208893006021306, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038536088541150093, "kl": 0.0025483937351964414, "learning_rate": 9.558314034275127e-07, "loss": 0.0001, "num_tokens": 131072321.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4769, "step_time": 19.525206457823515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 145.375, "completions/mean_terminated_length": 145.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.13607894256711006, "epoch": 0.22093561834182493, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031790786888450384, "kl": 0.021630794275552034, "learning_rate": 9.558221398795738e-07, "loss": 0.0011, "num_tokens": 131103911.0, "reward": 0.7292129397392273, "reward_std": 0.0, "rewards/reward_func/mean": 0.7292129397392273, "rewards/reward_func/std": 0.0, "step": 4770, "step_time": 18.31699935719371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 291.1875, "completions/mean_terminated_length": 291.1875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.22177626937627792, "epoch": 0.22098193608151923, "frac_reward_zero_std": 0.0, "grad_norm": 0.07899122685194016, "kl": 0.007844890584237874, "learning_rate": 9.55812876331635e-07, "loss": -0.0933, "num_tokens": 131129674.0, "reward": 0.9250829219818115, "reward_std": 0.24668878316879272, "rewards/reward_func/mean": 0.9250829219818115, "rewards/reward_func/std": 0.24668878316879272, "step": 4771, "step_time": 29.23802850022912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 136.75, "completions/mean_terminated_length": 136.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.26854807138442993, "epoch": 0.22102825382121352, "frac_reward_zero_std": 1.0, "grad_norm": 0.004177852068096399, "kl": 0.0025815270200837404, "learning_rate": 9.55803612783696e-07, "loss": 0.0001, "num_tokens": 131151574.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4772, "step_time": 14.716210912913084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 181.3125, "completions/mean_terminated_length": 181.3125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.40225064009428024, "epoch": 0.22107457156090782, "frac_reward_zero_std": 1.0, "grad_norm": 0.005004691891372204, "kl": 0.003686443087644875, "learning_rate": 9.557943492357574e-07, "loss": 0.0002, "num_tokens": 131177611.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4773, "step_time": 20.878033470362425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 111.1875, "completions/mean_terminated_length": 111.1875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2879294231534004, "epoch": 0.22112088930060214, "frac_reward_zero_std": 1.0, "grad_norm": 0.004238381050527096, "kl": 0.002292108372785151, "learning_rate": 9.557850856878183e-07, "loss": 0.0001, "num_tokens": 131199086.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4774, "step_time": 13.041929300874472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 228.375, "completions/mean_terminated_length": 228.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.3509957119822502, "epoch": 0.22116720704029644, "frac_reward_zero_std": 0.0, "grad_norm": 0.11466275155544281, "kl": 0.02688826760277152, "learning_rate": 9.557758221398794e-07, "loss": -0.0845, "num_tokens": 131224804.0, "reward": 0.5, "reward_std": 0.5163977742195129, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5163977742195129, "step": 4775, "step_time": 25.629514146596193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.2786107286810875, "epoch": 0.22121352477999073, "frac_reward_zero_std": 0.0, "grad_norm": 0.08454053103923798, "kl": 0.010854382766410708, "learning_rate": 9.557665585919408e-07, "loss": -0.0484, "num_tokens": 131260614.0, "reward": 0.7811205387115479, "reward_std": 0.20829880237579346, "rewards/reward_func/mean": 0.7811205387115479, "rewards/reward_func/std": 0.20829881727695465, "step": 4776, "step_time": 29.490225601941347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 322.3125, "completions/mean_terminated_length": 322.3125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.34080687165260315, "epoch": 0.22125984251968503, "frac_reward_zero_std": 0.0, "grad_norm": 0.07830939441919327, "kl": 0.023424079176038504, "learning_rate": 9.557572950440019e-07, "loss": -0.189, "num_tokens": 131294955.0, "reward": 0.46473217010498047, "reward_std": 0.38188716769218445, "rewards/reward_func/mean": 0.46473217010498047, "rewards/reward_func/std": 0.38188719749450684, "step": 4777, "step_time": 36.8105660751462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 253.125, "completions/mean_terminated_length": 253.125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.21230966225266457, "epoch": 0.22130616025937935, "frac_reward_zero_std": 1.0, "grad_norm": 0.002865222282707691, "kl": 0.0025091485294979066, "learning_rate": 9.55748031496063e-07, "loss": 0.0001, "num_tokens": 131333757.0, "reward": 0.8111499547958374, "reward_std": 0.0, "rewards/reward_func/mean": 0.8111499547958374, "rewards/reward_func/std": 0.0, "step": 4778, "step_time": 26.44652072712779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 178.9375, "completions/mean_terminated_length": 178.9375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.38234594464302063, "epoch": 0.22135247799907365, "frac_reward_zero_std": 1.0, "grad_norm": 0.025664258748292923, "kl": 0.01316279056482017, "learning_rate": 9.557387679481241e-07, "loss": 0.0007, "num_tokens": 131363308.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4779, "step_time": 21.489420641213655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 133.1875, "completions/mean_terminated_length": 133.1875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.2646244913339615, "epoch": 0.22139879573876795, "frac_reward_zero_std": 1.0, "grad_norm": 0.005343126133084297, "kl": 0.0025474660797044635, "learning_rate": 9.557295044001853e-07, "loss": 0.0001, "num_tokens": 131399071.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4780, "step_time": 18.119741652160883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 155.375, "completions/mean_terminated_length": 155.375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4187124893069267, "epoch": 0.22144511347846224, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029853687155991793, "kl": 0.002423883182927966, "learning_rate": 9.557202408522464e-07, "loss": 0.0001, "num_tokens": 131428501.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4781, "step_time": 18.146856732666492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.253369003534317, "epoch": 0.22149143121815656, "frac_reward_zero_std": 0.0, "grad_norm": 0.11933569610118866, "kl": 0.009504554560407996, "learning_rate": 9.557109773043075e-07, "loss": -0.0532, "num_tokens": 131449309.0, "reward": 0.6608119010925293, "reward_std": 0.0635676234960556, "rewards/reward_func/mean": 0.6608119010925293, "rewards/reward_func/std": 0.0635676234960556, "step": 4782, "step_time": 18.973564580082893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 121.625, "completions/mean_terminated_length": 121.625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.3002912551164627, "epoch": 0.22153774895785086, "frac_reward_zero_std": 1.0, "grad_norm": 0.0045975628308951855, "kl": 0.002598756196675822, "learning_rate": 9.557017137563686e-07, "loss": 0.0001, "num_tokens": 131469575.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4783, "step_time": 14.004752777516842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 158.3125, "completions/mean_terminated_length": 158.3125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.24681223556399345, "epoch": 0.22158406669754516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0032644090242683887, "kl": 0.0022140414803288877, "learning_rate": 9.556924502084298e-07, "loss": 0.0001, "num_tokens": 131494540.0, "reward": 0.9459594488143921, "reward_std": 0.0, "rewards/reward_func/mean": 0.9459594488143921, "rewards/reward_func/std": 0.0, "step": 4784, "step_time": 17.667529467493296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 203.125, "completions/mean_terminated_length": 203.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.2953430190682411, "epoch": 0.22163038443723945, "frac_reward_zero_std": 0.0, "grad_norm": 0.10740058124065399, "kl": 0.03867760160937905, "learning_rate": 9.556831866604909e-07, "loss": -0.0442, "num_tokens": 131517790.0, "reward": 0.5540241599082947, "reward_std": 0.27742329239845276, "rewards/reward_func/mean": 0.5540241599082947, "rewards/reward_func/std": 0.27742329239845276, "step": 4785, "step_time": 24.566777862608433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 117.1875, "completions/mean_terminated_length": 117.1875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.29800402373075485, "epoch": 0.22167670217693378, "frac_reward_zero_std": 1.0, "grad_norm": 0.002167989732697606, "kl": 0.0017821225337684155, "learning_rate": 9.55673923112552e-07, "loss": 0.0001, "num_tokens": 131540881.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4786, "step_time": 13.696676794439554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 178.625, "completions/mean_terminated_length": 178.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.20346342399716377, "epoch": 0.22172301991662807, "frac_reward_zero_std": 0.0, "grad_norm": 0.12080593407154083, "kl": 0.03451876435428858, "learning_rate": 9.556646595646131e-07, "loss": -0.0104, "num_tokens": 131563147.0, "reward": 0.9727948904037476, "reward_std": 0.10882046073675156, "rewards/reward_func/mean": 0.9727948904037476, "rewards/reward_func/std": 0.10882046818733215, "step": 4787, "step_time": 18.88577552884817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 137.125, "completions/mean_terminated_length": 137.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.2903420254588127, "epoch": 0.22176933765632237, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019144221441820264, "kl": 0.0015364754654001445, "learning_rate": 9.556553960166743e-07, "loss": 0.0001, "num_tokens": 131585389.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4788, "step_time": 15.55664437264204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 122.5625, "completions/mean_terminated_length": 122.5625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.2679368704557419, "epoch": 0.22181565539601666, "frac_reward_zero_std": 1.0, "grad_norm": 0.006853099446743727, "kl": 0.00363956147339195, "learning_rate": 9.556461324687354e-07, "loss": 0.0002, "num_tokens": 131608550.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4789, "step_time": 14.307089641690254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 177.4375, "completions/mean_terminated_length": 177.4375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.17491300031542778, "epoch": 0.221861973135711, "frac_reward_zero_std": 1.0, "grad_norm": 0.001836139359511435, "kl": 0.0013805674971081316, "learning_rate": 9.556368689207967e-07, "loss": 0.0001, "num_tokens": 131645853.0, "reward": 0.8914703726768494, "reward_std": 0.0, "rewards/reward_func/mean": 0.8914703726768494, "rewards/reward_func/std": 0.0, "step": 4790, "step_time": 21.233795523643494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 215.75, "completions/mean_terminated_length": 215.75, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.241499662399292, "epoch": 0.22190829087540528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0076012942008674145, "kl": 0.009055163012817502, "learning_rate": 9.556276053728579e-07, "loss": 0.0004, "num_tokens": 131677257.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4791, "step_time": 25.706017028540373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 119.5, "completions/mean_terminated_length": 119.5, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.25134844332933426, "epoch": 0.22195460861509958, "frac_reward_zero_std": 1.0, "grad_norm": 0.002584259258583188, "kl": 0.002078727964544669, "learning_rate": 9.556183418249188e-07, "loss": 0.0001, "num_tokens": 131698961.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4792, "step_time": 15.152140188962221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 178.125, "completions/mean_terminated_length": 178.125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.36989825963974, "epoch": 0.22200092635479388, "frac_reward_zero_std": 0.0, "grad_norm": 0.11231192946434021, "kl": 0.021663016639649868, "learning_rate": 9.5560907827698e-07, "loss": 0.0742, "num_tokens": 131720243.0, "reward": 0.625, "reward_std": 0.5, "rewards/reward_func/mean": 0.625, "rewards/reward_func/std": 0.5, "step": 4793, "step_time": 21.492061279714108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 204.875, "completions/mean_terminated_length": 204.875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.37613045424222946, "epoch": 0.2220472440944882, "frac_reward_zero_std": 1.0, "grad_norm": 0.004512382671236992, "kl": 0.0037859281874261796, "learning_rate": 9.555998147290412e-07, "loss": 0.0002, "num_tokens": 131744785.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4794, "step_time": 22.01899578794837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 248.5625, "completions/mean_terminated_length": 248.5625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.23357480764389038, "epoch": 0.2220935618341825, "frac_reward_zero_std": 0.0, "grad_norm": 0.11967083811759949, "kl": 0.021958988159894943, "learning_rate": 9.555905511811024e-07, "loss": -0.1141, "num_tokens": 131783722.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 4795, "step_time": 28.449336130172014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 176.25, "completions/mean_terminated_length": 176.25, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.3793416693806648, "epoch": 0.2221398795738768, "frac_reward_zero_std": 1.0, "grad_norm": 0.002750967163592577, "kl": 0.002727110870182514, "learning_rate": 9.555812876331635e-07, "loss": 0.0001, "num_tokens": 131820478.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4796, "step_time": 22.476028122007847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 232.0625, "completions/mean_terminated_length": 232.0625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.3448811545968056, "epoch": 0.2221861973135711, "frac_reward_zero_std": 0.0, "grad_norm": 0.09937860816717148, "kl": 0.014102430315688252, "learning_rate": 9.555720240852246e-07, "loss": -0.0193, "num_tokens": 131850783.0, "reward": 0.5833531022071838, "reward_std": 0.4670347571372986, "rewards/reward_func/mean": 0.5833531022071838, "rewards/reward_func/std": 0.4670347571372986, "step": 4797, "step_time": 25.142017655074596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 158.1875, "completions/mean_terminated_length": 158.1875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3435058891773224, "epoch": 0.2222325150532654, "frac_reward_zero_std": 1.0, "grad_norm": 0.00758472690358758, "kl": 0.006733081070706248, "learning_rate": 9.555627605372857e-07, "loss": 0.0003, "num_tokens": 131872082.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4798, "step_time": 16.301632039248943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 124.0, "completions/mean_terminated_length": 124.0, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.20530404523015022, "epoch": 0.2222788327929597, "frac_reward_zero_std": 1.0, "grad_norm": 0.004247116856276989, "kl": 0.002394238661509007, "learning_rate": 9.555534969893468e-07, "loss": 0.0001, "num_tokens": 131891426.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4799, "step_time": 13.246216677129269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 177.375, "completions/mean_terminated_length": 177.375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.24138593301177025, "epoch": 0.222325150532654, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015958865405991673, "kl": 0.0024760275264270604, "learning_rate": 9.55544233441408e-07, "loss": 0.0001, "num_tokens": 131942264.0, "reward": 0.8385766744613647, "reward_std": 0.0, "rewards/reward_func/mean": 0.8385766744613647, "rewards/reward_func/std": 0.0, "step": 4800, "step_time": 25.961857695132494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 129.75, "completions/mean_terminated_length": 129.75, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.22506817057728767, "epoch": 0.2223714682723483, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031876340508461, "kl": 0.0014770270208828151, "learning_rate": 9.55534969893469e-07, "loss": 0.0001, "num_tokens": 131962740.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4801, "step_time": 13.827475391328335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 164.5, "completions/mean_terminated_length": 164.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.4195042997598648, "epoch": 0.22241778601204262, "frac_reward_zero_std": 1.0, "grad_norm": 0.002598832594230771, "kl": 0.0027095703408122063, "learning_rate": 9.555257063455302e-07, "loss": 0.0001, "num_tokens": 132002220.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4802, "step_time": 20.75063246116042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 133.1875, "completions/mean_terminated_length": 133.1875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.30575740337371826, "epoch": 0.22246410375173692, "frac_reward_zero_std": 1.0, "grad_norm": 0.005229070782661438, "kl": 0.0030032155918888748, "learning_rate": 9.555164427975916e-07, "loss": 0.0002, "num_tokens": 132030191.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4803, "step_time": 16.250312250107527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 200.5, "completions/mean_terminated_length": 200.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.17106206342577934, "epoch": 0.22251042149143122, "frac_reward_zero_std": 0.0, "grad_norm": 0.11654699593782425, "kl": 0.00641520885983482, "learning_rate": 9.555071792496527e-07, "loss": -0.0502, "num_tokens": 132058807.0, "reward": 0.8629332184791565, "reward_std": 0.04850945621728897, "rewards/reward_func/mean": 0.8629332184791565, "rewards/reward_func/std": 0.048509448766708374, "step": 4804, "step_time": 21.568435087800026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 176.8125, "completions/mean_terminated_length": 176.8125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.2175315022468567, "epoch": 0.2225567392311255, "frac_reward_zero_std": 0.0, "grad_norm": 0.15883836150169373, "kl": 0.03638976812362671, "learning_rate": 9.554979157017136e-07, "loss": -0.0371, "num_tokens": 132088628.0, "reward": 0.7926719784736633, "reward_std": 0.29351553320884705, "rewards/reward_func/mean": 0.7926719784736633, "rewards/reward_func/std": 0.29351553320884705, "step": 4805, "step_time": 19.971502542495728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 179.5, "completions/mean_terminated_length": 179.5, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.15308186411857605, "epoch": 0.22260305697081983, "frac_reward_zero_std": 1.0, "grad_norm": 0.004143744707107544, "kl": 0.0025578418280929327, "learning_rate": 9.55488652153775e-07, "loss": 0.0001, "num_tokens": 132110332.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 4806, "step_time": 17.15120692551136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 152.0, "completions/mean_terminated_length": 152.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.14756058901548386, "epoch": 0.22264937471051413, "frac_reward_zero_std": 0.0, "grad_norm": 0.2407907247543335, "kl": 0.011545327957719564, "learning_rate": 9.55479388605836e-07, "loss": 0.0067, "num_tokens": 132141404.0, "reward": 0.9204819202423096, "reward_std": 0.03945203125476837, "rewards/reward_func/mean": 0.9204819202423096, "rewards/reward_func/std": 0.03945203125476837, "step": 4807, "step_time": 17.25317219272256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 189.0, "completions/mean_terminated_length": 189.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.46452558040618896, "epoch": 0.22269569245020843, "frac_reward_zero_std": 1.0, "grad_norm": 0.008620009757578373, "kl": 0.007838520454242826, "learning_rate": 9.554701250578972e-07, "loss": 0.0004, "num_tokens": 132163964.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4808, "step_time": 20.910505171865225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 211.125, "completions/mean_terminated_length": 211.125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "entropy": 0.20958810299634933, "epoch": 0.22274201018990272, "frac_reward_zero_std": 1.0, "grad_norm": 0.002280582906678319, "kl": 0.002195667417254299, "learning_rate": 9.554608615099583e-07, "loss": 0.0001, "num_tokens": 132201678.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4809, "step_time": 23.08414401486516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 140.8125, "completions/mean_terminated_length": 140.8125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3175320103764534, "epoch": 0.22278832792959705, "frac_reward_zero_std": 1.0, "grad_norm": 0.002707999898120761, "kl": 0.0018491519440431148, "learning_rate": 9.554515979620194e-07, "loss": 0.0001, "num_tokens": 132237851.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4810, "step_time": 18.741000294685364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 142.3125, "completions/mean_terminated_length": 142.3125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3955921605229378, "epoch": 0.22283464566929134, "frac_reward_zero_std": 1.0, "grad_norm": 0.004006635397672653, "kl": 0.0033857455127872527, "learning_rate": 9.554423344140806e-07, "loss": 0.0002, "num_tokens": 132268016.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4811, "step_time": 17.484014619141817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 120.375, "completions/mean_terminated_length": 120.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.352799691259861, "epoch": 0.22288096340898564, "frac_reward_zero_std": 1.0, "grad_norm": 0.004791226238012314, "kl": 0.0028124931850470603, "learning_rate": 9.554330708661417e-07, "loss": 0.0001, "num_tokens": 132289558.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4812, "step_time": 13.757609866559505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 180.5625, "completions/mean_terminated_length": 180.5625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.25405431538820267, "epoch": 0.22292728114867993, "frac_reward_zero_std": 0.0, "grad_norm": 0.13583828508853912, "kl": 0.02078463975340128, "learning_rate": 9.554238073182028e-07, "loss": -0.0579, "num_tokens": 132310479.0, "reward": 0.6409821510314941, "reward_std": 0.2502138018608093, "rewards/reward_func/mean": 0.6409821510314941, "rewards/reward_func/std": 0.2502138018608093, "step": 4813, "step_time": 20.335052020847797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 125.0, "completions/mean_terminated_length": 125.0, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2138473130762577, "epoch": 0.22297359888837426, "frac_reward_zero_std": 1.0, "grad_norm": 0.002683066064491868, "kl": 0.0016082872316474095, "learning_rate": 9.55414543770264e-07, "loss": 0.0001, "num_tokens": 132332031.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4814, "step_time": 14.282362803816795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 159.1875, "completions/mean_terminated_length": 159.1875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.18920167535543442, "epoch": 0.22301991662806855, "frac_reward_zero_std": 1.0, "grad_norm": 0.002966281259432435, "kl": 0.001978725107619539, "learning_rate": 9.55405280222325e-07, "loss": 0.0001, "num_tokens": 132368738.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4815, "step_time": 21.076286014169455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 151.4375, "completions/mean_terminated_length": 151.4375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.23978042230010033, "epoch": 0.22306623436776285, "frac_reward_zero_std": 0.0, "grad_norm": 0.18582899868488312, "kl": 0.021093164570629597, "learning_rate": 9.553960166743864e-07, "loss": 0.0407, "num_tokens": 132389689.0, "reward": 0.8833723068237305, "reward_std": 0.005832654424011707, "rewards/reward_func/mean": 0.8833723068237305, "rewards/reward_func/std": 0.005832654424011707, "step": 4816, "step_time": 16.47979226708412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 150.125, "completions/mean_terminated_length": 150.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.36553626507520676, "epoch": 0.22311255210745715, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026367190293967724, "kl": 0.0022281501442193985, "learning_rate": 9.553867531264473e-07, "loss": 0.0001, "num_tokens": 132422779.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4817, "step_time": 20.063343908637762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 125.625, "completions/mean_terminated_length": 125.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.3270419165492058, "epoch": 0.22315886984715147, "frac_reward_zero_std": 1.0, "grad_norm": 0.008946054615080357, "kl": 0.005291900597512722, "learning_rate": 9.553774895785084e-07, "loss": 0.0003, "num_tokens": 132444917.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4818, "step_time": 14.193899523466825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 159.0625, "completions/mean_terminated_length": 159.0625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.2129964791238308, "epoch": 0.22320518758684577, "frac_reward_zero_std": 0.0, "grad_norm": 0.13394396007061005, "kl": 0.009990689344704151, "learning_rate": 9.553682260305696e-07, "loss": 0.0285, "num_tokens": 132468982.0, "reward": 0.9821569919586182, "reward_std": 0.03836126998066902, "rewards/reward_func/mean": 0.9821569919586182, "rewards/reward_func/std": 0.038361258804798126, "step": 4819, "step_time": 18.739660866558552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 129.0625, "completions/mean_terminated_length": 129.0625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.28548502177000046, "epoch": 0.22325150532654006, "frac_reward_zero_std": 1.0, "grad_norm": 0.007504717912524939, "kl": 0.0035440283245407045, "learning_rate": 9.55358962482631e-07, "loss": 0.0002, "num_tokens": 132490951.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4820, "step_time": 13.963666781783104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 134.25, "completions/mean_terminated_length": 134.25, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.2578406110405922, "epoch": 0.22329782306623436, "frac_reward_zero_std": 1.0, "grad_norm": 0.008697756566107273, "kl": 0.0033026170858647674, "learning_rate": 9.55349698934692e-07, "loss": 0.0002, "num_tokens": 132512747.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4821, "step_time": 14.992974765598774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 131.6875, "completions/mean_terminated_length": 131.6875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.29801688343286514, "epoch": 0.22334414080592868, "frac_reward_zero_std": 1.0, "grad_norm": 0.0046247392892837524, "kl": 0.003248437773436308, "learning_rate": 9.553404353867531e-07, "loss": 0.0002, "num_tokens": 132532598.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4822, "step_time": 15.101666435599327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 141.9375, "completions/mean_terminated_length": 141.9375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.2898029014468193, "epoch": 0.22339045854562298, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034714434295892715, "kl": 0.0023183961457107216, "learning_rate": 9.553311718388143e-07, "loss": 0.0001, "num_tokens": 132558277.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4823, "step_time": 16.132473524659872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 125.0, "completions/mean_terminated_length": 125.0, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.27723686397075653, "epoch": 0.22343677628531727, "frac_reward_zero_std": 1.0, "grad_norm": 0.002960590645670891, "kl": 0.002162082295399159, "learning_rate": 9.553219082908754e-07, "loss": 0.0001, "num_tokens": 132578789.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4824, "step_time": 14.366881299763918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 155.9375, "completions/mean_terminated_length": 155.9375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.445813313126564, "epoch": 0.22348309402501157, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027629341930150986, "kl": 0.0023690882953815162, "learning_rate": 9.553126447429365e-07, "loss": 0.0001, "num_tokens": 132622612.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4825, "step_time": 21.507322683930397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 203.4375, "completions/mean_terminated_length": 203.4375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.20725734159350395, "epoch": 0.2235294117647059, "frac_reward_zero_std": 1.0, "grad_norm": 0.005832557566463947, "kl": 0.009239424020051956, "learning_rate": 9.553033811949976e-07, "loss": 0.0005, "num_tokens": 132651915.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4826, "step_time": 22.097034111618996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 141.875, "completions/mean_terminated_length": 141.875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.26871396601200104, "epoch": 0.2235757295044002, "frac_reward_zero_std": 0.0, "grad_norm": 0.1823527067899704, "kl": 0.037552046589553356, "learning_rate": 9.552941176470588e-07, "loss": 0.0016, "num_tokens": 132671913.0, "reward": 0.0625, "reward_std": 0.25, "rewards/reward_func/mean": 0.0625, "rewards/reward_func/std": 0.25, "step": 4827, "step_time": 16.322473485022783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 121.5, "completions/mean_terminated_length": 121.5, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "entropy": 0.2627522647380829, "epoch": 0.22362204724409449, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035464696120470762, "kl": 0.0020022848329972476, "learning_rate": 9.5528485409912e-07, "loss": 0.0001, "num_tokens": 132692609.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4828, "step_time": 13.485189318656921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 184.375, "completions/mean_terminated_length": 184.375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.218659158796072, "epoch": 0.22366836498378878, "frac_reward_zero_std": 0.0, "grad_norm": 0.11234995722770691, "kl": 0.01561578898690641, "learning_rate": 9.55275590551181e-07, "loss": -0.0512, "num_tokens": 132722695.0, "reward": 0.4145066738128662, "reward_std": 0.293962687253952, "rewards/reward_func/mean": 0.4145066738128662, "rewards/reward_func/std": 0.293962687253952, "step": 4829, "step_time": 20.932147346436977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 116.0625, "completions/mean_terminated_length": 116.0625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2082267664372921, "epoch": 0.2237146827234831, "frac_reward_zero_std": 1.0, "grad_norm": 0.0050287009216845036, "kl": 0.0027412628987804055, "learning_rate": 9.552663270032421e-07, "loss": 0.0001, "num_tokens": 132741960.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4830, "step_time": 12.699023351073265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 224.3125, "completions/mean_terminated_length": 224.3125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.47856853902339935, "epoch": 0.2237610004631774, "frac_reward_zero_std": 0.0, "grad_norm": 0.15555942058563232, "kl": 0.008418293320573866, "learning_rate": 9.552570634553033e-07, "loss": 0.3276, "num_tokens": 132788941.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 4831, "step_time": 46.43001113459468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 127.25, "completions/mean_terminated_length": 127.25, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "entropy": 0.2749846763908863, "epoch": 0.2238073182028717, "frac_reward_zero_std": 0.0, "grad_norm": 0.19707569479942322, "kl": 0.013525299145840108, "learning_rate": 9.552477999073644e-07, "loss": -0.1544, "num_tokens": 132809345.0, "reward": 0.5, "reward_std": 0.5163977742195129, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5163977742195129, "step": 4832, "step_time": 15.494266454130411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 307.5, "completions/mean_terminated_length": 307.5, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "entropy": 0.22295811399817467, "epoch": 0.223853635942566, "frac_reward_zero_std": 0.0, "grad_norm": 0.09239844977855682, "kl": 0.014077859930694103, "learning_rate": 9.552385363594257e-07, "loss": 0.0193, "num_tokens": 132846745.0, "reward": 0.9796926975250244, "reward_std": 0.036326806992292404, "rewards/reward_func/mean": 0.9796926975250244, "rewards/reward_func/std": 0.036326806992292404, "step": 4833, "step_time": 32.30364686995745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 123.875, "completions/mean_terminated_length": 123.875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.28765248507261276, "epoch": 0.22389995368226032, "frac_reward_zero_std": 1.0, "grad_norm": 0.00323854829184711, "kl": 0.0023553569335490465, "learning_rate": 9.552292728114869e-07, "loss": 0.0001, "num_tokens": 132867399.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4834, "step_time": 13.34771578013897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 159.75, "completions/mean_terminated_length": 159.75, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.2327716276049614, "epoch": 0.2239462714219546, "frac_reward_zero_std": 1.0, "grad_norm": 0.007954198867082596, "kl": 0.006821100483648479, "learning_rate": 9.552200092635478e-07, "loss": 0.0003, "num_tokens": 132888211.0, "reward": 0.7316156029701233, "reward_std": 0.0, "rewards/reward_func/mean": 0.7316156029701233, "rewards/reward_func/std": 0.0, "step": 4835, "step_time": 16.152273803949356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 154.375, "completions/mean_terminated_length": 154.375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.1323045715689659, "epoch": 0.2239925891616489, "frac_reward_zero_std": 1.0, "grad_norm": 0.004046451300382614, "kl": 0.04179170448333025, "learning_rate": 9.552107457156091e-07, "loss": 0.0021, "num_tokens": 132912361.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4836, "step_time": 16.389708008617163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 122.0, "completions/max_terminated_length": 122.0, "completions/mean_length": 108.6875, "completions/mean_terminated_length": 108.6875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.23037029802799225, "epoch": 0.2240389069013432, "frac_reward_zero_std": 1.0, "grad_norm": 0.003705455455929041, "kl": 0.0016988228308036923, "learning_rate": 9.552014821676702e-07, "loss": 0.0001, "num_tokens": 132932676.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4837, "step_time": 12.089770846068859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 181.5, "completions/mean_terminated_length": 181.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3590536415576935, "epoch": 0.22408522464103753, "frac_reward_zero_std": 0.0, "grad_norm": 0.14399102330207825, "kl": 0.019344626693055034, "learning_rate": 9.551922186197314e-07, "loss": -0.0162, "num_tokens": 132969884.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.3415650427341461, "step": 4838, "step_time": 24.38674383610487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 173.4375, "completions/mean_terminated_length": 173.4375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.17292950302362442, "epoch": 0.22413154238073182, "frac_reward_zero_std": 1.0, "grad_norm": 0.002405304927378893, "kl": 0.0020050150342285633, "learning_rate": 9.551829550717925e-07, "loss": 0.0001, "num_tokens": 133003427.0, "reward": 0.9167169332504272, "reward_std": 0.0, "rewards/reward_func/mean": 0.9167169332504272, "rewards/reward_func/std": 0.0, "step": 4839, "step_time": 20.764228850603104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 286.3125, "completions/mean_terminated_length": 286.3125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.19746238738298416, "epoch": 0.22417786012042612, "frac_reward_zero_std": 1.0, "grad_norm": 0.00419808691367507, "kl": 0.0033777932985685766, "learning_rate": 9.551736915238536e-07, "loss": 0.0002, "num_tokens": 133034696.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4840, "step_time": 28.575475122779608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 151.1875, "completions/mean_terminated_length": 151.1875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.16610990837216377, "epoch": 0.22422417786012042, "frac_reward_zero_std": 0.0, "grad_norm": 0.12614548206329346, "kl": 0.056223172694444656, "learning_rate": 9.551644279759147e-07, "loss": -0.017, "num_tokens": 133071227.0, "reward": 0.9750921726226807, "reward_std": 0.06806114315986633, "rewards/reward_func/mean": 0.9750921726226807, "rewards/reward_func/std": 0.06806114315986633, "step": 4841, "step_time": 19.617180079221725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 171.0, "completions/mean_terminated_length": 171.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.35999053716659546, "epoch": 0.22427049559981474, "frac_reward_zero_std": 1.0, "grad_norm": 0.02135041542351246, "kl": 0.005787533358670771, "learning_rate": 9.551551644279759e-07, "loss": 0.0003, "num_tokens": 133128059.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4842, "step_time": 26.984447102993727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 209.0, "completions/mean_terminated_length": 209.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.237036794424057, "epoch": 0.22431681333950904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010071357246488333, "kl": 0.001211098482599482, "learning_rate": 9.55145900880037e-07, "loss": 0.0001, "num_tokens": 133166395.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4843, "step_time": 24.326733384281397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 292.625, "completions/mean_terminated_length": 292.625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.4216890260577202, "epoch": 0.22436313107920333, "frac_reward_zero_std": 0.0, "grad_norm": 0.07943207025527954, "kl": 0.004926699912175536, "learning_rate": 9.55136637332098e-07, "loss": 0.0642, "num_tokens": 133194901.0, "reward": 0.375, "reward_std": 0.5, "rewards/reward_func/mean": 0.375, "rewards/reward_func/std": 0.5, "step": 4844, "step_time": 28.6319671086967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 251.8125, "completions/mean_terminated_length": 251.8125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.17909115552902222, "epoch": 0.22440944881889763, "frac_reward_zero_std": 0.0, "grad_norm": 0.11312639713287354, "kl": 0.011401549680158496, "learning_rate": 9.551273737841592e-07, "loss": 0.0258, "num_tokens": 133234210.0, "reward": 0.7825256586074829, "reward_std": 0.021327367052435875, "rewards/reward_func/mean": 0.7825256586074829, "rewards/reward_func/std": 0.021327365189790726, "step": 4845, "step_time": 26.30367613211274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 169.375, "completions/mean_terminated_length": 169.375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.12030527368187904, "epoch": 0.22445576655859195, "frac_reward_zero_std": 0.0, "grad_norm": 0.10640913993120193, "kl": 0.003732732468051836, "learning_rate": 9.551181102362206e-07, "loss": -0.0576, "num_tokens": 133259416.0, "reward": 0.8599221706390381, "reward_std": 0.14197945594787598, "rewards/reward_func/mean": 0.8599221706390381, "rewards/reward_func/std": 0.14197945594787598, "step": 4846, "step_time": 18.804424412548542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 143.5625, "completions/mean_terminated_length": 143.5625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.27932412177324295, "epoch": 0.22450208429828625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031063167843967676, "kl": 0.0021145372884348035, "learning_rate": 9.551088466882817e-07, "loss": 0.0001, "num_tokens": 133279201.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4847, "step_time": 15.13044085726142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 204.1875, "completions/mean_terminated_length": 204.1875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.4376760870218277, "epoch": 0.22454840203798054, "frac_reward_zero_std": 1.0, "grad_norm": 0.005663942079991102, "kl": 0.0044380699982866645, "learning_rate": 9.550995831403426e-07, "loss": 0.0002, "num_tokens": 133309508.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4848, "step_time": 22.924985982477665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 178.9375, "completions/mean_terminated_length": 178.9375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3776480481028557, "epoch": 0.22459471977767484, "frac_reward_zero_std": 1.0, "grad_norm": 0.010664451867341995, "kl": 0.00503740314161405, "learning_rate": 9.550903195924037e-07, "loss": 0.0002, "num_tokens": 133353443.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4849, "step_time": 25.17474266514182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 213.4375, "completions/mean_terminated_length": 213.4375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.4941771626472473, "epoch": 0.22464103751736916, "frac_reward_zero_std": 0.0, "grad_norm": 0.11145656555891037, "kl": 0.010323689552024007, "learning_rate": 9.55081056044465e-07, "loss": 0.0053, "num_tokens": 133375498.0, "reward": 0.6875, "reward_std": 0.4787135720252991, "rewards/reward_func/mean": 0.6875, "rewards/reward_func/std": 0.4787135720252991, "step": 4850, "step_time": 22.1133300550282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 282.375, "completions/mean_terminated_length": 282.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.3258415758609772, "epoch": 0.22468735525706346, "frac_reward_zero_std": 0.0, "grad_norm": 0.08351670205593109, "kl": 0.016132496297359467, "learning_rate": 9.550717924965262e-07, "loss": -0.1205, "num_tokens": 133404640.0, "reward": 0.48923105001449585, "reward_std": 0.3325233459472656, "rewards/reward_func/mean": 0.48923105001449585, "rewards/reward_func/std": 0.3325233459472656, "step": 4851, "step_time": 29.518002193421125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 187.3125, "completions/mean_terminated_length": 187.3125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.4024698808789253, "epoch": 0.22473367299675776, "frac_reward_zero_std": 1.0, "grad_norm": 0.01081777922809124, "kl": 0.007352304412052035, "learning_rate": 9.550625289485873e-07, "loss": 0.0004, "num_tokens": 133438549.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4852, "step_time": 22.758445031940937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 198.375, "completions/mean_terminated_length": 198.375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.29424215108156204, "epoch": 0.22477999073645205, "frac_reward_zero_std": 0.0, "grad_norm": 0.12527966499328613, "kl": 0.029027292504906654, "learning_rate": 9.550532654006484e-07, "loss": -0.0059, "num_tokens": 133469387.0, "reward": 0.5771113634109497, "reward_std": 0.1708800345659256, "rewards/reward_func/mean": 0.5771113634109497, "rewards/reward_func/std": 0.1708800494670868, "step": 4853, "step_time": 20.81361961737275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 166.875, "completions/mean_terminated_length": 166.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.36771079152822495, "epoch": 0.22482630847614637, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024784598499536514, "kl": 0.0021381523984018713, "learning_rate": 9.550440018527096e-07, "loss": 0.0001, "num_tokens": 133499433.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4854, "step_time": 19.139034681022167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 171.1875, "completions/mean_terminated_length": 171.1875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3991065099835396, "epoch": 0.22487262621584067, "frac_reward_zero_std": 1.0, "grad_norm": 0.004687669686973095, "kl": 0.004851402132771909, "learning_rate": 9.550347383047707e-07, "loss": 0.0002, "num_tokens": 133519964.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4855, "step_time": 18.17878869175911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 160.9375, "completions/mean_terminated_length": 160.9375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.29705192148685455, "epoch": 0.22491894395553497, "frac_reward_zero_std": 0.0, "grad_norm": 0.16597846150398254, "kl": 0.02548988931812346, "learning_rate": 9.550254747568318e-07, "loss": 0.0202, "num_tokens": 133542331.0, "reward": 0.7965884804725647, "reward_std": 0.31098392605781555, "rewards/reward_func/mean": 0.7965884804725647, "rewards/reward_func/std": 0.31098392605781555, "step": 4856, "step_time": 18.238618656992912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 180.1875, "completions/mean_terminated_length": 180.1875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.4650045931339264, "epoch": 0.22496526169522926, "frac_reward_zero_std": 1.0, "grad_norm": 0.01402257476001978, "kl": 0.011438263114541769, "learning_rate": 9.55016211208893e-07, "loss": 0.0006, "num_tokens": 133567422.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4857, "step_time": 20.366154816001654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 159.3125, "completions/mean_terminated_length": 159.3125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.3271801993250847, "epoch": 0.2250115794349236, "frac_reward_zero_std": 1.0, "grad_norm": 0.021235935389995575, "kl": 0.009877411066554487, "learning_rate": 9.55006947660954e-07, "loss": 0.0005, "num_tokens": 133592803.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4858, "step_time": 17.569607835263014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 162.5, "completions/mean_terminated_length": 162.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.2190491333603859, "epoch": 0.22505789717461788, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029440466314554214, "kl": 0.0017971152847167104, "learning_rate": 9.549976841130152e-07, "loss": 0.0001, "num_tokens": 133613083.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4859, "step_time": 16.454703859984875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 192.1875, "completions/mean_terminated_length": 192.1875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.38992976397275925, "epoch": 0.22510421491431218, "frac_reward_zero_std": 0.0, "grad_norm": 0.10814333707094193, "kl": 0.007516085519455373, "learning_rate": 9.549884205650763e-07, "loss": -0.0683, "num_tokens": 133642494.0, "reward": 0.05706879496574402, "reward_std": 0.22827517986297607, "rewards/reward_func/mean": 0.05706879496574402, "rewards/reward_func/std": 0.22827517986297607, "step": 4860, "step_time": 22.814553260803223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 160.1875, "completions/mean_terminated_length": 160.1875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.2716098949313164, "epoch": 0.22515053265400647, "frac_reward_zero_std": 1.0, "grad_norm": 0.005426190327852964, "kl": 0.002436083974316716, "learning_rate": 9.549791570171374e-07, "loss": 0.0001, "num_tokens": 133663489.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4861, "step_time": 17.264394018799067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 168.6875, "completions/mean_terminated_length": 168.6875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.43288785219192505, "epoch": 0.2251968503937008, "frac_reward_zero_std": 1.0, "grad_norm": 0.003112137783318758, "kl": 0.0028312166105024517, "learning_rate": 9.549698934691986e-07, "loss": 0.0001, "num_tokens": 133716876.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4862, "step_time": 26.97471345961094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 166.625, "completions/mean_terminated_length": 166.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.20235447958111763, "epoch": 0.2252431681333951, "frac_reward_zero_std": 0.0, "grad_norm": 0.14627960324287415, "kl": 0.04060222953557968, "learning_rate": 9.5496062992126e-07, "loss": 0.0097, "num_tokens": 133748438.0, "reward": 0.9768627882003784, "reward_std": 0.035443443804979324, "rewards/reward_func/mean": 0.9768627882003784, "rewards/reward_func/std": 0.03544343635439873, "step": 4863, "step_time": 19.009066738188267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 195.3125, "completions/mean_terminated_length": 195.3125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4221041575074196, "epoch": 0.2252894858730894, "frac_reward_zero_std": 1.0, "grad_norm": 0.003028387203812599, "kl": 0.0028028357191942632, "learning_rate": 9.54951366373321e-07, "loss": 0.0001, "num_tokens": 133784283.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4864, "step_time": 23.230973970144987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 171.1875, "completions/mean_terminated_length": 171.1875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.19607597962021828, "epoch": 0.22533580361278369, "frac_reward_zero_std": 1.0, "grad_norm": 0.0043772365897893906, "kl": 0.01675504120066762, "learning_rate": 9.549421028253822e-07, "loss": 0.0008, "num_tokens": 133809422.0, "reward": 0.22842517495155334, "reward_std": 0.0, "rewards/reward_func/mean": 0.22842517495155334, "rewards/reward_func/std": 0.0, "step": 4865, "step_time": 17.712210282683372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 122.8125, "completions/mean_terminated_length": 122.8125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.272648885846138, "epoch": 0.225382121352478, "frac_reward_zero_std": 1.0, "grad_norm": 0.002636377001181245, "kl": 0.0020418757922016084, "learning_rate": 9.54932839277443e-07, "loss": 0.0001, "num_tokens": 133830955.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4866, "step_time": 14.760128911584616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 222.6875, "completions/mean_terminated_length": 222.6875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.3675032928586006, "epoch": 0.2254284390921723, "frac_reward_zero_std": 0.0, "grad_norm": 0.08764305710792542, "kl": 0.02246140781790018, "learning_rate": 9.549235757295044e-07, "loss": -0.0545, "num_tokens": 133859830.0, "reward": 0.1875, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.1875, "rewards/reward_func/std": 0.40311288833618164, "step": 4867, "step_time": 24.06194742396474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 220.375, "completions/mean_terminated_length": 220.375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.33972568809986115, "epoch": 0.2254747568318666, "frac_reward_zero_std": 0.0, "grad_norm": 0.11992360651493073, "kl": 0.018236295087262988, "learning_rate": 9.549143121815655e-07, "loss": 0.0123, "num_tokens": 133891916.0, "reward": 0.4575090706348419, "reward_std": 0.4290746748447418, "rewards/reward_func/mean": 0.4575090706348419, "rewards/reward_func/std": 0.4290747046470642, "step": 4868, "step_time": 23.201321847736835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 188.4375, "completions/mean_terminated_length": 188.4375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.2368924766778946, "epoch": 0.2255210745715609, "frac_reward_zero_std": 0.0, "grad_norm": 0.07918360084295273, "kl": 0.006348276045173407, "learning_rate": 9.549050486336267e-07, "loss": -0.1143, "num_tokens": 133915491.0, "reward": 0.2714614272117615, "reward_std": 0.06292904913425446, "rewards/reward_func/mean": 0.2714614272117615, "rewards/reward_func/std": 0.06292904168367386, "step": 4869, "step_time": 24.9329380877316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.3972603529691696, "epoch": 0.22556739231125522, "frac_reward_zero_std": 0.0, "grad_norm": 0.09982740879058838, "kl": 0.022194479126483202, "learning_rate": 9.548957850856878e-07, "loss": -0.0595, "num_tokens": 133946435.0, "reward": 0.7962749004364014, "reward_std": 0.3643563985824585, "rewards/reward_func/mean": 0.7962749004364014, "rewards/reward_func/std": 0.3643563985824585, "step": 4870, "step_time": 34.399493522942066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 156.5, "completions/mean_terminated_length": 156.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3247595801949501, "epoch": 0.22561371005094952, "frac_reward_zero_std": 1.0, "grad_norm": 0.00221113464795053, "kl": 0.0019436216680333018, "learning_rate": 9.54886521537749e-07, "loss": 0.0001, "num_tokens": 133984075.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4871, "step_time": 20.088803719729185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 130.3125, "completions/mean_terminated_length": 130.3125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.32415852695703506, "epoch": 0.2256600277906438, "frac_reward_zero_std": 1.0, "grad_norm": 0.0054161823354661465, "kl": 0.002918284444604069, "learning_rate": 9.5487725798981e-07, "loss": 0.0001, "num_tokens": 134006480.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4872, "step_time": 15.15906186774373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 141.4375, "completions/mean_terminated_length": 141.4375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.33338750898838043, "epoch": 0.2257063455303381, "frac_reward_zero_std": 1.0, "grad_norm": 0.0047162859700620174, "kl": 0.0036268249386921525, "learning_rate": 9.548679944418712e-07, "loss": 0.0002, "num_tokens": 134030695.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4873, "step_time": 15.892490446567535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 218.1875, "completions/mean_terminated_length": 218.1875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "entropy": 0.1801813393831253, "epoch": 0.22575266327003243, "frac_reward_zero_std": 1.0, "grad_norm": 0.004071139730513096, "kl": 0.0053452952997758985, "learning_rate": 9.548587308939323e-07, "loss": 0.0003, "num_tokens": 134062810.0, "reward": 0.33445996046066284, "reward_std": 0.0, "rewards/reward_func/mean": 0.33445996046066284, "rewards/reward_func/std": 0.0, "step": 4874, "step_time": 23.980526633560658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 148.875, "completions/mean_terminated_length": 148.875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.40750253200531006, "epoch": 0.22579898100972673, "frac_reward_zero_std": 1.0, "grad_norm": 0.0073066893965005875, "kl": 0.005031371954828501, "learning_rate": 9.548494673459934e-07, "loss": 0.0003, "num_tokens": 134087144.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4875, "step_time": 17.886852901428938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.30172694474458694, "epoch": 0.22584529874942103, "frac_reward_zero_std": 1.0, "grad_norm": 0.0398525670170784, "kl": 0.003992043435573578, "learning_rate": 9.548402037980547e-07, "loss": 0.0002, "num_tokens": 134116128.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4876, "step_time": 19.714687902480364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 157.0625, "completions/mean_terminated_length": 157.0625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.20624738186597824, "epoch": 0.22589161648911532, "frac_reward_zero_std": 1.0, "grad_norm": 0.010315127670764923, "kl": 0.07822940871119499, "learning_rate": 9.548309402501159e-07, "loss": 0.0039, "num_tokens": 134136657.0, "reward": 0.7403417825698853, "reward_std": 0.0, "rewards/reward_func/mean": 0.7403417825698853, "rewards/reward_func/std": 0.0, "step": 4877, "step_time": 17.194463308900595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 137.25, "completions/mean_terminated_length": 137.25, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.31360918283462524, "epoch": 0.22593793422880964, "frac_reward_zero_std": 1.0, "grad_norm": 0.00660826126113534, "kl": 0.004416961804963648, "learning_rate": 9.548216767021768e-07, "loss": 0.0002, "num_tokens": 134158757.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4878, "step_time": 16.34247576072812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 204.625, "completions/mean_terminated_length": 204.625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.24317792803049088, "epoch": 0.22598425196850394, "frac_reward_zero_std": 0.0, "grad_norm": 0.17206689715385437, "kl": 0.02334041357971728, "learning_rate": 9.54812413154238e-07, "loss": -0.0217, "num_tokens": 134182671.0, "reward": 0.7352595329284668, "reward_std": 0.10705790668725967, "rewards/reward_func/mean": 0.7352595329284668, "rewards/reward_func/std": 0.10705791413784027, "step": 4879, "step_time": 19.69738906994462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 110.9375, "completions/mean_terminated_length": 110.9375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.2758304551243782, "epoch": 0.22603056970819824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022120364010334015, "kl": 0.0017564271984156221, "learning_rate": 9.548031496062992e-07, "loss": 0.0001, "num_tokens": 134203342.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4880, "step_time": 12.976031545549631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 199.5625, "completions/mean_terminated_length": 199.5625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.32215501368045807, "epoch": 0.22607688744789253, "frac_reward_zero_std": 0.0, "grad_norm": 0.1277199685573578, "kl": 0.015220458968542516, "learning_rate": 9.547938860583604e-07, "loss": 0.0089, "num_tokens": 134237143.0, "reward": 0.9481692314147949, "reward_std": 0.001204445492476225, "rewards/reward_func/mean": 0.9481692314147949, "rewards/reward_func/std": 0.001204445492476225, "step": 4881, "step_time": 23.702144730836153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 150.5625, "completions/mean_terminated_length": 150.5625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.41165612637996674, "epoch": 0.22612320518758686, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016905659576877952, "kl": 0.0018852135981433094, "learning_rate": 9.547846225104215e-07, "loss": 0.0001, "num_tokens": 134279904.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4882, "step_time": 20.901509072631598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.21951115131378174, "epoch": 0.22616952292728115, "frac_reward_zero_std": 0.0, "grad_norm": 0.25192928314208984, "kl": 0.0586194870993495, "learning_rate": 9.547753589624826e-07, "loss": -0.0217, "num_tokens": 134300826.0, "reward": 0.5538828372955322, "reward_std": 0.5049868226051331, "rewards/reward_func/mean": 0.5538828372955322, "rewards/reward_func/std": 0.5049868226051331, "step": 4883, "step_time": 16.96540416777134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 171.5625, "completions/mean_terminated_length": 171.5625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.37507422268390656, "epoch": 0.22621584066697545, "frac_reward_zero_std": 1.0, "grad_norm": 0.004508259706199169, "kl": 0.0031737511744722724, "learning_rate": 9.547660954145437e-07, "loss": 0.0002, "num_tokens": 134330643.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4884, "step_time": 20.733138252049685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 164.4375, "completions/mean_terminated_length": 164.4375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.35645201057195663, "epoch": 0.22626215840666974, "frac_reward_zero_std": 0.0, "grad_norm": 0.18233071267604828, "kl": 0.011758289067074656, "learning_rate": 9.547568318666049e-07, "loss": -0.0929, "num_tokens": 134351210.0, "reward": 0.05026397109031677, "reward_std": 0.2010558843612671, "rewards/reward_func/mean": 0.05026397109031677, "rewards/reward_func/std": 0.2010558843612671, "step": 4885, "step_time": 20.797563511878252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 165.4375, "completions/mean_terminated_length": 165.4375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.3204440176486969, "epoch": 0.22630847614636407, "frac_reward_zero_std": 1.0, "grad_norm": 0.004753609653562307, "kl": 0.0034997077891603112, "learning_rate": 9.54747568318666e-07, "loss": 0.0002, "num_tokens": 134373169.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4886, "step_time": 18.538682401180267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 181.6875, "completions/mean_terminated_length": 181.6875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.10731195658445358, "epoch": 0.22635479388605836, "frac_reward_zero_std": 0.0, "grad_norm": 0.4505479335784912, "kl": 0.12920604646205902, "learning_rate": 9.547383047707271e-07, "loss": 0.0289, "num_tokens": 134398988.0, "reward": 0.9848394989967346, "reward_std": 0.06064195558428764, "rewards/reward_func/mean": 0.9848394989967346, "rewards/reward_func/std": 0.06064195930957794, "step": 4887, "step_time": 18.65380907431245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 130.6875, "completions/mean_terminated_length": 130.6875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.27973808348178864, "epoch": 0.22640111162575266, "frac_reward_zero_std": 1.0, "grad_norm": 0.005234105978161097, "kl": 0.0028177001513540745, "learning_rate": 9.547290412227882e-07, "loss": 0.0001, "num_tokens": 134419927.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4888, "step_time": 14.913063060492277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 123.0625, "completions/mean_terminated_length": 123.0625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.23313026875257492, "epoch": 0.22644742936544696, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038439063355326653, "kl": 0.0019966693071182817, "learning_rate": 9.547197776748494e-07, "loss": 0.0001, "num_tokens": 134439560.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4889, "step_time": 14.961370319128036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 164.6875, "completions/mean_terminated_length": 164.6875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.3789483979344368, "epoch": 0.22649374710514128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033721274230629206, "kl": 0.0027932623052038252, "learning_rate": 9.547105141269107e-07, "loss": 0.0001, "num_tokens": 134471923.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4890, "step_time": 18.853180646896362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 237.8125, "completions/mean_terminated_length": 237.8125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "entropy": 0.23378483206033707, "epoch": 0.22654006484483558, "frac_reward_zero_std": 1.0, "grad_norm": 0.0055068242363631725, "kl": 0.006216542446054518, "learning_rate": 9.547012505789716e-07, "loss": 0.0003, "num_tokens": 134494656.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4891, "step_time": 22.65500281378627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 259.9375, "completions/mean_terminated_length": 259.9375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.3920636996626854, "epoch": 0.22658638258452987, "frac_reward_zero_std": 0.0, "grad_norm": 0.13632963597774506, "kl": 0.01857087411917746, "learning_rate": 9.546919870310327e-07, "loss": -0.2208, "num_tokens": 134523295.0, "reward": 0.11859824508428574, "reward_std": 0.24871979653835297, "rewards/reward_func/mean": 0.11859824508428574, "rewards/reward_func/std": 0.24871981143951416, "step": 4892, "step_time": 38.42748509719968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 158.4375, "completions/mean_terminated_length": 158.4375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.2201010026037693, "epoch": 0.22663270032422417, "frac_reward_zero_std": 0.0, "grad_norm": 0.1770990639925003, "kl": 0.009945785510353744, "learning_rate": 9.54682723483094e-07, "loss": 0.0005, "num_tokens": 134544246.0, "reward": 0.9550249576568604, "reward_std": 0.040965043008327484, "rewards/reward_func/mean": 0.9550249576568604, "rewards/reward_func/std": 0.040965043008327484, "step": 4893, "step_time": 17.550763957202435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 168.0, "completions/mean_terminated_length": 168.0, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.4384583607316017, "epoch": 0.2266790180639185, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027022287249565125, "kl": 0.002615723351482302, "learning_rate": 9.546734599351552e-07, "loss": 0.0001, "num_tokens": 134597286.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4894, "step_time": 26.55898481607437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 172.0, "completions/mean_terminated_length": 172.0, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.19407839700579643, "epoch": 0.2267253358036128, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021540976595133543, "kl": 0.0016915185115067288, "learning_rate": 9.546641963872163e-07, "loss": 0.0001, "num_tokens": 134627958.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4895, "step_time": 20.132581286132336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 128.8125, "completions/mean_terminated_length": 128.8125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.26934972405433655, "epoch": 0.22677165354330708, "frac_reward_zero_std": 1.0, "grad_norm": 0.006218805443495512, "kl": 0.004028201394248754, "learning_rate": 9.546549328392774e-07, "loss": 0.0002, "num_tokens": 134650403.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4896, "step_time": 14.688025809824467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 199.375, "completions/mean_terminated_length": 199.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.14547937363386154, "epoch": 0.22681797128300138, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035513699986040592, "kl": 0.004947292327415198, "learning_rate": 9.546456692913386e-07, "loss": 0.0002, "num_tokens": 134672585.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4897, "step_time": 19.39472997561097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 181.3125, "completions/mean_terminated_length": 181.3125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.45307208597660065, "epoch": 0.2268642890226957, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023464923724532127, "kl": 0.0026051446329802275, "learning_rate": 9.546364057433997e-07, "loss": 0.0001, "num_tokens": 134717230.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4898, "step_time": 26.225048411637545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 167.875, "completions/mean_terminated_length": 167.875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3557990789413452, "epoch": 0.22691060676239, "frac_reward_zero_std": 0.0, "grad_norm": 0.1502242088317871, "kl": 0.014027463272213936, "learning_rate": 9.546271421954608e-07, "loss": 0.0429, "num_tokens": 134738252.0, "reward": 0.4478321075439453, "reward_std": 0.3582656681537628, "rewards/reward_func/mean": 0.4478321075439453, "rewards/reward_func/std": 0.3582656681537628, "step": 4899, "step_time": 18.294767417013645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 165.0, "completions/max_terminated_length": 165.0, "completions/mean_length": 137.3125, "completions/mean_terminated_length": 137.3125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.24131276458501816, "epoch": 0.2269569245020843, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026569655165076256, "kl": 0.002151376858819276, "learning_rate": 9.54617878647522e-07, "loss": 0.0001, "num_tokens": 134760081.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4900, "step_time": 15.34033501893282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 144.0, "completions/mean_terminated_length": 144.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3678324818611145, "epoch": 0.2270032422417786, "frac_reward_zero_std": 1.0, "grad_norm": 0.00403813598677516, "kl": 0.004269542056135833, "learning_rate": 9.54608615099583e-07, "loss": 0.0002, "num_tokens": 134792385.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4901, "step_time": 17.436243526637554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 190.0, "completions/max_terminated_length": 190.0, "completions/mean_length": 152.0625, "completions/mean_terminated_length": 152.0625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.17099004611372948, "epoch": 0.22704955998147291, "frac_reward_zero_std": 0.0, "grad_norm": 0.35982704162597656, "kl": 0.03482277039438486, "learning_rate": 9.545993515516442e-07, "loss": -0.0242, "num_tokens": 134813042.0, "reward": 0.4540383219718933, "reward_std": 0.2948898673057556, "rewards/reward_func/mean": 0.4540383219718933, "rewards/reward_func/std": 0.2948898673057556, "step": 4902, "step_time": 16.71489003673196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 230.3125, "completions/mean_terminated_length": 230.3125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.1927366815507412, "epoch": 0.2270958777211672, "frac_reward_zero_std": 0.0, "grad_norm": 0.13495691120624542, "kl": 0.01703414274379611, "learning_rate": 9.545900880037053e-07, "loss": -0.1234, "num_tokens": 134849895.0, "reward": 0.5759466886520386, "reward_std": 0.20994091033935547, "rewards/reward_func/mean": 0.5759466886520386, "rewards/reward_func/std": 0.20994091033935547, "step": 4903, "step_time": 26.500569820404053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 147.0, "completions/max_terminated_length": 147.0, "completions/mean_length": 120.1875, "completions/mean_terminated_length": 120.1875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.2903394475579262, "epoch": 0.2271421954608615, "frac_reward_zero_std": 1.0, "grad_norm": 0.010166214779019356, "kl": 0.004449092783033848, "learning_rate": 9.545808244557664e-07, "loss": 0.0002, "num_tokens": 134871130.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4904, "step_time": 14.358096912503242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 169.625, "completions/mean_terminated_length": 169.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.35955333709716797, "epoch": 0.2271885132005558, "frac_reward_zero_std": 1.0, "grad_norm": 0.004139995668083429, "kl": 0.00286450277781114, "learning_rate": 9.545715609078276e-07, "loss": 0.0001, "num_tokens": 134893124.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4905, "step_time": 17.006395772099495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 173.6875, "completions/mean_terminated_length": 173.6875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.30060071498155594, "epoch": 0.22723483094025013, "frac_reward_zero_std": 1.0, "grad_norm": 0.002366934670135379, "kl": 0.001916272594826296, "learning_rate": 9.54562297359889e-07, "loss": 0.0001, "num_tokens": 134917535.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4906, "step_time": 18.552723117172718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 182.75, "completions/mean_terminated_length": 182.75, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.20699544623494148, "epoch": 0.22728114867994442, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019894258584827185, "kl": 0.0015113575500436127, "learning_rate": 9.5455303381195e-07, "loss": 0.0001, "num_tokens": 134947003.0, "reward": 0.9355069994926453, "reward_std": 0.0, "rewards/reward_func/mean": 0.9355069994926453, "rewards/reward_func/std": 0.0, "step": 4907, "step_time": 20.712279092520475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 136.8125, "completions/mean_terminated_length": 136.8125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.32988011091947556, "epoch": 0.22732746641963872, "frac_reward_zero_std": 1.0, "grad_norm": 0.013620874844491482, "kl": 0.003609262639656663, "learning_rate": 9.545437702640112e-07, "loss": 0.0002, "num_tokens": 134972168.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4908, "step_time": 15.561688888818026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 250.5, "completions/mean_terminated_length": 250.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.24873162060976028, "epoch": 0.22737378415933301, "frac_reward_zero_std": 1.0, "grad_norm": 0.005233952309936285, "kl": 0.00603331346064806, "learning_rate": 9.54534506716072e-07, "loss": 0.0003, "num_tokens": 135011504.0, "reward": 0.7873122096061707, "reward_std": 0.0, "rewards/reward_func/mean": 0.7873122096061707, "rewards/reward_func/std": 0.0, "step": 4909, "step_time": 27.324167914688587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 170.0625, "completions/mean_terminated_length": 170.0625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.179918572306633, "epoch": 0.22742010189902734, "frac_reward_zero_std": 0.0, "grad_norm": 0.11866836994886398, "kl": 0.02200413029640913, "learning_rate": 9.545252431681334e-07, "loss": -0.0081, "num_tokens": 135034449.0, "reward": 0.9923305511474609, "reward_std": 0.016488710418343544, "rewards/reward_func/mean": 0.9923305511474609, "rewards/reward_func/std": 0.016488708555698395, "step": 4910, "step_time": 17.212855003774166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 161.125, "completions/mean_terminated_length": 161.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.18011927977204323, "epoch": 0.22746641963872163, "frac_reward_zero_std": 1.0, "grad_norm": 0.008853848092257977, "kl": 0.0037003319012001157, "learning_rate": 9.545159796201945e-07, "loss": 0.0002, "num_tokens": 135055475.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4911, "step_time": 16.66002894937992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 185.875, "completions/mean_terminated_length": 185.875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.17935670539736748, "epoch": 0.22751273737841593, "frac_reward_zero_std": 0.0, "grad_norm": 0.10368917882442474, "kl": 0.005186059512197971, "learning_rate": 9.545067160722557e-07, "loss": 0.0001, "num_tokens": 135092625.0, "reward": 0.7021901607513428, "reward_std": 0.036369748413562775, "rewards/reward_func/mean": 0.7021901607513428, "rewards/reward_func/std": 0.03636974096298218, "step": 4912, "step_time": 21.843701250851154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 137.5, "completions/mean_terminated_length": 137.5, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.31793226301670074, "epoch": 0.22755905511811023, "frac_reward_zero_std": 1.0, "grad_norm": 0.002297846833243966, "kl": 0.0020571271888911724, "learning_rate": 9.544974525243168e-07, "loss": 0.0001, "num_tokens": 135113817.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4913, "step_time": 15.05922843515873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 194.0, "completions/mean_terminated_length": 194.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.17006618157029152, "epoch": 0.22760537285780455, "frac_reward_zero_std": 1.0, "grad_norm": 0.006562127731740475, "kl": 0.005386265926063061, "learning_rate": 9.54488188976378e-07, "loss": 0.0003, "num_tokens": 135138057.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4914, "step_time": 18.65556411445141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 162.6875, "completions/mean_terminated_length": 162.6875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.4126560464501381, "epoch": 0.22765169059749885, "frac_reward_zero_std": 1.0, "grad_norm": 0.004423164296895266, "kl": 0.002680208534002304, "learning_rate": 9.54478925428439e-07, "loss": 0.0001, "num_tokens": 135177540.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4915, "step_time": 20.52718361467123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 108.1875, "completions/mean_terminated_length": 108.1875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.32868946343660355, "epoch": 0.22769800833719314, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030916209798306227, "kl": 0.0024434159859083593, "learning_rate": 9.544696618805002e-07, "loss": 0.0001, "num_tokens": 135201207.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4916, "step_time": 13.112210553139448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 218.0, "completions/mean_terminated_length": 218.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.48267073929309845, "epoch": 0.22774432607688744, "frac_reward_zero_std": 1.0, "grad_norm": 0.007472446653991938, "kl": 0.005911440821364522, "learning_rate": 9.544603983325613e-07, "loss": 0.0003, "num_tokens": 135230311.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4917, "step_time": 23.791458789259195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 191.0625, "completions/mean_terminated_length": 191.0625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.34144359081983566, "epoch": 0.22779064381658176, "frac_reward_zero_std": 0.0, "grad_norm": 0.16799886524677277, "kl": 0.022874554619193077, "learning_rate": 9.544511347846224e-07, "loss": -0.0042, "num_tokens": 135251256.0, "reward": 0.9706025123596191, "reward_std": 0.08568520843982697, "rewards/reward_func/mean": 0.9706025123596191, "rewards/reward_func/std": 0.08568520843982697, "step": 4918, "step_time": 19.11159225180745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 115.6875, "completions/mean_terminated_length": 115.6875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "entropy": 0.2824091613292694, "epoch": 0.22783696155627606, "frac_reward_zero_std": 1.0, "grad_norm": 0.0046332888305187225, "kl": 0.0032130761828739196, "learning_rate": 9.544418712366835e-07, "loss": 0.0002, "num_tokens": 135272195.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4919, "step_time": 12.575989741832018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 236.625, "completions/mean_terminated_length": 236.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.3163532689213753, "epoch": 0.22788327929597035, "frac_reward_zero_std": 0.0, "grad_norm": 0.1030849814414978, "kl": 0.017714201007038355, "learning_rate": 9.544326076887449e-07, "loss": -0.0116, "num_tokens": 135307485.0, "reward": 0.6404792070388794, "reward_std": 0.17094388604164124, "rewards/reward_func/mean": 0.6404792070388794, "rewards/reward_func/std": 0.17094388604164124, "step": 4920, "step_time": 29.51052325963974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 123.6875, "completions/mean_terminated_length": 123.6875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.34754548966884613, "epoch": 0.22792959703566465, "frac_reward_zero_std": 1.0, "grad_norm": 0.00761881098151207, "kl": 0.004353253520093858, "learning_rate": 9.544233441408058e-07, "loss": 0.0002, "num_tokens": 135330440.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4921, "step_time": 13.971477288752794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 172.125, "completions/mean_terminated_length": 172.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.36169762909412384, "epoch": 0.22797591477535897, "frac_reward_zero_std": 1.0, "grad_norm": 0.002824055962264538, "kl": 0.002495995140634477, "learning_rate": 9.54414080592867e-07, "loss": 0.0001, "num_tokens": 135351450.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4922, "step_time": 19.587568532675505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 207.25, "completions/mean_terminated_length": 207.25, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.4654117077589035, "epoch": 0.22802223251505327, "frac_reward_zero_std": 0.0, "grad_norm": 0.11375361680984497, "kl": 0.0049705225974321365, "learning_rate": 9.544048170449282e-07, "loss": 0.0723, "num_tokens": 135378478.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 4923, "step_time": 23.855288103222847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.34130582958459854, "epoch": 0.22806855025474757, "frac_reward_zero_std": 1.0, "grad_norm": 0.010463550686836243, "kl": 0.007971058366820216, "learning_rate": 9.543955534969894e-07, "loss": 0.0004, "num_tokens": 135399584.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4924, "step_time": 19.53496042266488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 141.0625, "completions/mean_terminated_length": 141.0625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.15614399872720242, "epoch": 0.22811486799444186, "frac_reward_zero_std": 1.0, "grad_norm": 0.002858600812032819, "kl": 0.003169603645801544, "learning_rate": 9.543862899490505e-07, "loss": 0.0002, "num_tokens": 135420673.0, "reward": 0.9200444221496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.9200444221496582, "rewards/reward_func/std": 0.0, "step": 4925, "step_time": 15.308533191680908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 346.0625, "completions/mean_terminated_length": 346.0625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "entropy": 0.13987799733877182, "epoch": 0.22816118573413618, "frac_reward_zero_std": 0.0, "grad_norm": 0.07261361181735992, "kl": 0.024310471722856164, "learning_rate": 9.543770264011116e-07, "loss": 0.0669, "num_tokens": 135462450.0, "reward": 0.673902153968811, "reward_std": 0.044730495661497116, "rewards/reward_func/mean": 0.673902153968811, "rewards/reward_func/std": 0.04473051056265831, "step": 4926, "step_time": 35.43450753763318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 142.125, "completions/mean_terminated_length": 142.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3148866668343544, "epoch": 0.22820750347383048, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025384712498635054, "kl": 0.001969317498151213, "learning_rate": 9.543677628531727e-07, "loss": 0.0001, "num_tokens": 135493700.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4927, "step_time": 17.43789406493306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 167.25, "completions/mean_terminated_length": 167.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.17270001769065857, "epoch": 0.22825382121352478, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038994576316326857, "kl": 0.0033217769814655185, "learning_rate": 9.543584993052339e-07, "loss": 0.0002, "num_tokens": 135529992.0, "reward": 0.910879909992218, "reward_std": 0.0, "rewards/reward_func/mean": 0.910879909992218, "rewards/reward_func/std": 0.0, "step": 4928, "step_time": 21.56278222426772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 131.3125, "completions/mean_terminated_length": 131.3125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.23226439207792282, "epoch": 0.22830013895321907, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023610901553183794, "kl": 0.0018795774085447192, "learning_rate": 9.54349235757295e-07, "loss": 0.0001, "num_tokens": 135549549.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4929, "step_time": 13.779592674225569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 147.5, "completions/mean_terminated_length": 147.5, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3198471963405609, "epoch": 0.2283464566929134, "frac_reward_zero_std": 1.0, "grad_norm": 0.0041888924315571785, "kl": 0.0027100183069705963, "learning_rate": 9.543399722093561e-07, "loss": 0.0001, "num_tokens": 135572741.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4930, "step_time": 16.231150288134813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 202.0, "completions/mean_terminated_length": 202.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.43099071830511093, "epoch": 0.2283927744326077, "frac_reward_zero_std": 0.0, "grad_norm": 0.11954791843891144, "kl": 0.009016307070851326, "learning_rate": 9.543307086614172e-07, "loss": -0.0155, "num_tokens": 135596293.0, "reward": 0.8848613500595093, "reward_std": 0.06865546852350235, "rewards/reward_func/mean": 0.8848613500595093, "rewards/reward_func/std": 0.06865545362234116, "step": 4931, "step_time": 23.168150942772627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 209.0625, "completions/mean_terminated_length": 209.0625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.1873588114976883, "epoch": 0.228439092172302, "frac_reward_zero_std": 1.0, "grad_norm": 0.010554197244346142, "kl": 0.04245026782155037, "learning_rate": 9.543214451134784e-07, "loss": 0.0021, "num_tokens": 135628486.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4932, "step_time": 22.609506770968437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 180.5625, "completions/mean_terminated_length": 180.5625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.40849726647138596, "epoch": 0.22848540991199628, "frac_reward_zero_std": 1.0, "grad_norm": 0.004541236907243729, "kl": 0.003340022871270776, "learning_rate": 9.543121815655397e-07, "loss": 0.0002, "num_tokens": 135686847.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4933, "step_time": 29.117306102067232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 144.4375, "completions/mean_terminated_length": 144.4375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.2771778255701065, "epoch": 0.2285317276516906, "frac_reward_zero_std": 1.0, "grad_norm": 0.009852061048150063, "kl": 0.007017065770924091, "learning_rate": 9.543029180176006e-07, "loss": 0.0004, "num_tokens": 135706598.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4934, "step_time": 14.810244608670473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 159.6875, "completions/mean_terminated_length": 159.6875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.28022734820842743, "epoch": 0.2285780453913849, "frac_reward_zero_std": 1.0, "grad_norm": 0.0047379145398736, "kl": 0.0032934931805357337, "learning_rate": 9.542936544696617e-07, "loss": 0.0002, "num_tokens": 135730769.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4935, "step_time": 17.938908737152815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 135.375, "completions/mean_terminated_length": 135.375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.3413321375846863, "epoch": 0.2286243631310792, "frac_reward_zero_std": 1.0, "grad_norm": 0.003127356292679906, "kl": 0.002794792235363275, "learning_rate": 9.542843909217229e-07, "loss": 0.0001, "num_tokens": 135751655.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4936, "step_time": 14.597114082425833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 131.5, "completions/mean_terminated_length": 131.5, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "entropy": 0.3287753537297249, "epoch": 0.2286706808707735, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030219038017094135, "kl": 0.0021067450288683176, "learning_rate": 9.542751273737842e-07, "loss": 0.0001, "num_tokens": 135785455.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4937, "step_time": 17.11738248169422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 174.125, "completions/mean_terminated_length": 174.125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.1680516116321087, "epoch": 0.22871699861046782, "frac_reward_zero_std": 1.0, "grad_norm": 0.002871938282623887, "kl": 0.0023986430023796856, "learning_rate": 9.542658638258453e-07, "loss": 0.0001, "num_tokens": 135823729.0, "reward": 0.8702397346496582, "reward_std": 0.0, "rewards/reward_func/mean": 0.8702397346496582, "rewards/reward_func/std": 0.0, "step": 4938, "step_time": 21.362465284764767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 222.75, "completions/mean_terminated_length": 222.75, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.3788619041442871, "epoch": 0.22876331635016212, "frac_reward_zero_std": 0.0, "grad_norm": 0.10780991613864899, "kl": 0.01446099323220551, "learning_rate": 9.542566002779065e-07, "loss": -0.0269, "num_tokens": 135855373.0, "reward": 0.034384146332740784, "reward_std": 0.035511795431375504, "rewards/reward_func/mean": 0.034384146332740784, "rewards/reward_func/std": 0.035511795431375504, "step": 4939, "step_time": 25.42097443714738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 166.0, "completions/max_terminated_length": 166.0, "completions/mean_length": 142.875, "completions/mean_terminated_length": 142.875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.2784198522567749, "epoch": 0.2288096340898564, "frac_reward_zero_std": 1.0, "grad_norm": 0.00905576627701521, "kl": 0.007316823350265622, "learning_rate": 9.542473367299676e-07, "loss": 0.0004, "num_tokens": 135877403.0, "reward": 0.49658530950546265, "reward_std": 0.0, "rewards/reward_func/mean": 0.49658530950546265, "rewards/reward_func/std": 0.0, "step": 4940, "step_time": 15.711755707859993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 185.5625, "completions/mean_terminated_length": 185.5625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.19240785762667656, "epoch": 0.2288559518295507, "frac_reward_zero_std": 0.0, "grad_norm": 0.10340874642133713, "kl": 0.010724761057645082, "learning_rate": 9.542380731820287e-07, "loss": -0.0762, "num_tokens": 135900740.0, "reward": 0.5074652433395386, "reward_std": 0.19226588308811188, "rewards/reward_func/mean": 0.5074652433395386, "rewards/reward_func/std": 0.19226589798927307, "step": 4941, "step_time": 21.3562855347991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 128.0625, "completions/mean_terminated_length": 128.0625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.21739665046334267, "epoch": 0.22890226956924503, "frac_reward_zero_std": 1.0, "grad_norm": 0.00455134455114603, "kl": 0.002902865700889379, "learning_rate": 9.542288096340898e-07, "loss": 0.0001, "num_tokens": 135920325.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4942, "step_time": 13.519710768014193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 128.0, "completions/max_terminated_length": 128.0, "completions/mean_length": 117.0, "completions/mean_terminated_length": 117.0, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.2576649561524391, "epoch": 0.22894858730893933, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026868912391364574, "kl": 0.0017338355246465653, "learning_rate": 9.54219546086151e-07, "loss": 0.0001, "num_tokens": 135940117.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4943, "step_time": 12.438980303704739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 213.4375, "completions/mean_terminated_length": 213.4375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.22489995509386063, "epoch": 0.22899490504863362, "frac_reward_zero_std": 1.0, "grad_norm": 0.003607120830565691, "kl": 0.003690137469675392, "learning_rate": 9.54210282538212e-07, "loss": 0.0002, "num_tokens": 135968828.0, "reward": 0.3035200238227844, "reward_std": 0.0, "rewards/reward_func/mean": 0.3035200238227844, "rewards/reward_func/std": 0.0, "step": 4944, "step_time": 23.624397356063128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 219.125, "completions/mean_terminated_length": 219.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.30274175852537155, "epoch": 0.22904122278832792, "frac_reward_zero_std": 0.0, "grad_norm": 0.09864536672830582, "kl": 0.035230671521276236, "learning_rate": 9.542010189902732e-07, "loss": -0.1351, "num_tokens": 135992430.0, "reward": 0.37219002842903137, "reward_std": 0.43586432933807373, "rewards/reward_func/mean": 0.37219002842903137, "rewards/reward_func/std": 0.43586432933807373, "step": 4945, "step_time": 24.21020607277751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 170.5625, "completions/mean_terminated_length": 170.5625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.28714311867952347, "epoch": 0.22908754052802224, "frac_reward_zero_std": 0.0, "grad_norm": 0.1121155172586441, "kl": 0.0127812679274939, "learning_rate": 9.541917554423343e-07, "loss": -0.0098, "num_tokens": 136019207.0, "reward": 0.8189885020256042, "reward_std": 0.32016006112098694, "rewards/reward_func/mean": 0.8189885020256042, "rewards/reward_func/std": 0.3201601207256317, "step": 4946, "step_time": 19.498099893331528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 185.1875, "completions/mean_terminated_length": 185.1875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.19009604305028915, "epoch": 0.22913385826771654, "frac_reward_zero_std": 0.0, "grad_norm": 0.12893837690353394, "kl": 0.008450278546661139, "learning_rate": 9.541824918943955e-07, "loss": -0.0237, "num_tokens": 136060538.0, "reward": 0.9297850131988525, "reward_std": 0.06395463645458221, "rewards/reward_func/mean": 0.9297850131988525, "rewards/reward_func/std": 0.06395463645458221, "step": 4947, "step_time": 22.743079613894224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 165.1875, "completions/mean_terminated_length": 165.1875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.19660202413797379, "epoch": 0.22918017600741084, "frac_reward_zero_std": 1.0, "grad_norm": 0.003081459319218993, "kl": 0.002807235694490373, "learning_rate": 9.541732283464566e-07, "loss": 0.0001, "num_tokens": 136097453.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4948, "step_time": 21.365657705813646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 203.0, "completions/max_terminated_length": 203.0, "completions/mean_length": 171.875, "completions/mean_terminated_length": 171.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.3890600800514221, "epoch": 0.22922649374710513, "frac_reward_zero_std": 1.0, "grad_norm": 0.002776462584733963, "kl": 0.003068011370487511, "learning_rate": 9.541639647985177e-07, "loss": 0.0002, "num_tokens": 136128763.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4949, "step_time": 20.01358639076352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 261.375, "completions/mean_terminated_length": 261.375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.4045885503292084, "epoch": 0.22927281148679945, "frac_reward_zero_std": 0.0, "grad_norm": 0.09543655812740326, "kl": 0.027542250230908394, "learning_rate": 9.54154701250579e-07, "loss": -0.0888, "num_tokens": 136160545.0, "reward": 0.597649872303009, "reward_std": 0.4781515896320343, "rewards/reward_func/mean": 0.597649872303009, "rewards/reward_func/std": 0.4781516194343567, "step": 4950, "step_time": 29.6665663048625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 183.0625, "completions/mean_terminated_length": 183.0625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.22729328647255898, "epoch": 0.22931912922649375, "frac_reward_zero_std": 0.0, "grad_norm": 0.12908326089382172, "kl": 0.04984285309910774, "learning_rate": 9.541454377026402e-07, "loss": -0.0778, "num_tokens": 136197266.0, "reward": 0.4950423836708069, "reward_std": 0.35888388752937317, "rewards/reward_func/mean": 0.4950423836708069, "rewards/reward_func/std": 0.35888388752937317, "step": 4951, "step_time": 23.91749533265829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 164.625, "completions/mean_terminated_length": 164.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.17654425278306007, "epoch": 0.22936544696618805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0059771547093987465, "kl": 0.03696775436401367, "learning_rate": 9.54136174154701e-07, "loss": 0.0018, "num_tokens": 136219148.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4952, "step_time": 17.012957394123077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 178.625, "completions/mean_terminated_length": 178.625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.37332598865032196, "epoch": 0.22941176470588234, "frac_reward_zero_std": 0.0, "grad_norm": 0.1353083699941635, "kl": 0.03166711889207363, "learning_rate": 9.541269106067624e-07, "loss": -0.061, "num_tokens": 136271302.0, "reward": 0.13612115383148193, "reward_std": 0.2926517128944397, "rewards/reward_func/mean": 0.13612115383148193, "rewards/reward_func/std": 0.2926517128944397, "step": 4953, "step_time": 29.59253205731511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 232.3125, "completions/mean_terminated_length": 232.3125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.27667227387428284, "epoch": 0.22945808244557667, "frac_reward_zero_std": 1.0, "grad_norm": 0.0070470962673425674, "kl": 0.008905272232368588, "learning_rate": 9.541176470588235e-07, "loss": 0.0004, "num_tokens": 136307147.0, "reward": 0.7378081679344177, "reward_std": 0.0, "rewards/reward_func/mean": 0.7378081679344177, "rewards/reward_func/std": 0.0, "step": 4954, "step_time": 26.62858249619603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 158.3125, "completions/mean_terminated_length": 158.3125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.3905433714389801, "epoch": 0.22950440018527096, "frac_reward_zero_std": 1.0, "grad_norm": 0.013934897258877754, "kl": 0.014455066993832588, "learning_rate": 9.541083835108847e-07, "loss": 0.0007, "num_tokens": 136329488.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4955, "step_time": 20.871957015246153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 197.1875, "completions/mean_terminated_length": 197.1875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.40288519859313965, "epoch": 0.22955071792496526, "frac_reward_zero_std": 1.0, "grad_norm": 0.004134077113121748, "kl": 0.003971407248172909, "learning_rate": 9.540991199629458e-07, "loss": 0.0002, "num_tokens": 136355107.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4956, "step_time": 21.19922338426113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 198.0, "completions/mean_terminated_length": 198.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.1957165263593197, "epoch": 0.22959703566465955, "frac_reward_zero_std": 1.0, "grad_norm": 0.003859918564558029, "kl": 0.003108838980551809, "learning_rate": 9.54089856415007e-07, "loss": 0.0002, "num_tokens": 136382419.0, "reward": 0.6563555598258972, "reward_std": 0.0, "rewards/reward_func/mean": 0.6563555598258972, "rewards/reward_func/std": 0.0, "step": 4957, "step_time": 20.1811338737607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 140.375, "completions/mean_terminated_length": 140.375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.1737431362271309, "epoch": 0.22964335340435388, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034294351935386658, "kl": 0.0024284476530738175, "learning_rate": 9.54080592867068e-07, "loss": 0.0001, "num_tokens": 136403161.0, "reward": 0.2865048050880432, "reward_std": 0.0, "rewards/reward_func/mean": 0.2865048050880432, "rewards/reward_func/std": 0.0, "step": 4958, "step_time": 16.0653938613832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 137.375, "completions/mean_terminated_length": 137.375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.2510499581694603, "epoch": 0.22968967114404817, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020902531687170267, "kl": 0.0016211385518545285, "learning_rate": 9.540713293191292e-07, "loss": 0.0001, "num_tokens": 136423807.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4959, "step_time": 14.679680604487658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 159.875, "completions/mean_terminated_length": 159.875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.3182985484600067, "epoch": 0.22973598888374247, "frac_reward_zero_std": 1.0, "grad_norm": 0.015090767294168472, "kl": 0.006587028503417969, "learning_rate": 9.540620657711903e-07, "loss": 0.0003, "num_tokens": 136449533.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4960, "step_time": 17.82992237433791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 128.1875, "completions/mean_terminated_length": 128.1875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3285192921757698, "epoch": 0.22978230662343677, "frac_reward_zero_std": 1.0, "grad_norm": 0.006869843229651451, "kl": 0.005431405734270811, "learning_rate": 9.540528022232514e-07, "loss": 0.0003, "num_tokens": 136485856.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4961, "step_time": 20.039498902857304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 126.6875, "completions/mean_terminated_length": 126.6875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.326532818377018, "epoch": 0.2298286243631311, "frac_reward_zero_std": 1.0, "grad_norm": 0.002713272115215659, "kl": 0.0021965482737869024, "learning_rate": 9.540435386753125e-07, "loss": 0.0001, "num_tokens": 136510251.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4962, "step_time": 15.315373342484236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 146.125, "completions/mean_terminated_length": 146.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.33709999918937683, "epoch": 0.22987494210282539, "frac_reward_zero_std": 1.0, "grad_norm": 0.003737780964002013, "kl": 0.0025583981187082827, "learning_rate": 9.540342751273739e-07, "loss": 0.0001, "num_tokens": 136533021.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4963, "step_time": 16.635175190865993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 245.4375, "completions/mean_terminated_length": 245.4375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.19287850707769394, "epoch": 0.22992125984251968, "frac_reward_zero_std": 0.0, "grad_norm": 0.09211494028568268, "kl": 0.009767316281795502, "learning_rate": 9.54025011579435e-07, "loss": -0.0335, "num_tokens": 136557172.0, "reward": 0.302733838558197, "reward_std": 0.08072902262210846, "rewards/reward_func/mean": 0.302733838558197, "rewards/reward_func/std": 0.08072902262210846, "step": 4964, "step_time": 25.37386042997241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 133.5, "completions/mean_terminated_length": 133.5, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3527248054742813, "epoch": 0.22996757758221398, "frac_reward_zero_std": 1.0, "grad_norm": 0.003740234998986125, "kl": 0.0025834679254330695, "learning_rate": 9.54015748031496e-07, "loss": 0.0001, "num_tokens": 136577900.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4965, "step_time": 14.393873788416386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 178.3125, "completions/mean_terminated_length": 178.3125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.4117540717124939, "epoch": 0.2300138953219083, "frac_reward_zero_std": 1.0, "grad_norm": 0.008569171652197838, "kl": 0.008226197911426425, "learning_rate": 9.54006484483557e-07, "loss": 0.0004, "num_tokens": 136602977.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4966, "step_time": 18.987396009266376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 141.9375, "completions/mean_terminated_length": 141.9375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.32656220346689224, "epoch": 0.2300602130616026, "frac_reward_zero_std": 1.0, "grad_norm": 0.007840272039175034, "kl": 0.0038428864208981395, "learning_rate": 9.539972209356184e-07, "loss": 0.0002, "num_tokens": 136626272.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4967, "step_time": 15.917999155819416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 147.25, "completions/mean_terminated_length": 147.25, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.29315244406461716, "epoch": 0.2301065308012969, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031389619689434767, "kl": 0.002125636616256088, "learning_rate": 9.539879573876795e-07, "loss": 0.0001, "num_tokens": 136649220.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4968, "step_time": 17.14676835387945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 159.625, "completions/mean_terminated_length": 159.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.21956472471356392, "epoch": 0.2301528485409912, "frac_reward_zero_std": 0.0, "grad_norm": 0.33373013138771057, "kl": 0.030344389146193862, "learning_rate": 9.539786938397406e-07, "loss": -0.0922, "num_tokens": 136682702.0, "reward": 0.5341726541519165, "reward_std": 0.24100929498672485, "rewards/reward_func/mean": 0.5341726541519165, "rewards/reward_func/std": 0.24100928008556366, "step": 4969, "step_time": 20.75757497921586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 232.8125, "completions/mean_terminated_length": 232.8125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "entropy": 0.47347675263881683, "epoch": 0.2301991662806855, "frac_reward_zero_std": 0.0, "grad_norm": 0.10812296718358994, "kl": 0.005712226149626076, "learning_rate": 9.539694302918017e-07, "loss": 0.0767, "num_tokens": 136710075.0, "reward": 0.9375, "reward_std": 0.25, "rewards/reward_func/mean": 0.9375, "rewards/reward_func/std": 0.25, "step": 4970, "step_time": 25.8978082947433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 133.0, "completions/mean_terminated_length": 133.0, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.13757196068763733, "epoch": 0.2302454840203798, "frac_reward_zero_std": 0.0, "grad_norm": 0.2606033682823181, "kl": 0.05828773230314255, "learning_rate": 9.539601667438629e-07, "loss": 0.0083, "num_tokens": 136730731.0, "reward": 0.9269284009933472, "reward_std": 0.023441297933459282, "rewards/reward_func/mean": 0.9269284009933472, "rewards/reward_func/std": 0.023441297933459282, "step": 4971, "step_time": 14.104417875409126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 184.9375, "completions/mean_terminated_length": 184.9375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.4354928955435753, "epoch": 0.2302918017600741, "frac_reward_zero_std": 1.0, "grad_norm": 0.007435917388647795, "kl": 0.0059344901237636805, "learning_rate": 9.53950903195924e-07, "loss": 0.0003, "num_tokens": 136755914.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4972, "step_time": 20.306267257779837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 173.875, "completions/mean_terminated_length": 173.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.35089272260665894, "epoch": 0.2303381194997684, "frac_reward_zero_std": 1.0, "grad_norm": 0.01030554249882698, "kl": 0.011292512994259596, "learning_rate": 9.539416396479851e-07, "loss": 0.0006, "num_tokens": 136778376.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4973, "step_time": 18.960453048348427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 173.9375, "completions/mean_terminated_length": 173.9375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.2830249294638634, "epoch": 0.23038443723946272, "frac_reward_zero_std": 0.0, "grad_norm": 0.13870753347873688, "kl": 0.012759183533489704, "learning_rate": 9.539323761000462e-07, "loss": 0.0427, "num_tokens": 136799639.0, "reward": 0.8004477024078369, "reward_std": 0.3188842236995697, "rewards/reward_func/mean": 0.8004477024078369, "rewards/reward_func/std": 0.3188842535018921, "step": 4974, "step_time": 19.32561694830656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 123.5625, "completions/mean_terminated_length": 123.5625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.3059925436973572, "epoch": 0.23043075497915702, "frac_reward_zero_std": 1.0, "grad_norm": 0.004238604102283716, "kl": 0.00254950430826284, "learning_rate": 9.539231125521074e-07, "loss": 0.0001, "num_tokens": 136821072.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4975, "step_time": 15.8724498860538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 204.9375, "completions/mean_terminated_length": 204.9375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3470360338687897, "epoch": 0.23047707271885132, "frac_reward_zero_std": 0.0, "grad_norm": 0.11621661484241486, "kl": 0.005828688619658351, "learning_rate": 9.539138490041687e-07, "loss": -0.0766, "num_tokens": 136846847.0, "reward": 0.5920567512512207, "reward_std": 0.4122554361820221, "rewards/reward_func/mean": 0.5920567512512207, "rewards/reward_func/std": 0.4122554361820221, "step": 4976, "step_time": 23.909298427402973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 269.0625, "completions/mean_terminated_length": 269.0625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "entropy": 0.310733363032341, "epoch": 0.2305233904585456, "frac_reward_zero_std": 0.0, "grad_norm": 0.14859719574451447, "kl": 0.01813937397673726, "learning_rate": 9.539045854562296e-07, "loss": -0.0418, "num_tokens": 136885728.0, "reward": 0.6814736723899841, "reward_std": 0.271925151348114, "rewards/reward_func/mean": 0.6814736723899841, "rewards/reward_func/std": 0.2719251811504364, "step": 4977, "step_time": 30.854093376547098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 128.0, "completions/mean_terminated_length": 128.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.3014540448784828, "epoch": 0.23056970819823994, "frac_reward_zero_std": 1.0, "grad_norm": 0.004114687442779541, "kl": 0.0025400363956578076, "learning_rate": 9.538953219082907e-07, "loss": 0.0001, "num_tokens": 136906944.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4978, "step_time": 14.466225132346153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 157.0, "completions/mean_terminated_length": 157.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.31905554980039597, "epoch": 0.23061602593793423, "frac_reward_zero_std": 1.0, "grad_norm": 0.009289965033531189, "kl": 0.005353609682060778, "learning_rate": 9.538860583603519e-07, "loss": 0.0003, "num_tokens": 136943536.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4979, "step_time": 20.306705847382545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 191.125, "completions/mean_terminated_length": 191.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.40436309576034546, "epoch": 0.23066234367762853, "frac_reward_zero_std": 1.0, "grad_norm": 0.009574116207659245, "kl": 0.008132883347570896, "learning_rate": 9.538767948124132e-07, "loss": 0.0004, "num_tokens": 136969442.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4980, "step_time": 20.005169097334146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 131.25, "completions/mean_terminated_length": 131.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "entropy": 0.27942420542240143, "epoch": 0.23070866141732282, "frac_reward_zero_std": 1.0, "grad_norm": 0.002362277591601014, "kl": 0.0019525960087776184, "learning_rate": 9.538675312644743e-07, "loss": 0.0001, "num_tokens": 136990118.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4981, "step_time": 14.53041435033083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 134.9375, "completions/mean_terminated_length": 134.9375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.2817884013056755, "epoch": 0.23075497915701715, "frac_reward_zero_std": 1.0, "grad_norm": 0.003836382180452347, "kl": 0.0029382259235717356, "learning_rate": 9.538582677165355e-07, "loss": 0.0001, "num_tokens": 137011285.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4982, "step_time": 14.541017945855856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 155.0625, "completions/mean_terminated_length": 155.0625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.1965767741203308, "epoch": 0.23080129689671144, "frac_reward_zero_std": 1.0, "grad_norm": 0.005989167373627424, "kl": 0.0038433902082033455, "learning_rate": 9.538490041685966e-07, "loss": 0.0002, "num_tokens": 137034038.0, "reward": 0.9259610772132874, "reward_std": 0.0, "rewards/reward_func/mean": 0.9259610772132874, "rewards/reward_func/std": 0.0, "step": 4983, "step_time": 17.173334512859583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 151.25, "completions/mean_terminated_length": 151.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.13582609966397285, "epoch": 0.23084761463640574, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019398063886910677, "kl": 0.0012038782879244536, "learning_rate": 9.538397406206577e-07, "loss": 0.0001, "num_tokens": 137078842.0, "reward": 0.8187307715415955, "reward_std": 0.0, "rewards/reward_func/mean": 0.8187307715415955, "rewards/reward_func/std": 0.0, "step": 4984, "step_time": 21.77512374892831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 118.4375, "completions/mean_terminated_length": 118.4375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.27934522181749344, "epoch": 0.23089393237610004, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035192626528441906, "kl": 0.0022731263597961515, "learning_rate": 9.538304770727188e-07, "loss": 0.0001, "num_tokens": 137099729.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4985, "step_time": 13.443418379873037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 307.5, "completions/mean_terminated_length": 307.5, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.18417320027947426, "epoch": 0.23094025011579436, "frac_reward_zero_std": 0.0, "grad_norm": 0.11577610671520233, "kl": 0.009522537817247212, "learning_rate": 9.5382121352478e-07, "loss": 0.0377, "num_tokens": 137129561.0, "reward": 0.9662111401557922, "reward_std": 0.027031106874346733, "rewards/reward_func/mean": 0.9662111401557922, "rewards/reward_func/std": 0.02703109383583069, "step": 4986, "step_time": 29.63142754882574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 153.125, "completions/mean_terminated_length": 153.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.1748918518424034, "epoch": 0.23098656785548866, "frac_reward_zero_std": 0.0, "grad_norm": 0.09933748096227646, "kl": 0.002387493441347033, "learning_rate": 9.53811949976841e-07, "loss": -0.0282, "num_tokens": 137158747.0, "reward": 0.9167327880859375, "reward_std": 0.03250420093536377, "rewards/reward_func/mean": 0.9167327880859375, "rewards/reward_func/std": 0.03250420466065407, "step": 4987, "step_time": 18.521799258887768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 231.875, "completions/mean_terminated_length": 231.875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.32956328243017197, "epoch": 0.23103288559518295, "frac_reward_zero_std": 0.0, "grad_norm": 0.10205793380737305, "kl": 0.014923338778316975, "learning_rate": 9.538026864289022e-07, "loss": -0.0224, "num_tokens": 137197113.0, "reward": 0.8125, "reward_std": 0.40311288833618164, "rewards/reward_func/mean": 0.8125, "rewards/reward_func/std": 0.40311288833618164, "step": 4988, "step_time": 25.59490615129471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.4404328987002373, "epoch": 0.23107920333487725, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031527713872492313, "kl": 0.0027730937872547656, "learning_rate": 9.537934228809633e-07, "loss": 0.0001, "num_tokens": 137241481.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4989, "step_time": 23.048565436154604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 177.0625, "completions/mean_terminated_length": 177.0625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.20850744098424911, "epoch": 0.23112552107457157, "frac_reward_zero_std": 1.0, "grad_norm": 0.0055336784571409225, "kl": 0.006053523859009147, "learning_rate": 9.537841593330245e-07, "loss": 0.0003, "num_tokens": 137273658.0, "reward": 0.9181891679763794, "reward_std": 0.0, "rewards/reward_func/mean": 0.9181891679763794, "rewards/reward_func/std": 0.0, "step": 4990, "step_time": 21.69393128529191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 132.5625, "completions/mean_terminated_length": 132.5625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.23681363463401794, "epoch": 0.23117183881426587, "frac_reward_zero_std": 1.0, "grad_norm": 0.003796979319304228, "kl": 0.002893943339586258, "learning_rate": 9.537748957850856e-07, "loss": 0.0001, "num_tokens": 137293923.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4991, "step_time": 14.124348815530539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 182.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3799362927675247, "epoch": 0.23121815655396016, "frac_reward_zero_std": 1.0, "grad_norm": 0.01965484395623207, "kl": 0.019588962895795703, "learning_rate": 9.537656322371467e-07, "loss": 0.001, "num_tokens": 137320299.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 4992, "step_time": 24.335781812667847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 193.0, "completions/max_terminated_length": 193.0, "completions/mean_length": 161.4375, "completions/mean_terminated_length": 161.4375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.44669947773218155, "epoch": 0.23126447429365446, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035574499052017927, "kl": 0.002802410745061934, "learning_rate": 9.53756368689208e-07, "loss": 0.0001, "num_tokens": 137350626.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4993, "step_time": 19.18287806212902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 156.9375, "completions/mean_terminated_length": 156.9375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.309749573469162, "epoch": 0.23131079203334878, "frac_reward_zero_std": 1.0, "grad_norm": 0.002668954897671938, "kl": 0.002336340432520956, "learning_rate": 9.537471051412692e-07, "loss": 0.0001, "num_tokens": 137374081.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4994, "step_time": 16.534566815942526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 170.0625, "completions/mean_terminated_length": 170.0625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.2118827849626541, "epoch": 0.23135710977304308, "frac_reward_zero_std": 0.0, "grad_norm": 0.11174722015857697, "kl": 0.005126630654558539, "learning_rate": 9.5373784159333e-07, "loss": 0.0033, "num_tokens": 137396066.0, "reward": 0.5191009044647217, "reward_std": 0.013752984814345837, "rewards/reward_func/mean": 0.5191009044647217, "rewards/reward_func/std": 0.013752982951700687, "step": 4995, "step_time": 18.149486247450113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 130.0, "completions/mean_terminated_length": 130.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.2366933375597, "epoch": 0.23140342751273738, "frac_reward_zero_std": 1.0, "grad_norm": 0.006469685584306717, "kl": 0.0036037511890754104, "learning_rate": 9.537285780453912e-07, "loss": 0.0002, "num_tokens": 137417058.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4996, "step_time": 14.295692507177591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 263.125, "completions/mean_terminated_length": 263.125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.3441452980041504, "epoch": 0.23144974525243167, "frac_reward_zero_std": 0.0, "grad_norm": 0.12067591398954391, "kl": 0.020268420223146677, "learning_rate": 9.537193144974525e-07, "loss": -0.0449, "num_tokens": 137448916.0, "reward": 0.014472972601652145, "reward_std": 0.05553954839706421, "rewards/reward_func/mean": 0.014472972601652145, "rewards/reward_func/std": 0.05553954839706421, "step": 4997, "step_time": 31.663094013929367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 163.25, "completions/mean_terminated_length": 163.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.4020567238330841, "epoch": 0.231496062992126, "frac_reward_zero_std": 1.0, "grad_norm": 0.009164032526314259, "kl": 0.004412938025780022, "learning_rate": 9.537100509495137e-07, "loss": 0.0002, "num_tokens": 137474872.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4998, "step_time": 18.051149625331163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 136.0, "completions/max_terminated_length": 136.0, "completions/mean_length": 120.8125, "completions/mean_terminated_length": 120.8125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.27976813167333603, "epoch": 0.2315423807318203, "frac_reward_zero_std": 1.0, "grad_norm": 0.006004109513014555, "kl": 0.0033063965383917093, "learning_rate": 9.537007874015748e-07, "loss": 0.0002, "num_tokens": 137495285.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "step": 4999, "step_time": 13.659004848450422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 161.25, "completions/mean_terminated_length": 161.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.36634664237499237, "epoch": 0.2315886984715146, "frac_reward_zero_std": 1.0, "grad_norm": 0.011201155371963978, "kl": 0.011701725656166673, "learning_rate": 9.53691523853636e-07, "loss": 0.0006, "num_tokens": 137521881.0, "reward": 0.0, "reward_std": 0.0, "rewards/reward_func/mean": 0.0, "rewards/reward_func/std": 0.0, "step": 5000, "step_time": 18.465140528976917 } ], "logging_steps": 1, "max_steps": 107950, "num_input_tokens_seen": 137521881, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }