{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0015, "eval_steps": 500, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04213663242626353, "epoch": 2e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.36506643891334534, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0001, "num_tokens": 34720.0, "reward": -0.4953559935092926, "reward_std": 0.10753969848155975, "rewards/rollout_reward_func/mean": -0.4953559935092926, "rewards/rollout_reward_func/std": 0.11737043410539627, "sampling/importance_sampling_ratio/max": 1.01706862449646, "sampling/importance_sampling_ratio/mean": 0.9976009130477905, "sampling/importance_sampling_ratio/min": 0.8514507412910461, "sampling/sampling_logp_difference/max": 0.160813570022583, "sampling/sampling_logp_difference/mean": 0.0033812490291893482, "step": 1, "step_time": 10.583001638000042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.056682895723497495, "epoch": 4e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.230589509010315, "kl": 0.0, "learning_rate": 2.8571428571428573e-06, "loss": -0.0012, "num_tokens": 72329.0, "reward": -0.4865216612815857, "reward_std": 0.08774720132350922, "rewards/rollout_reward_func/mean": -0.4865216612815857, "rewards/rollout_reward_func/std": 0.1045704185962677, "sampling/importance_sampling_ratio/max": 1.5042492151260376, "sampling/importance_sampling_ratio/mean": 1.0045233964920044, "sampling/importance_sampling_ratio/min": 0.7999074459075928, "sampling/sampling_logp_difference/max": 0.4082939624786377, "sampling/sampling_logp_difference/mean": 0.010556299239397049, "step": 2, "step_time": 9.847310764999861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.04655165857911925, "epoch": 6e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.6786770224571228, "kl": 0.0003507627650863876, "learning_rate": 5.7142857142857145e-06, "loss": 0.0005, "num_tokens": 107126.0, "reward": -0.4738577902317047, "reward_std": 0.14315202832221985, "rewards/rollout_reward_func/mean": -0.4738577902317047, "rewards/rollout_reward_func/std": 0.14606907963752747, "sampling/importance_sampling_ratio/max": 1.2076377868652344, "sampling/importance_sampling_ratio/mean": 1.0011494159698486, "sampling/importance_sampling_ratio/min": 0.8115481734275818, "sampling/sampling_logp_difference/max": 0.20881152153015137, "sampling/sampling_logp_difference/mean": 0.005916388239711523, "step": 3, "step_time": 9.515110119000042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06136378643714124, "epoch": 8e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.5827341079711914, "kl": 0.0021565575205606535, "learning_rate": 8.571428571428573e-06, "loss": 0.0007, "num_tokens": 147637.0, "reward": -0.42704540491104126, "reward_std": 0.1624506115913391, "rewards/rollout_reward_func/mean": -0.42704540491104126, "rewards/rollout_reward_func/std": 0.17692908644676208, "sampling/importance_sampling_ratio/max": 1.2322131395339966, "sampling/importance_sampling_ratio/mean": 1.0034774541854858, "sampling/importance_sampling_ratio/min": 0.7550826072692871, "sampling/sampling_logp_difference/max": 0.2809281349182129, "sampling/sampling_logp_difference/mean": 0.013284817337989807, "step": 4, "step_time": 10.327476732999912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06213088113145204, "epoch": 0.0001, "frac_reward_zero_std": 0.0, "grad_norm": 0.1783960461616516, "kl": 0.010104762356288433, "learning_rate": 1.1428571428571429e-05, "loss": 0.0004, "num_tokens": 183808.0, "reward": -0.5187968015670776, "reward_std": 0.10742770880460739, "rewards/rollout_reward_func/mean": -0.5187968015670776, "rewards/rollout_reward_func/std": 0.114708311855793, "sampling/importance_sampling_ratio/max": 1.5868523120880127, "sampling/importance_sampling_ratio/mean": 1.0063602924346924, "sampling/importance_sampling_ratio/min": 0.8623139262199402, "sampling/sampling_logp_difference/max": 0.46175241470336914, "sampling/sampling_logp_difference/mean": 0.009860638529062271, "step": 5, "step_time": 9.938454288000116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.083112089203496, "epoch": 0.00012, "frac_reward_zero_std": 0.0, "grad_norm": 2.643200635910034, "kl": 0.04417062703578267, "learning_rate": 1.4285714285714285e-05, "loss": 0.0007, "num_tokens": 221524.0, "reward": -0.526121973991394, "reward_std": 0.16247841715812683, "rewards/rollout_reward_func/mean": -0.526121973991394, "rewards/rollout_reward_func/std": 0.16897279024124146, "sampling/importance_sampling_ratio/max": 1.3004112243652344, "sampling/importance_sampling_ratio/mean": 0.9950670003890991, "sampling/importance_sampling_ratio/min": 0.6120707392692566, "sampling/sampling_logp_difference/max": 0.4909074306488037, "sampling/sampling_logp_difference/mean": 0.01979595422744751, "step": 6, "step_time": 11.122546247000173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.10465507118351525, "epoch": 0.00014, "frac_reward_zero_std": 0.0, "grad_norm": 0.6755809783935547, "kl": 0.06485261598226089, "learning_rate": 1.7142857142857145e-05, "loss": 0.0006, "num_tokens": 262058.0, "reward": -0.45354732871055603, "reward_std": 0.11865675449371338, "rewards/rollout_reward_func/mean": -0.45354732871055603, "rewards/rollout_reward_func/std": 0.1317674219608307, "sampling/importance_sampling_ratio/max": 1.3646963834762573, "sampling/importance_sampling_ratio/mean": 0.9980586767196655, "sampling/importance_sampling_ratio/min": 0.7788010835647583, "sampling/sampling_logp_difference/max": 0.310931921005249, "sampling/sampling_logp_difference/mean": 0.013520372100174427, "step": 7, "step_time": 10.180680208999888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.06767257605679333, "epoch": 0.00016, "frac_reward_zero_std": 0.0, "grad_norm": 0.7358646392822266, "kl": 0.10022372181984451, "learning_rate": 2e-05, "loss": -0.0003, "num_tokens": 300372.0, "reward": -0.438113808631897, "reward_std": 0.1628333181142807, "rewards/rollout_reward_func/mean": -0.438113808631897, "rewards/rollout_reward_func/std": 0.18601371347904205, "sampling/importance_sampling_ratio/max": 1.5303767919540405, "sampling/importance_sampling_ratio/mean": 1.010486364364624, "sampling/importance_sampling_ratio/min": 0.5898699164390564, "sampling/sampling_logp_difference/max": 0.52785325050354, "sampling/sampling_logp_difference/mean": 0.026673050597310066, "step": 8, "step_time": 10.86024607800016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05736969155987026, "epoch": 0.00018, "frac_reward_zero_std": 0.0, "grad_norm": 0.4610992968082428, "kl": 0.14014234533533454, "learning_rate": 2.2857142857142858e-05, "loss": 0.0011, "num_tokens": 340975.0, "reward": -0.5177702307701111, "reward_std": 0.10829603672027588, "rewards/rollout_reward_func/mean": -0.5177702307701111, "rewards/rollout_reward_func/std": 0.11044981330633163, "sampling/importance_sampling_ratio/max": 1.7585923671722412, "sampling/importance_sampling_ratio/mean": 1.0206780433654785, "sampling/importance_sampling_ratio/min": 0.9417206048965454, "sampling/sampling_logp_difference/max": 0.5645136833190918, "sampling/sampling_logp_difference/mean": 0.019472315907478333, "step": 9, "step_time": 11.569765732000178 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05415748237646767, "epoch": 0.0002, "frac_reward_zero_std": 0.0, "grad_norm": 0.42527446150779724, "kl": 0.15265877915955645, "learning_rate": 2.5714285714285714e-05, "loss": 0.0002, "num_tokens": 380815.0, "reward": -0.5060328245162964, "reward_std": 0.12003964185714722, "rewards/rollout_reward_func/mean": -0.5060328245162964, "rewards/rollout_reward_func/std": 0.13507631421089172, "sampling/importance_sampling_ratio/max": 1.2756245136260986, "sampling/importance_sampling_ratio/mean": 1.0032804012298584, "sampling/importance_sampling_ratio/min": 0.8109822869300842, "sampling/sampling_logp_difference/max": 0.24343585968017578, "sampling/sampling_logp_difference/mean": 0.010368866845965385, "step": 10, "step_time": 11.035615327000073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03652133769719512, "epoch": 0.00022, "frac_reward_zero_std": 0.0, "grad_norm": 0.7140417098999023, "kl": 0.5853048088065407, "learning_rate": 2.857142857142857e-05, "loss": 0.0007, "num_tokens": 419201.0, "reward": -0.49228060245513916, "reward_std": 0.1299666315317154, "rewards/rollout_reward_func/mean": -0.49228060245513916, "rewards/rollout_reward_func/std": 0.1354321837425232, "sampling/importance_sampling_ratio/max": 1.192307949066162, "sampling/importance_sampling_ratio/mean": 0.9962027668952942, "sampling/importance_sampling_ratio/min": 0.18285171687602997, "sampling/sampling_logp_difference/max": 1.6990797519683838, "sampling/sampling_logp_difference/mean": 0.022965161129832268, "step": 11, "step_time": 12.10885376400006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.010421836544992402, "epoch": 0.00024, "frac_reward_zero_std": 0.0, "grad_norm": 0.0725681409239769, "kl": 0.09164208328971435, "learning_rate": 3.142857142857143e-05, "loss": 0.0002, "num_tokens": 455342.0, "reward": -0.42868927121162415, "reward_std": 0.1808679699897766, "rewards/rollout_reward_func/mean": -0.42868927121162415, "rewards/rollout_reward_func/std": 0.1826806664466858, "sampling/importance_sampling_ratio/max": 1.0040708780288696, "sampling/importance_sampling_ratio/mean": 0.9997765421867371, "sampling/importance_sampling_ratio/min": 0.9736456274986267, "sampling/sampling_logp_difference/max": 0.026707857847213745, "sampling/sampling_logp_difference/mean": 0.0005151446675881743, "step": 12, "step_time": 11.46827613999983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.005160837485163938, "epoch": 0.00026, "frac_reward_zero_std": 0.0, "grad_norm": 0.003414501203224063, "kl": 0.054030168646306775, "learning_rate": 3.428571428571429e-05, "loss": 0.0001, "num_tokens": 487838.0, "reward": -0.5129345655441284, "reward_std": 0.09682037681341171, "rewards/rollout_reward_func/mean": -0.5129345655441284, "rewards/rollout_reward_func/std": 0.10943721234798431, "sampling/importance_sampling_ratio/max": 1.0242013931274414, "sampling/importance_sampling_ratio/mean": 1.0002690553665161, "sampling/importance_sampling_ratio/min": 0.9995214939117432, "sampling/sampling_logp_difference/max": 0.023913156241178513, "sampling/sampling_logp_difference/mean": 0.00029178019030950963, "step": 13, "step_time": 13.078657843000315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0049433065178163815, "epoch": 0.00028, "frac_reward_zero_std": 0.0, "grad_norm": 0.006202552933245897, "kl": 0.11296933640962958, "learning_rate": 3.7142857142857143e-05, "loss": 0.0002, "num_tokens": 523251.0, "reward": -0.4549490213394165, "reward_std": 0.1518395096063614, "rewards/rollout_reward_func/mean": -0.4549490213394165, "rewards/rollout_reward_func/std": 0.15026704967021942, "sampling/importance_sampling_ratio/max": 1.0146245956420898, "sampling/importance_sampling_ratio/mean": 1.0001541376113892, "sampling/importance_sampling_ratio/min": 0.9956898093223572, "sampling/sampling_logp_difference/max": 0.014518730342388153, "sampling/sampling_logp_difference/mean": 0.0002873566118068993, "step": 14, "step_time": 11.818882993999978 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.05449013704492245, "epoch": 0.0003, "frac_reward_zero_std": 0.0, "grad_norm": 1.2176028490066528, "kl": 0.3055893306495818, "learning_rate": 4e-05, "loss": 0.0004, "num_tokens": 554972.0, "reward": -0.4694768786430359, "reward_std": 0.1240941733121872, "rewards/rollout_reward_func/mean": -0.4694768786430359, "rewards/rollout_reward_func/std": 0.13716520369052887, "sampling/importance_sampling_ratio/max": 2.6555840969085693, "sampling/importance_sampling_ratio/mean": 0.9897008538246155, "sampling/importance_sampling_ratio/min": 0.24389615654945374, "sampling/sampling_logp_difference/max": 1.4110127687454224, "sampling/sampling_logp_difference/mean": 0.05285795032978058, "step": 15, "step_time": 12.903524508000032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.029744337291049305, "epoch": 0.00032, "frac_reward_zero_std": 0.0, "grad_norm": 0.5681190490722656, "kl": 1.5352033322367333, "learning_rate": 4.2857142857142856e-05, "loss": 0.0024, "num_tokens": 591125.0, "reward": -0.49898892641067505, "reward_std": 0.1363566815853119, "rewards/rollout_reward_func/mean": -0.49898892641067505, "rewards/rollout_reward_func/std": 0.17710748314857483, "sampling/importance_sampling_ratio/max": 1.0899990797042847, "sampling/importance_sampling_ratio/mean": 0.9730724692344666, "sampling/importance_sampling_ratio/min": 0.11985374242067337, "sampling/sampling_logp_difference/max": 2.121483087539673, "sampling/sampling_logp_difference/mean": 0.06464457511901855, "step": 16, "step_time": 10.869735754999738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.005307975381583674, "epoch": 0.00034, "frac_reward_zero_std": 0.0, "grad_norm": 0.022409122437238693, "kl": 2.8874126337468624, "learning_rate": 4.5714285714285716e-05, "loss": 0.0053, "num_tokens": 624998.0, "reward": -0.42809420824050903, "reward_std": 0.10570663213729858, "rewards/rollout_reward_func/mean": -0.42809420824050903, "rewards/rollout_reward_func/std": 0.11102946847677231, "sampling/importance_sampling_ratio/max": 1.113768219947815, "sampling/importance_sampling_ratio/mean": 1.001915454864502, "sampling/importance_sampling_ratio/min": 0.9997419118881226, "sampling/sampling_logp_difference/max": 0.10774913430213928, "sampling/sampling_logp_difference/mean": 0.0018536553252488375, "step": 17, "step_time": 10.573343859999682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0018984277339768596, "epoch": 0.00036, "frac_reward_zero_std": 0.0, "grad_norm": 0.001003155717626214, "kl": 2.4075989934303834, "learning_rate": 4.8571428571428576e-05, "loss": 0.0045, "num_tokens": 660422.0, "reward": -0.4815124273300171, "reward_std": 0.17238172888755798, "rewards/rollout_reward_func/mean": -0.4815124273300171, "rewards/rollout_reward_func/std": 0.17162199318408966, "sampling/importance_sampling_ratio/max": 1.0038193464279175, "sampling/importance_sampling_ratio/mean": 1.0001835823059082, "sampling/importance_sampling_ratio/min": 0.9999402761459351, "sampling/sampling_logp_difference/max": 0.00381203880533576, "sampling/sampling_logp_difference/mean": 0.00019016250735148787, "step": 18, "step_time": 10.092614914000478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.001787107794370968, "epoch": 0.00038, "frac_reward_zero_std": 0.0, "grad_norm": 0.0019859045278280973, "kl": 2.131671732679621, "learning_rate": 5.142857142857143e-05, "loss": 0.004, "num_tokens": 695119.0, "reward": -0.45420968532562256, "reward_std": 0.1451229602098465, "rewards/rollout_reward_func/mean": -0.45420968532562256, "rewards/rollout_reward_func/std": 0.16084149479866028, "sampling/importance_sampling_ratio/max": 1.012938380241394, "sampling/importance_sampling_ratio/mean": 1.0002269744873047, "sampling/importance_sampling_ratio/min": 0.9999643564224243, "sampling/sampling_logp_difference/max": 0.012855470180511475, "sampling/sampling_logp_difference/mean": 0.0002334651944693178, "step": 19, "step_time": 11.423286533999544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0023332082309934776, "epoch": 0.0004, "frac_reward_zero_std": 0.0, "grad_norm": 0.0014479915844276547, "kl": 1.2728006265497243, "learning_rate": 5.428571428571428e-05, "loss": 0.0024, "num_tokens": 734861.0, "reward": -0.4497176706790924, "reward_std": 0.12124527990818024, "rewards/rollout_reward_func/mean": -0.4497176706790924, "rewards/rollout_reward_func/std": 0.12396564334630966, "sampling/importance_sampling_ratio/max": 1.0169790983200073, "sampling/importance_sampling_ratio/mean": 1.0002472400665283, "sampling/importance_sampling_ratio/min": 0.999942421913147, "sampling/sampling_logp_difference/max": 0.01683656871318817, "sampling/sampling_logp_difference/mean": 0.00025150534929707646, "step": 20, "step_time": 10.984022518999609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0017557634673721623, "epoch": 0.00042, "frac_reward_zero_std": 0.0, "grad_norm": 0.008899558335542679, "kl": 2.1504041726390426, "learning_rate": 5.714285714285714e-05, "loss": 0.004, "num_tokens": 770251.0, "reward": -0.4525429606437683, "reward_std": 0.1307295560836792, "rewards/rollout_reward_func/mean": -0.4525429606437683, "rewards/rollout_reward_func/std": 0.13270536065101624, "sampling/importance_sampling_ratio/max": 1.007300615310669, "sampling/importance_sampling_ratio/mean": 1.0000929832458496, "sampling/importance_sampling_ratio/min": 0.9999077320098877, "sampling/sampling_logp_difference/max": 0.007273993454873562, "sampling/sampling_logp_difference/mean": 9.861911530606449e-05, "step": 21, "step_time": 11.750048858000582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0014675547063234262, "epoch": 0.00044, "frac_reward_zero_std": 0.0, "grad_norm": 0.0015787131851539016, "kl": 2.142753223429594, "learning_rate": 6e-05, "loss": 0.004, "num_tokens": 804919.0, "reward": -0.4373324513435364, "reward_std": 0.14926397800445557, "rewards/rollout_reward_func/mean": -0.4373324513435364, "rewards/rollout_reward_func/std": 0.1527157872915268, "sampling/importance_sampling_ratio/max": 1.0027562379837036, "sampling/importance_sampling_ratio/mean": 1.0000301599502563, "sampling/importance_sampling_ratio/min": 0.9997082352638245, "sampling/sampling_logp_difference/max": 0.002752469852566719, "sampling/sampling_logp_difference/mean": 4.924799213767983e-05, "step": 22, "step_time": 10.57756851799968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0016250821427092887, "epoch": 0.00046, "frac_reward_zero_std": 0.0, "grad_norm": 0.007080785930156708, "kl": 2.471247175708413, "learning_rate": 6.285714285714286e-05, "loss": 0.0046, "num_tokens": 840304.0, "reward": -0.5088313221931458, "reward_std": 0.12063620984554291, "rewards/rollout_reward_func/mean": -0.5088313221931458, "rewards/rollout_reward_func/std": 0.11942265927791595, "sampling/importance_sampling_ratio/max": 1.000611424446106, "sampling/importance_sampling_ratio/mean": 0.9999845027923584, "sampling/importance_sampling_ratio/min": 0.9976289868354797, "sampling/sampling_logp_difference/max": 0.0023738397285342216, "sampling/sampling_logp_difference/mean": 5.626135680358857e-05, "step": 23, "step_time": 11.196098771000152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.002501545510313008, "epoch": 0.00048, "frac_reward_zero_std": 0.0, "grad_norm": 0.002404371974989772, "kl": 1.4448929702630267, "learning_rate": 6.571428571428571e-05, "loss": 0.0027, "num_tokens": 878691.0, "reward": -0.4859924912452698, "reward_std": 0.17074629664421082, "rewards/rollout_reward_func/mean": -0.4859924912452698, "rewards/rollout_reward_func/std": 0.18796174228191376, "sampling/importance_sampling_ratio/max": 1.008969783782959, "sampling/importance_sampling_ratio/mean": 1.0001083612442017, "sampling/importance_sampling_ratio/min": 0.9983657002449036, "sampling/sampling_logp_difference/max": 0.008929748088121414, "sampling/sampling_logp_difference/mean": 0.00018690252909436822, "step": 24, "step_time": 11.436561123000047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.00125838804160594, "epoch": 0.0005, "frac_reward_zero_std": 0.0, "grad_norm": 0.00037121796049177647, "kl": 2.728724977933325, "learning_rate": 6.857142857142858e-05, "loss": 0.0051, "num_tokens": 912630.0, "reward": -0.43685033917427063, "reward_std": 0.14631888270378113, "rewards/rollout_reward_func/mean": -0.43685033917427063, "rewards/rollout_reward_func/std": 0.14818531274795532, "sampling/importance_sampling_ratio/max": 1.0000602006912231, "sampling/importance_sampling_ratio/mean": 0.999987781047821, "sampling/importance_sampling_ratio/min": 0.9996044039726257, "sampling/sampling_logp_difference/max": 0.000395664683310315, "sampling/sampling_logp_difference/mean": 2.7497631890582852e-05, "step": 25, "step_time": 10.338325488999317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.004701976798969554, "epoch": 0.00052, "frac_reward_zero_std": 0.0, "grad_norm": 0.0168001726269722, "kl": 2.670653583481908, "learning_rate": 7.142857142857143e-05, "loss": 0.005, "num_tokens": 945129.0, "reward": -0.49788784980773926, "reward_std": 0.11309097707271576, "rewards/rollout_reward_func/mean": -0.49788784980773926, "rewards/rollout_reward_func/std": 0.11171143501996994, "sampling/importance_sampling_ratio/max": 1.0177559852600098, "sampling/importance_sampling_ratio/mean": 1.0001691579818726, "sampling/importance_sampling_ratio/min": 0.9972944855690002, "sampling/sampling_logp_difference/max": 0.017600178718566895, "sampling/sampling_logp_difference/mean": 0.0002922487910836935, "step": 26, "step_time": 11.807520030999513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.005813408846734092, "epoch": 0.00054, "frac_reward_zero_std": 0.0, "grad_norm": 0.04519229754805565, "kl": 2.455354232341051, "learning_rate": 7.428571428571429e-05, "loss": 0.0046, "num_tokens": 979857.0, "reward": -0.48440974950790405, "reward_std": 0.1276542842388153, "rewards/rollout_reward_func/mean": -0.48440974950790405, "rewards/rollout_reward_func/std": 0.13863544166088104, "sampling/importance_sampling_ratio/max": 1.0226891040802002, "sampling/importance_sampling_ratio/mean": 1.0000253915786743, "sampling/importance_sampling_ratio/min": 0.9920601844787598, "sampling/sampling_logp_difference/max": 0.02243557944893837, "sampling/sampling_logp_difference/mean": 0.00045728118857368827, "step": 27, "step_time": 11.074748723000539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.007451985773514025, "epoch": 0.00056, "frac_reward_zero_std": 0.0, "grad_norm": 0.020447248592972755, "kl": 2.786820446451202, "learning_rate": 7.714285714285715e-05, "loss": 0.0052, "num_tokens": 1010868.0, "reward": -0.4477247893810272, "reward_std": 0.12471087276935577, "rewards/rollout_reward_func/mean": -0.4477247893810272, "rewards/rollout_reward_func/std": 0.1252235472202301, "sampling/importance_sampling_ratio/max": 1.0000356435775757, "sampling/importance_sampling_ratio/mean": 0.9992380142211914, "sampling/importance_sampling_ratio/min": 0.9429383277893066, "sampling/sampling_logp_difference/max": 0.058754414319992065, "sampling/sampling_logp_difference/mean": 0.0007906818063929677, "step": 28, "step_time": 11.204566938999733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.005258000070170965, "epoch": 0.00058, "frac_reward_zero_std": 0.0, "grad_norm": 0.12415958195924759, "kl": 1.8602933554793708, "learning_rate": 8e-05, "loss": 0.0036, "num_tokens": 1047027.0, "reward": -0.47356218099594116, "reward_std": 0.17148526012897491, "rewards/rollout_reward_func/mean": -0.47356218099594116, "rewards/rollout_reward_func/std": 0.17493870854377747, "sampling/importance_sampling_ratio/max": 1.0000348091125488, "sampling/importance_sampling_ratio/mean": 0.9999445676803589, "sampling/importance_sampling_ratio/min": 0.9985746145248413, "sampling/sampling_logp_difference/max": 0.0014263943303376436, "sampling/sampling_logp_difference/mean": 6.454815593315288e-05, "step": 29, "step_time": 10.754735779999919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.005060947762103751, "epoch": 0.0006, "frac_reward_zero_std": 0.0, "grad_norm": 0.015699295327067375, "kl": 3.0101311548302476, "learning_rate": 8.285714285714287e-05, "loss": 0.0056, "num_tokens": 1078040.0, "reward": -0.44326093792915344, "reward_std": 0.1546049863100052, "rewards/rollout_reward_func/mean": -0.44326093792915344, "rewards/rollout_reward_func/std": 0.16109547019004822, "sampling/importance_sampling_ratio/max": 1.0001112222671509, "sampling/importance_sampling_ratio/mean": 0.999657928943634, "sampling/importance_sampling_ratio/min": 0.9917479157447815, "sampling/sampling_logp_difference/max": 0.00828632339835167, "sampling/sampling_logp_difference/mean": 0.00035381870111450553, "step": 30, "step_time": 11.130074380999304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.010435697484354023, "epoch": 0.00062, "frac_reward_zero_std": 0.0, "grad_norm": 0.011954938992857933, "kl": 2.1385079924175443, "learning_rate": 8.571428571428571e-05, "loss": 0.004, "num_tokens": 1114270.0, "reward": -0.5106649994850159, "reward_std": 0.15311290323734283, "rewards/rollout_reward_func/mean": -0.5106649994850159, "rewards/rollout_reward_func/std": 0.16842880845069885, "sampling/importance_sampling_ratio/max": 1.0115782022476196, "sampling/importance_sampling_ratio/mean": 0.9992225170135498, "sampling/importance_sampling_ratio/min": 0.9679325819015503, "sampling/sampling_logp_difference/max": 0.03259281814098358, "sampling/sampling_logp_difference/mean": 0.001033698208630085, "step": 31, "step_time": 10.868044704000113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.015111156193597708, "epoch": 0.00064, "frac_reward_zero_std": 0.0, "grad_norm": 0.07620638608932495, "kl": 1.894992141673356, "learning_rate": 8.857142857142857e-05, "loss": 0.0033, "num_tokens": 1148908.0, "reward": -0.42877840995788574, "reward_std": 0.12491989135742188, "rewards/rollout_reward_func/mean": -0.42877840995788574, "rewards/rollout_reward_func/std": 0.15547259151935577, "sampling/importance_sampling_ratio/max": 1.0025962591171265, "sampling/importance_sampling_ratio/mean": 0.997442364692688, "sampling/importance_sampling_ratio/min": 0.8747695088386536, "sampling/sampling_logp_difference/max": 0.133794903755188, "sampling/sampling_logp_difference/mean": 0.0027670767158269882, "step": 32, "step_time": 12.017423004999955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.03369333760929294, "epoch": 0.00066, "frac_reward_zero_std": 0.0, "grad_norm": 0.15489332377910614, "kl": 2.431340977549553, "learning_rate": 9.142857142857143e-05, "loss": 0.0043, "num_tokens": 1182817.0, "reward": -0.43676623702049255, "reward_std": 0.19219474494457245, "rewards/rollout_reward_func/mean": -0.43676623702049255, "rewards/rollout_reward_func/std": 0.18783578276634216, "sampling/importance_sampling_ratio/max": 1.029738187789917, "sampling/importance_sampling_ratio/mean": 0.9952020645141602, "sampling/importance_sampling_ratio/min": 0.8523503541946411, "sampling/sampling_logp_difference/max": 0.1597576141357422, "sampling/sampling_logp_difference/mean": 0.005681644193828106, "step": 33, "step_time": 11.107914947999916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.08210996998241171, "epoch": 0.00068, "frac_reward_zero_std": 0.0, "grad_norm": 3.0649993419647217, "kl": 2.27309197653085, "learning_rate": 9.428571428571429e-05, "loss": 0.0194, "num_tokens": 1216117.0, "reward": -0.5223650932312012, "reward_std": 0.15171676874160767, "rewards/rollout_reward_func/mean": -0.5223650932312012, "rewards/rollout_reward_func/std": 0.14959992468357086, "sampling/importance_sampling_ratio/max": 2.7629706859588623, "sampling/importance_sampling_ratio/mean": 1.0301706790924072, "sampling/importance_sampling_ratio/min": 0.6142791509628296, "sampling/sampling_logp_difference/max": 1.0163064002990723, "sampling/sampling_logp_difference/mean": 0.04744146391749382, "step": 34, "step_time": 11.677880323999716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0031382560227939393, "epoch": 0.0007, "frac_reward_zero_std": 0.0, "grad_norm": 0.008347594179213047, "kl": 2.0830320498597246, "learning_rate": 9.714285714285715e-05, "loss": 0.0039, "num_tokens": 1249382.0, "reward": -0.45180174708366394, "reward_std": 0.19802165031433105, "rewards/rollout_reward_func/mean": -0.45180174708366394, "rewards/rollout_reward_func/std": 0.19667227566242218, "sampling/importance_sampling_ratio/max": 1.0000340938568115, "sampling/importance_sampling_ratio/mean": 0.9999133348464966, "sampling/importance_sampling_ratio/min": 0.997832179069519, "sampling/sampling_logp_difference/max": 0.002170148305594921, "sampling/sampling_logp_difference/mean": 9.547994704917073e-05, "step": 35, "step_time": 10.335512492000134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.010472146794199944, "epoch": 0.00072, "frac_reward_zero_std": 0.0, "grad_norm": 0.0902613028883934, "kl": 2.3452741815708578, "learning_rate": 0.0001, "loss": 0.0043, "num_tokens": 1281161.0, "reward": -0.4157106280326843, "reward_std": 0.1772732436656952, "rewards/rollout_reward_func/mean": -0.4157106280326843, "rewards/rollout_reward_func/std": 0.1765340268611908, "sampling/importance_sampling_ratio/max": 1.0063142776489258, "sampling/importance_sampling_ratio/mean": 0.9980408549308777, "sampling/importance_sampling_ratio/min": 0.8053573369979858, "sampling/sampling_logp_difference/max": 0.21646922826766968, "sampling/sampling_logp_difference/mean": 0.0023312268313020468, "step": 36, "step_time": 11.72868796600028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0007935629546409473, "epoch": 0.00074, "frac_reward_zero_std": 0.0, "grad_norm": 5.5115637223934755e-05, "kl": 2.979110558206912, "learning_rate": 9.999942030039711e-05, "loss": 0.0056, "num_tokens": 1309941.0, "reward": -0.4328516721725464, "reward_std": 0.11836086213588715, "rewards/rollout_reward_func/mean": -0.4328516721725464, "rewards/rollout_reward_func/std": 0.12694095075130463, "sampling/importance_sampling_ratio/max": 1.0000559091567993, "sampling/importance_sampling_ratio/mean": 1.0000051259994507, "sampling/importance_sampling_ratio/min": 0.9999697208404541, "sampling/sampling_logp_difference/max": 5.5903590691741556e-05, "sampling/sampling_logp_difference/mean": 6.916895472386386e-06, "step": 37, "step_time": 10.544788530000005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0007049098967399914, "epoch": 0.00076, "frac_reward_zero_std": 0.0, "grad_norm": 1.5064314538904e-05, "kl": 3.3144653998315334, "learning_rate": 9.999768121951115e-05, "loss": 0.0062, "num_tokens": 1340982.0, "reward": -0.4486263394355774, "reward_std": 0.18141409754753113, "rewards/rollout_reward_func/mean": -0.4486263394355774, "rewards/rollout_reward_func/std": 0.18459345400333405, "sampling/importance_sampling_ratio/max": 1.0000298023223877, "sampling/importance_sampling_ratio/mean": 1.0000050067901611, "sampling/importance_sampling_ratio/min": 0.9999734163284302, "sampling/sampling_logp_difference/max": 2.9786722734570503e-05, "sampling/sampling_logp_difference/mean": 6.072340511309449e-06, "step": 38, "step_time": 10.22810908700012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.002045353794528637, "epoch": 0.00078, "frac_reward_zero_std": 0.0, "grad_norm": 0.02569839172065258, "kl": 2.3597085239986058, "learning_rate": 9.999478281110987e-05, "loss": 0.0045, "num_tokens": 1374938.0, "reward": -0.4885009527206421, "reward_std": 0.12055516242980957, "rewards/rollout_reward_func/mean": -0.4885009527206421, "rewards/rollout_reward_func/std": 0.12474851310253143, "sampling/importance_sampling_ratio/max": 1.0000395774841309, "sampling/importance_sampling_ratio/mean": 0.9997438788414001, "sampling/importance_sampling_ratio/min": 0.9747907519340515, "sampling/sampling_logp_difference/max": 0.025532402098178864, "sampling/sampling_logp_difference/mean": 0.00027372626936994493, "step": 39, "step_time": 10.025046984000483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.001170793577330187, "epoch": 0.0008, "frac_reward_zero_std": 0.0, "grad_norm": 0.0011198727879673243, "kl": 2.575058995746076, "learning_rate": 9.999072516480423e-05, "loss": 0.0048, "num_tokens": 1405986.0, "reward": -0.49351146817207336, "reward_std": 0.11518004536628723, "rewards/rollout_reward_func/mean": -0.49351146817207336, "rewards/rollout_reward_func/std": 0.12014364451169968, "sampling/importance_sampling_ratio/max": 1.0233973264694214, "sampling/importance_sampling_ratio/mean": 1.0002632141113281, "sampling/importance_sampling_ratio/min": 0.9999645948410034, "sampling/sampling_logp_difference/max": 0.023127814754843712, "sampling/sampling_logp_difference/mean": 0.0002636197314132005, "step": 40, "step_time": 11.72272543899976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.000821109762910055, "epoch": 0.00082, "frac_reward_zero_std": 0.0, "grad_norm": 0.002684971783310175, "kl": 2.9493437994582337, "learning_rate": 9.998550840604579e-05, "loss": 0.0055, "num_tokens": 1438504.0, "reward": -0.44643768668174744, "reward_std": 0.18104322254657745, "rewards/rollout_reward_func/mean": -0.44643768668174744, "rewards/rollout_reward_func/std": 0.19707725942134857, "sampling/importance_sampling_ratio/max": 1.0007014274597168, "sampling/importance_sampling_ratio/mean": 1.000012755393982, "sampling/importance_sampling_ratio/min": 0.9998999834060669, "sampling/sampling_logp_difference/max": 0.0007011198904365301, "sampling/sampling_logp_difference/mean": 1.5726367564639077e-05, "step": 41, "step_time": 10.699221135000244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006376944766088855, "epoch": 0.00084, "frac_reward_zero_std": 0.0, "grad_norm": 9.305521962232888e-05, "kl": 1.718819092882768, "learning_rate": 9.997913269612266e-05, "loss": 0.0032, "num_tokens": 1473957.0, "reward": -0.4666598439216614, "reward_std": 0.1580410599708557, "rewards/rollout_reward_func/mean": -0.4666598439216614, "rewards/rollout_reward_func/std": 0.15718111395835876, "sampling/importance_sampling_ratio/max": 1.000121831893921, "sampling/importance_sampling_ratio/mean": 1.0000066757202148, "sampling/importance_sampling_ratio/min": 0.9999860525131226, "sampling/sampling_logp_difference/max": 0.00012180398334749043, "sampling/sampling_logp_difference/mean": 8.024451744859107e-06, "step": 42, "step_time": 11.311061955000014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.000830665525427321, "epoch": 0.00086, "frac_reward_zero_std": 0.0, "grad_norm": 0.0017081554979085922, "kl": 2.11324807908386, "learning_rate": 9.997159823215467e-05, "loss": 0.004, "num_tokens": 1507226.0, "reward": -0.5063143372535706, "reward_std": 0.11844731867313385, "rewards/rollout_reward_func/mean": -0.5063143372535706, "rewards/rollout_reward_func/std": 0.11648620665073395, "sampling/importance_sampling_ratio/max": 1.0000935792922974, "sampling/importance_sampling_ratio/mean": 1.0000015497207642, "sampling/importance_sampling_ratio/min": 0.999706506729126, "sampling/sampling_logp_difference/max": 0.00029356300365179777, "sampling/sampling_logp_difference/mean": 1.2860547030868474e-05, "step": 43, "step_time": 12.482276944999057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006207615115272347, "epoch": 0.00088, "frac_reward_zero_std": 0.0, "grad_norm": 2.307548129465431e-05, "kl": 2.9673098786734045, "learning_rate": 9.996290524708723e-05, "loss": 0.0056, "num_tokens": 1539066.0, "reward": -0.4601210951805115, "reward_std": 0.150915265083313, "rewards/rollout_reward_func/mean": -0.4601210951805115, "rewards/rollout_reward_func/std": 0.1748569756746292, "sampling/importance_sampling_ratio/max": 1.0000370740890503, "sampling/importance_sampling_ratio/mean": 1.000004768371582, "sampling/importance_sampling_ratio/min": 0.9999825358390808, "sampling/sampling_logp_difference/max": 3.706202551256865e-05, "sampling/sampling_logp_difference/mean": 6.1202204051369336e-06, "step": 44, "step_time": 10.131044015999805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006315578757494222, "epoch": 0.0009, "frac_reward_zero_std": 0.0, "grad_norm": 2.3875041733845137e-05, "kl": 2.4858804441367397, "learning_rate": 9.995305400968402e-05, "loss": 0.0047, "num_tokens": 1570815.0, "reward": -0.46754807233810425, "reward_std": 0.11122138798236847, "rewards/rollout_reward_func/mean": -0.46754807233810425, "rewards/rollout_reward_func/std": 0.12286069989204407, "sampling/importance_sampling_ratio/max": 1.0000466108322144, "sampling/importance_sampling_ratio/mean": 1.0000091791152954, "sampling/importance_sampling_ratio/min": 0.9999758005142212, "sampling/sampling_logp_difference/max": 4.659905971493572e-05, "sampling/sampling_logp_difference/mean": 1.0045773706224281e-05, "step": 45, "step_time": 11.542584926000018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006228533748071641, "epoch": 0.00092, "frac_reward_zero_std": 0.0, "grad_norm": 1.2772155059792567e-05, "kl": 2.317846190145474, "learning_rate": 9.994204482451885e-05, "loss": 0.0043, "num_tokens": 1602554.0, "reward": -0.4817233085632324, "reward_std": 0.14001183211803436, "rewards/rollout_reward_func/mean": -0.4817233085632324, "rewards/rollout_reward_func/std": 0.13829727470874786, "sampling/importance_sampling_ratio/max": 1.0000641345977783, "sampling/importance_sampling_ratio/mean": 1.0000078678131104, "sampling/importance_sampling_ratio/min": 0.9999889731407166, "sampling/sampling_logp_difference/max": 6.411726644728333e-05, "sampling/sampling_logp_difference/mean": 9.003964805742726e-06, "step": 46, "step_time": 11.153836768000701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006155803639558144, "epoch": 0.00094, "frac_reward_zero_std": 0.0, "grad_norm": 1.8806122170644812e-05, "kl": 2.859031245112419, "learning_rate": 9.992987803196614e-05, "loss": 0.0054, "num_tokens": 1633642.0, "reward": -0.47458043694496155, "reward_std": 0.13620741665363312, "rewards/rollout_reward_func/mean": -0.47458043694496155, "rewards/rollout_reward_func/std": 0.1397487074136734, "sampling/importance_sampling_ratio/max": 1.0000464916229248, "sampling/importance_sampling_ratio/mean": 1.0000081062316895, "sampling/importance_sampling_ratio/min": 0.9999859929084778, "sampling/sampling_logp_difference/max": 4.647710011340678e-05, "sampling/sampling_logp_difference/mean": 9.343058081867639e-06, "step": 47, "step_time": 11.58096628200019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006193299777805805, "epoch": 0.00096, "frac_reward_zero_std": 0.0, "grad_norm": 3.139285763609223e-05, "kl": 1.9367534266784787, "learning_rate": 9.99165540081904e-05, "loss": 0.0036, "num_tokens": 1668978.0, "reward": -0.41765451431274414, "reward_std": 0.1612691581249237, "rewards/rollout_reward_func/mean": -0.41765451431274414, "rewards/rollout_reward_func/std": 0.17802941799163818, "sampling/importance_sampling_ratio/max": 1.0000407695770264, "sampling/importance_sampling_ratio/mean": 1.0000057220458984, "sampling/importance_sampling_ratio/min": 0.9999812841415405, "sampling/sampling_logp_difference/max": 4.07575280405581e-05, "sampling/sampling_logp_difference/mean": 6.66259802528657e-06, "step": 48, "step_time": 11.587671636999858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006365674234984908, "epoch": 0.00098, "frac_reward_zero_std": 0.0, "grad_norm": 2.2500289560412057e-05, "kl": 2.572330966591835, "learning_rate": 9.990207316513463e-05, "loss": 0.0048, "num_tokens": 1697807.0, "reward": -0.5218106508255005, "reward_std": 0.10971593111753464, "rewards/rollout_reward_func/mean": -0.5218106508255005, "rewards/rollout_reward_func/std": 0.11586499959230423, "sampling/importance_sampling_ratio/max": 1.00003981590271, "sampling/importance_sampling_ratio/mean": 1.0000057220458984, "sampling/importance_sampling_ratio/min": 0.999975323677063, "sampling/sampling_logp_difference/max": 3.980120527558029e-05, "sampling/sampling_logp_difference/mean": 7.030243068584241e-06, "step": 49, "step_time": 12.058197169999858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006443986458180007, "epoch": 0.001, "frac_reward_zero_std": 0.0, "grad_norm": 2.7668247639667243e-05, "kl": 2.39994593441952, "learning_rate": 9.98864359505076e-05, "loss": 0.0045, "num_tokens": 1732535.0, "reward": -0.4483959674835205, "reward_std": 0.1457299292087555, "rewards/rollout_reward_func/mean": -0.4483959674835205, "rewards/rollout_reward_func/std": 0.15753024816513062, "sampling/importance_sampling_ratio/max": 1.0000965595245361, "sampling/importance_sampling_ratio/mean": 1.000012755393982, "sampling/importance_sampling_ratio/min": 0.9999797344207764, "sampling/sampling_logp_difference/max": 9.653686720412225e-05, "sampling/sampling_logp_difference/mean": 1.4131255738902837e-05, "step": 50, "step_time": 10.359382950999816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006518985392176546, "epoch": 0.00102, "frac_reward_zero_std": 0.0, "grad_norm": 3.547618689481169e-05, "kl": 1.7859944765614273, "learning_rate": 9.986964284776992e-05, "loss": 0.0033, "num_tokens": 1763607.0, "reward": -0.47022631764411926, "reward_std": 0.16651226580142975, "rewards/rollout_reward_func/mean": -0.47022631764411926, "rewards/rollout_reward_func/std": 0.17189878225326538, "sampling/importance_sampling_ratio/max": 1.0000604391098022, "sampling/importance_sampling_ratio/mean": 1.000004768371582, "sampling/importance_sampling_ratio/min": 0.9999511241912842, "sampling/sampling_logp_difference/max": 6.0423146351240575e-05, "sampling/sampling_logp_difference/mean": 8.4705961853615e-06, "step": 51, "step_time": 12.569274266999855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006323890884232242, "epoch": 0.00104, "frac_reward_zero_std": 0.0, "grad_norm": 1.1937954695895314e-05, "kl": 2.6566081009805202, "learning_rate": 9.985169437611922e-05, "loss": 0.005, "num_tokens": 1796876.0, "reward": -0.509475827217102, "reward_std": 0.11160407960414886, "rewards/rollout_reward_func/mean": -0.509475827217102, "rewards/rollout_reward_func/std": 0.1130366399884224, "sampling/importance_sampling_ratio/max": 1.0000765323638916, "sampling/importance_sampling_ratio/mean": 1.0000137090682983, "sampling/importance_sampling_ratio/min": 0.9999876618385315, "sampling/sampling_logp_difference/max": 7.65095028327778e-05, "sampling/sampling_logp_difference/mean": 1.4244134035834577e-05, "step": 52, "step_time": 10.43953393899983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006390170674421825, "epoch": 0.00106, "frac_reward_zero_std": 0.0, "grad_norm": 2.9535973226302303e-05, "kl": 1.9292238809430273, "learning_rate": 9.983259109047396e-05, "loss": 0.0036, "num_tokens": 1830883.0, "reward": -0.4491024613380432, "reward_std": 0.15517105162143707, "rewards/rollout_reward_func/mean": -0.4491024613380432, "rewards/rollout_reward_func/std": 0.16186949610710144, "sampling/importance_sampling_ratio/max": 1.0000708103179932, "sampling/importance_sampling_ratio/mean": 1.0000100135803223, "sampling/importance_sampling_ratio/min": 0.9999610185623169, "sampling/sampling_logp_difference/max": 7.079487841110677e-05, "sampling/sampling_logp_difference/mean": 1.1913212802028283e-05, "step": 53, "step_time": 12.08518075500001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006665495311608538, "epoch": 0.00108, "frac_reward_zero_std": 0.0, "grad_norm": 1.6661115296301432e-05, "kl": 2.2144708826672286, "learning_rate": 9.981233358145643e-05, "loss": 0.0042, "num_tokens": 1862673.0, "reward": -0.4556786119937897, "reward_std": 0.16825318336486816, "rewards/rollout_reward_func/mean": -0.4556786119937897, "rewards/rollout_reward_func/std": 0.1777384877204895, "sampling/importance_sampling_ratio/max": 1.0000735521316528, "sampling/importance_sampling_ratio/mean": 1.0000085830688477, "sampling/importance_sampling_ratio/min": 0.999980628490448, "sampling/sampling_logp_difference/max": 7.353299588430673e-05, "sampling/sampling_logp_difference/mean": 1.047209843818564e-05, "step": 54, "step_time": 11.007667734000506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006211438776517753, "epoch": 0.0011, "frac_reward_zero_std": 0.0, "grad_norm": 1.1390176041459199e-05, "kl": 3.496194064617157, "learning_rate": 9.979092247537435e-05, "loss": 0.0066, "num_tokens": 1894457.0, "reward": -0.5075165033340454, "reward_std": 0.13905443251132965, "rewards/rollout_reward_func/mean": -0.5075165033340454, "rewards/rollout_reward_func/std": 0.14110973477363586, "sampling/importance_sampling_ratio/max": 1.000069260597229, "sampling/importance_sampling_ratio/mean": 1.0000133514404297, "sampling/importance_sampling_ratio/min": 0.999997615814209, "sampling/sampling_logp_difference/max": 6.924373155925423e-05, "sampling/sampling_logp_difference/mean": 1.3386375940172002e-05, "step": 55, "step_time": 11.282138439999471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006376511955750175, "epoch": 0.00112, "frac_reward_zero_std": 0.0, "grad_norm": 1.527481435914524e-05, "kl": 2.392844100482762, "learning_rate": 9.976835843420156e-05, "loss": 0.0045, "num_tokens": 1927719.0, "reward": -0.4271199703216553, "reward_std": 0.20685634016990662, "rewards/rollout_reward_func/mean": -0.4271199703216553, "rewards/rollout_reward_func/std": 0.20447339117527008, "sampling/importance_sampling_ratio/max": 1.0000630617141724, "sampling/importance_sampling_ratio/mean": 1.0000090599060059, "sampling/importance_sampling_ratio/min": 0.9999785423278809, "sampling/sampling_logp_difference/max": 6.304663838818669e-05, "sampling/sampling_logp_difference/mean": 9.967272490030155e-06, "step": 56, "step_time": 11.81715834400029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006685723128612153, "epoch": 0.00114, "frac_reward_zero_std": 0.0, "grad_norm": 2.624979788379278e-05, "kl": 2.3886548094451427, "learning_rate": 9.974464215555756e-05, "loss": 0.0045, "num_tokens": 1959495.0, "reward": -0.4609224796295166, "reward_std": 0.12867671251296997, "rewards/rollout_reward_func/mean": -0.4609224796295166, "rewards/rollout_reward_func/std": 0.14006492495536804, "sampling/importance_sampling_ratio/max": 1.0000592470169067, "sampling/importance_sampling_ratio/mean": 1.0000089406967163, "sampling/importance_sampling_ratio/min": 0.9999776482582092, "sampling/sampling_logp_difference/max": 5.92273281654343e-05, "sampling/sampling_logp_difference/mean": 1.0794034096761607e-05, "step": 57, "step_time": 10.286673817000292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006622151267947629, "epoch": 0.00116, "frac_reward_zero_std": 0.0, "grad_norm": 1.8251206711283885e-05, "kl": 2.1817052587866783, "learning_rate": 9.971977437268594e-05, "loss": 0.0041, "num_tokens": 1992702.0, "reward": -0.44977936148643494, "reward_std": 0.11797778308391571, "rewards/rollout_reward_func/mean": -0.44977936148643494, "rewards/rollout_reward_func/std": 0.11889227479696274, "sampling/importance_sampling_ratio/max": 1.0000721216201782, "sampling/importance_sampling_ratio/mean": 1.0000098943710327, "sampling/importance_sampling_ratio/min": 0.9999783039093018, "sampling/sampling_logp_difference/max": 7.20885582268238e-05, "sampling/sampling_logp_difference/mean": 1.1335238013998605e-05, "step": 58, "step_time": 11.882596950000561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006698454853903968, "epoch": 0.00118, "frac_reward_zero_std": 0.0, "grad_norm": 2.0426890841918066e-05, "kl": 2.7415281403809786, "learning_rate": 9.969375585443172e-05, "loss": 0.0051, "num_tokens": 2023008.0, "reward": -0.45059582591056824, "reward_std": 0.133949413895607, "rewards/rollout_reward_func/mean": -0.45059582591056824, "rewards/rollout_reward_func/std": 0.15718908607959747, "sampling/importance_sampling_ratio/max": 1.0000840425491333, "sampling/importance_sampling_ratio/mean": 1.0000112056732178, "sampling/importance_sampling_ratio/min": 0.999981701374054, "sampling/sampling_logp_difference/max": 8.402469393331558e-05, "sampling/sampling_logp_difference/mean": 1.222399623657111e-05, "step": 59, "step_time": 10.513273939999635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006553210005222354, "epoch": 0.0012, "frac_reward_zero_std": 0.0, "grad_norm": 2.9337732485146262e-05, "kl": 2.6224523090447036, "learning_rate": 9.966658740521754e-05, "loss": 0.0049, "num_tokens": 2054084.0, "reward": -0.5213782787322998, "reward_std": 0.11130588501691818, "rewards/rollout_reward_func/mean": -0.5213782787322998, "rewards/rollout_reward_func/std": 0.13300269842147827, "sampling/importance_sampling_ratio/max": 1.000054121017456, "sampling/importance_sampling_ratio/mean": 1.0000088214874268, "sampling/importance_sampling_ratio/min": 0.9999767541885376, "sampling/sampling_logp_difference/max": 5.4102332796901464e-05, "sampling/sampling_logp_difference/mean": 1.0590176316327415e-05, "step": 60, "step_time": 11.05362786600017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006773007989977486, "epoch": 0.00122, "frac_reward_zero_std": 0.0, "grad_norm": 1.810842877603136e-05, "kl": 3.1815832147064356, "learning_rate": 9.963826986501882e-05, "loss": 0.006, "num_tokens": 2084335.0, "reward": -0.4689787030220032, "reward_std": 0.08655819296836853, "rewards/rollout_reward_func/mean": -0.4689787030220032, "rewards/rollout_reward_func/std": 0.09671928733587265, "sampling/importance_sampling_ratio/max": 1.0000590085983276, "sampling/importance_sampling_ratio/mean": 1.0000102519989014, "sampling/importance_sampling_ratio/min": 0.9999881982803345, "sampling/sampling_logp_difference/max": 5.898933159187436e-05, "sampling/sampling_logp_difference/mean": 1.1276682016614359e-05, "step": 61, "step_time": 9.853760416000114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006641297259193379, "epoch": 0.00124, "frac_reward_zero_std": 0.0, "grad_norm": 3.755722354981117e-05, "kl": 2.3530580727383494, "learning_rate": 9.960880410933783e-05, "loss": 0.0044, "num_tokens": 2116836.0, "reward": -0.48839297890663147, "reward_std": 0.11407047510147095, "rewards/rollout_reward_func/mean": -0.48839297890663147, "rewards/rollout_reward_func/std": 0.13152439892292023, "sampling/importance_sampling_ratio/max": 1.0000629425048828, "sampling/importance_sampling_ratio/mean": 1.0000091791152954, "sampling/importance_sampling_ratio/min": 0.9999599456787109, "sampling/sampling_logp_difference/max": 6.291928002610803e-05, "sampling/sampling_logp_difference/mean": 1.072325358109083e-05, "step": 62, "step_time": 12.218522935000692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006711007954436354, "epoch": 0.00126, "frac_reward_zero_std": 0.0, "grad_norm": 2.2082309442339465e-05, "kl": 2.3587748343124986, "learning_rate": 9.957819104917648e-05, "loss": 0.0044, "num_tokens": 2147147.0, "reward": -0.4642190635204315, "reward_std": 0.17485395073890686, "rewards/rollout_reward_func/mean": -0.4642190635204315, "rewards/rollout_reward_func/std": 0.17033183574676514, "sampling/importance_sampling_ratio/max": 1.0000776052474976, "sampling/importance_sampling_ratio/mean": 1.0000108480453491, "sampling/importance_sampling_ratio/min": 0.9999765753746033, "sampling/sampling_logp_difference/max": 7.758568972349167e-05, "sampling/sampling_logp_difference/mean": 1.2784492355422117e-05, "step": 63, "step_time": 10.816209253000125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006661249899480026, "epoch": 0.00128, "frac_reward_zero_std": 0.0, "grad_norm": 2.198375659645535e-05, "kl": 3.386210630647838, "learning_rate": 9.954643163100835e-05, "loss": 0.0063, "num_tokens": 2175955.0, "reward": -0.4366099238395691, "reward_std": 0.11404688656330109, "rewards/rollout_reward_func/mean": -0.4366099238395691, "rewards/rollout_reward_func/std": 0.11409632116556168, "sampling/importance_sampling_ratio/max": 1.0000742673873901, "sampling/importance_sampling_ratio/mean": 1.0000113248825073, "sampling/importance_sampling_ratio/min": 0.9999960660934448, "sampling/sampling_logp_difference/max": 7.424886280205101e-05, "sampling/sampling_logp_difference/mean": 1.218299803440459e-05, "step": 64, "step_time": 11.2057395060001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006794004984840285, "epoch": 0.0013, "frac_reward_zero_std": 0.0, "grad_norm": 5.13264058099594e-05, "kl": 3.4261841475963593, "learning_rate": 9.951352683674924e-05, "loss": 0.0064, "num_tokens": 2204870.0, "reward": -0.47110214829444885, "reward_std": 0.14154572784900665, "rewards/rollout_reward_func/mean": -0.47110214829444885, "rewards/rollout_reward_func/std": 0.15598046779632568, "sampling/importance_sampling_ratio/max": 1.0000603199005127, "sampling/importance_sampling_ratio/mean": 1.0000087022781372, "sampling/importance_sampling_ratio/min": 0.999940037727356, "sampling/sampling_logp_difference/max": 6.02992222411558e-05, "sampling/sampling_logp_difference/mean": 1.1587392691581044e-05, "step": 65, "step_time": 10.006985341000927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006701685379084665, "epoch": 0.00132, "frac_reward_zero_std": 0.0, "grad_norm": 3.4127660910598934e-05, "kl": 3.0201471662148833, "learning_rate": 9.947947768372698e-05, "loss": 0.0057, "num_tokens": 2234457.0, "reward": -0.5099438428878784, "reward_std": 0.09435595571994781, "rewards/rollout_reward_func/mean": -0.5099438428878784, "rewards/rollout_reward_func/std": 0.09857697784900665, "sampling/importance_sampling_ratio/max": 1.0000659227371216, "sampling/importance_sampling_ratio/mean": 1.0000109672546387, "sampling/importance_sampling_ratio/min": 0.9999938607215881, "sampling/sampling_logp_difference/max": 6.590578414034098e-05, "sampling/sampling_logp_difference/mean": 1.2094817066099495e-05, "step": 66, "step_time": 11.312162051000314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006757073315384332, "epoch": 0.00134, "frac_reward_zero_std": 0.0, "grad_norm": 2.9790588087053038e-05, "kl": 2.9807451650267467, "learning_rate": 9.944428522464987e-05, "loss": 0.0056, "num_tokens": 2265528.0, "reward": -0.46639153361320496, "reward_std": 0.15289044380187988, "rewards/rollout_reward_func/mean": -0.46639153361320496, "rewards/rollout_reward_func/std": 0.15913020074367523, "sampling/importance_sampling_ratio/max": 1.0000736713409424, "sampling/importance_sampling_ratio/mean": 1.0000100135803223, "sampling/importance_sampling_ratio/min": 0.9999826550483704, "sampling/sampling_logp_difference/max": 7.36506626708433e-05, "sampling/sampling_logp_difference/mean": 1.1719173926394433e-05, "step": 67, "step_time": 10.908072316999778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006568060161953326, "epoch": 0.00136, "frac_reward_zero_std": 0.0, "grad_norm": 3.412132355151698e-05, "kl": 3.028280718252063, "learning_rate": 9.940795054757413e-05, "loss": 0.0057, "num_tokens": 2295782.0, "reward": -0.4513282775878906, "reward_std": 0.12359032034873962, "rewards/rollout_reward_func/mean": -0.4513282775878906, "rewards/rollout_reward_func/std": 0.1289566457271576, "sampling/importance_sampling_ratio/max": 1.0000699758529663, "sampling/importance_sampling_ratio/mean": 1.0000100135803223, "sampling/importance_sampling_ratio/min": 0.9999933242797852, "sampling/sampling_logp_difference/max": 6.99566735420376e-05, "sampling/sampling_logp_difference/mean": 1.1112841093563475e-05, "step": 68, "step_time": 11.857117537999557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006762990342394914, "epoch": 0.00138, "frac_reward_zero_std": 0.0, "grad_norm": 2.2748485207557678e-05, "kl": 2.8810745738446712, "learning_rate": 9.937047477587032e-05, "loss": 0.0054, "num_tokens": 2326067.0, "reward": -0.48495256900787354, "reward_std": 0.10630611330270767, "rewards/rollout_reward_func/mean": -0.48495256900787354, "rewards/rollout_reward_func/std": 0.1172296553850174, "sampling/importance_sampling_ratio/max": 1.0000613927841187, "sampling/importance_sampling_ratio/mean": 1.000011920928955, "sampling/importance_sampling_ratio/min": 0.9999844431877136, "sampling/sampling_logp_difference/max": 6.137110176496208e-05, "sampling/sampling_logp_difference/mean": 1.3601927093986887e-05, "step": 69, "step_time": 10.554767250000168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0007178151245170739, "epoch": 0.0014, "frac_reward_zero_std": 0.0, "grad_norm": 4.0173592424253e-05, "kl": 2.523578153923154, "learning_rate": 9.933185906818858e-05, "loss": 0.0047, "num_tokens": 2356445.0, "reward": -0.5431434512138367, "reward_std": 0.09289485216140747, "rewards/rollout_reward_func/mean": -0.5431434512138367, "rewards/rollout_reward_func/std": 0.11382929980754852, "sampling/importance_sampling_ratio/max": 1.000108003616333, "sampling/importance_sampling_ratio/mean": 1.000010371208191, "sampling/importance_sampling_ratio/min": 0.9999796152114868, "sampling/sampling_logp_difference/max": 0.00010799059236887842, "sampling/sampling_logp_difference/mean": 1.2703325410257094e-05, "step": 70, "step_time": 10.974104634000241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006705400555802044, "epoch": 0.00142, "frac_reward_zero_std": 0.0, "grad_norm": 2.3166499886428937e-05, "kl": 3.6824486549012363, "learning_rate": 9.929210461842278e-05, "loss": 0.0069, "num_tokens": 2383101.0, "reward": -0.47830814123153687, "reward_std": 0.14856326580047607, "rewards/rollout_reward_func/mean": -0.47830814123153687, "rewards/rollout_reward_func/std": 0.14815828204154968, "sampling/importance_sampling_ratio/max": 1.0000710487365723, "sampling/importance_sampling_ratio/mean": 1.0000113248825073, "sampling/importance_sampling_ratio/min": 0.9999920129776001, "sampling/sampling_logp_difference/max": 7.103009556885809e-05, "sampling/sampling_logp_difference/mean": 1.2573122148751281e-05, "step": 71, "step_time": 11.173864316999925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006955651369935367, "epoch": 0.00144, "frac_reward_zero_std": 0.0, "grad_norm": 3.546903462847695e-05, "kl": 2.761172599857673, "learning_rate": 9.925121265567366e-05, "loss": 0.0052, "num_tokens": 2413413.0, "reward": -0.4632667899131775, "reward_std": 0.15058688819408417, "rewards/rollout_reward_func/mean": -0.4632667899131775, "rewards/rollout_reward_func/std": 0.16061994433403015, "sampling/importance_sampling_ratio/max": 1.0000720024108887, "sampling/importance_sampling_ratio/mean": 1.0000090599060059, "sampling/importance_sampling_ratio/min": 0.9999868273735046, "sampling/sampling_logp_difference/max": 7.198394450824708e-05, "sampling/sampling_logp_difference/mean": 1.0825106073752977e-05, "step": 72, "step_time": 11.13052461299958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.000684412763803266, "epoch": 0.00146, "frac_reward_zero_std": 0.0, "grad_norm": 2.656666219991166e-05, "kl": 2.885673111770302, "learning_rate": 9.920918444421082e-05, "loss": 0.0054, "num_tokens": 2443679.0, "reward": -0.45213764905929565, "reward_std": 0.17062178254127502, "rewards/rollout_reward_func/mean": -0.45213764905929565, "rewards/rollout_reward_func/std": 0.16972234845161438, "sampling/importance_sampling_ratio/max": 1.0000693798065186, "sampling/importance_sampling_ratio/mean": 1.000009536743164, "sampling/importance_sampling_ratio/min": 0.9999594688415527, "sampling/sampling_logp_difference/max": 6.936074350960553e-05, "sampling/sampling_logp_difference/mean": 1.2211980902065989e-05, "step": 73, "step_time": 11.717069765000133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006837020846433006, "epoch": 0.00148, "frac_reward_zero_std": 0.0, "grad_norm": 5.652607069350779e-05, "kl": 3.2031972631812096, "learning_rate": 9.916602128343356e-05, "loss": 0.006, "num_tokens": 2473305.0, "reward": -0.45073336362838745, "reward_std": 0.1874859631061554, "rewards/rollout_reward_func/mean": -0.45073336362838745, "rewards/rollout_reward_func/std": 0.19179198145866394, "sampling/importance_sampling_ratio/max": 1.0000991821289062, "sampling/importance_sampling_ratio/mean": 1.0000112056732178, "sampling/importance_sampling_ratio/min": 0.9999768733978271, "sampling/sampling_logp_difference/max": 9.916012641042471e-05, "sampling/sampling_logp_difference/mean": 1.318043996434426e-05, "step": 74, "step_time": 10.713412969000046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.0006799578113714233, "epoch": 0.0015, "frac_reward_zero_std": 0.0, "grad_norm": 3.1676881917519495e-05, "kl": 3.322963882237673, "learning_rate": 9.91217245078308e-05, "loss": 0.0062, "num_tokens": 2501412.0, "reward": -0.4546876847743988, "reward_std": 0.14404551684856415, "rewards/rollout_reward_func/mean": -0.4546876847743988, "rewards/rollout_reward_func/std": 0.14299967885017395, "sampling/importance_sampling_ratio/max": 1.0000700950622559, "sampling/importance_sampling_ratio/mean": 1.0000104904174805, "sampling/importance_sampling_ratio/min": 0.9999879002571106, "sampling/sampling_logp_difference/max": 7.007511158008128e-05, "sampling/sampling_logp_difference/mean": 1.1978003385593183e-05, "step": 75, "step_time": 11.524892917999523 } ], "logging_steps": 1.0, "max_steps": 600, "num_input_tokens_seen": 2501412, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }