{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0017699115044247, "eval_steps": 500, "global_step": 566, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 144.484375, "completions/mean_terminated_length": 144.484375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.4271422028541565, "epoch": 0.0017699115044247787, "frac_reward_zero_std": 0.75, "grad_norm": 1.1006226002289405, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0047, "num_tokens": 18911.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.3433542251586914, "sampling/importance_sampling_ratio/mean": 1.000240683555603, "sampling/importance_sampling_ratio/min": 0.653702437877655, "sampling/sampling_logp_difference/max": 0.42510294914245605, "sampling/sampling_logp_difference/mean": 0.015977157279849052, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 209.75, "completions/mean_terminated_length": 209.75, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.4180360436439514, "epoch": 0.0035398230088495575, "frac_reward_zero_std": 0.75, "grad_norm": 0.7553122903465309, "kl": 0.0, "learning_rate": 8.849557522123893e-09, "loss": -0.015, "num_tokens": 42495.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.371575951576233, "sampling/importance_sampling_ratio/mean": 0.9999609589576721, "sampling/importance_sampling_ratio/min": 0.6392531991004944, "sampling/sampling_logp_difference/max": 0.44745469093322754, "sampling/sampling_logp_difference/mean": 0.015478897839784622, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 261.46875, "completions/mean_terminated_length": 261.46875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.6020745635032654, "epoch": 0.005309734513274336, "frac_reward_zero_std": 0.5, "grad_norm": 0.8658353718902592, "kl": 0.00044076767517253757, "learning_rate": 1.7699115044247786e-08, "loss": 0.0159, "num_tokens": 72909.0, "reward": 0.34375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5477291345596313, "sampling/importance_sampling_ratio/mean": 0.9996609687805176, "sampling/importance_sampling_ratio/min": 0.6928660869598389, "sampling/sampling_logp_difference/max": 0.43678879737854004, "sampling/sampling_logp_difference/mean": 0.017934903502464294, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 213.875, "completions/mean_terminated_length": 213.875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.5642544031143188, "epoch": 0.007079646017699115, "frac_reward_zero_std": 0.5, "grad_norm": 1.2078506186870932, "kl": 0.0004340629675425589, "learning_rate": 2.654867256637168e-08, "loss": -0.0096, "num_tokens": 98837.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.492296576499939, "sampling/importance_sampling_ratio/mean": 1.0004456043243408, "sampling/importance_sampling_ratio/min": 0.6158694624900818, "sampling/sampling_logp_difference/max": 0.48472023010253906, "sampling/sampling_logp_difference/mean": 0.01798402890563011, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 148.0625, "completions/mean_terminated_length": 148.0625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.41338294744491577, "epoch": 0.008849557522123894, "frac_reward_zero_std": 1.0, "grad_norm": 0.00233020676983444, "kl": 0.0006641986547037959, "learning_rate": 3.539823008849557e-08, "loss": 0.0, "num_tokens": 119753.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5282143354415894, "sampling/importance_sampling_ratio/mean": 1.0009156465530396, "sampling/importance_sampling_ratio/min": 0.6264251470565796, "sampling/sampling_logp_difference/max": 0.4677259922027588, "sampling/sampling_logp_difference/mean": 0.016537081450223923, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 152.03125, "completions/mean_terminated_length": 152.03125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.30602705478668213, "epoch": 0.010619469026548672, "frac_reward_zero_std": 0.75, "grad_norm": 1.1710728518813673, "kl": 0.0006627484108321369, "learning_rate": 4.424778761061947e-08, "loss": 0.0178, "num_tokens": 140379.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4749361276626587, "sampling/importance_sampling_ratio/mean": 0.9997684955596924, "sampling/importance_sampling_ratio/min": 0.4771636128425598, "sampling/sampling_logp_difference/max": 0.7398958206176758, "sampling/sampling_logp_difference/mean": 0.014786459505558014, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 152.28125, "completions/mean_terminated_length": 152.28125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.48392254114151, "epoch": 0.012389380530973451, "frac_reward_zero_std": 0.5, "grad_norm": 1.4522039803604556, "kl": 0.0007441140478476882, "learning_rate": 5.309734513274336e-08, "loss": -0.0063, "num_tokens": 164637.0, "reward": 0.125, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4044241905212402, "sampling/importance_sampling_ratio/mean": 0.9991136789321899, "sampling/importance_sampling_ratio/min": 0.623917818069458, "sampling/sampling_logp_difference/max": 0.4717366695404053, "sampling/sampling_logp_difference/mean": 0.01828671619296074, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 196.265625, "completions/mean_terminated_length": 196.265625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.5522291660308838, "epoch": 0.01415929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022623975075869215, "kl": 0.0005923404823988676, "learning_rate": 6.194690265486725e-08, "loss": 0.0, "num_tokens": 188174.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4031603336334229, "sampling/importance_sampling_ratio/mean": 0.9999065399169922, "sampling/importance_sampling_ratio/min": 0.6051700115203857, "sampling/sampling_logp_difference/max": 0.5022459030151367, "sampling/sampling_logp_difference/mean": 0.019137494266033173, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 121.09375, "completions/mean_terminated_length": 121.09375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.25603991746902466, "epoch": 0.01592920353982301, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029169860087568922, "kl": 0.0005756879108957946, "learning_rate": 7.079646017699114e-08, "loss": 0.0, "num_tokens": 205732.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5759655237197876, "sampling/importance_sampling_ratio/mean": 1.0007699728012085, "sampling/importance_sampling_ratio/min": 0.7221340537071228, "sampling/sampling_logp_difference/max": 0.4548680782318115, "sampling/sampling_logp_difference/mean": 0.012160948477685452, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 170.234375, "completions/mean_terminated_length": 170.234375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.4174914062023163, "epoch": 0.017699115044247787, "frac_reward_zero_std": 0.5, "grad_norm": 1.456361804165413, "kl": 0.0005390376900322735, "learning_rate": 7.964601769911503e-08, "loss": 0.0397, "num_tokens": 228307.0, "reward": 0.65625, "reward_std": 0.4597553312778473, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.3539308309555054, "sampling/importance_sampling_ratio/mean": 1.0001657009124756, "sampling/importance_sampling_ratio/min": 0.6430124640464783, "sampling/sampling_logp_difference/max": 0.4415912628173828, "sampling/sampling_logp_difference/mean": 0.01574782468378544, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 176.6875, "completions/mean_terminated_length": 176.6875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.426904559135437, "epoch": 0.019469026548672566, "frac_reward_zero_std": 0.75, "grad_norm": 0.7685811919938644, "kl": 0.0005052468040958047, "learning_rate": 8.849557522123894e-08, "loss": 0.0164, "num_tokens": 249599.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.69903564453125, "sampling/importance_sampling_ratio/mean": 1.0001137256622314, "sampling/importance_sampling_ratio/min": 0.6657065153121948, "sampling/sampling_logp_difference/max": 0.5300607681274414, "sampling/sampling_logp_difference/mean": 0.015404904261231422, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 137.78125, "completions/mean_terminated_length": 137.78125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.3701123595237732, "epoch": 0.021238938053097345, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031209918838396716, "kl": 0.0006685380358248949, "learning_rate": 9.734513274336283e-08, "loss": 0.0, "num_tokens": 268721.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000590205192566, "sampling/importance_sampling_ratio/min": 0.6504923701286316, "sampling/sampling_logp_difference/max": 0.8355003595352173, "sampling/sampling_logp_difference/mean": 0.01659131795167923, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 181.109375, "completions/mean_terminated_length": 181.109375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.42291828989982605, "epoch": 0.023008849557522124, "frac_reward_zero_std": 0.5, "grad_norm": 1.1080865101289201, "kl": 0.0004783940967172384, "learning_rate": 1.0619469026548672e-07, "loss": 0.0108, "num_tokens": 291672.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.578126311302185, "sampling/importance_sampling_ratio/mean": 1.0000258684158325, "sampling/importance_sampling_ratio/min": 0.6148504018783569, "sampling/sampling_logp_difference/max": 0.4863762855529785, "sampling/sampling_logp_difference/mean": 0.015482140704989433, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 215.453125, "completions/mean_terminated_length": 215.453125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.5591689348220825, "epoch": 0.024778761061946902, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020184872105857158, "kl": 0.0005135840037837625, "learning_rate": 1.1504424778761061e-07, "loss": 0.0, "num_tokens": 318917.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.460014820098877, "sampling/importance_sampling_ratio/mean": 0.9999944567680359, "sampling/importance_sampling_ratio/min": 0.6955029964447021, "sampling/sampling_logp_difference/max": 0.3784465789794922, "sampling/sampling_logp_difference/mean": 0.01834474503993988, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 211.78125, "completions/mean_terminated_length": 211.78125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.46811556816101074, "epoch": 0.02654867256637168, "frac_reward_zero_std": 0.5, "grad_norm": 0.9886943375248237, "kl": 0.0005040219402872026, "learning_rate": 1.238938053097345e-07, "loss": 0.0123, "num_tokens": 345671.0, "reward": 0.5, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5770877599716187, "sampling/importance_sampling_ratio/mean": 1.0002729892730713, "sampling/importance_sampling_ratio/min": 0.5805851221084595, "sampling/sampling_logp_difference/max": 0.5437189340591431, "sampling/sampling_logp_difference/mean": 0.01677098125219345, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 195.515625, "completions/mean_terminated_length": 195.515625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.2952774167060852, "epoch": 0.02831858407079646, "frac_reward_zero_std": 0.75, "grad_norm": 0.8590553181921636, "kl": 0.0004932324518449605, "learning_rate": 1.327433628318584e-07, "loss": -0.0, "num_tokens": 369400.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.510075569152832, "sampling/importance_sampling_ratio/mean": 0.9999840259552002, "sampling/importance_sampling_ratio/min": 0.5866603255271912, "sampling/sampling_logp_difference/max": 0.5333092212677002, "sampling/sampling_logp_difference/mean": 0.01272563450038433, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 227.71875, "completions/mean_terminated_length": 227.71875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.44876500964164734, "epoch": 0.03008849557522124, "frac_reward_zero_std": 0.5, "grad_norm": 0.917896025193256, "kl": 0.000513352919369936, "learning_rate": 1.4159292035398229e-07, "loss": -0.0413, "num_tokens": 394518.0, "reward": 0.53125, "reward_std": 0.42516323924064636, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5742335319519043, "sampling/importance_sampling_ratio/mean": 0.9999843835830688, "sampling/importance_sampling_ratio/min": 0.6992347240447998, "sampling/sampling_logp_difference/max": 0.4537684917449951, "sampling/sampling_logp_difference/mean": 0.01575635001063347, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.34195274114608765, "epoch": 0.03185840707964602, "frac_reward_zero_std": 0.25, "grad_norm": 1.3005797089893403, "kl": 0.0004412387206684798, "learning_rate": 1.504424778761062e-07, "loss": 0.0777, "num_tokens": 418870.0, "reward": 0.8125, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6629990339279175, "sampling/importance_sampling_ratio/mean": 0.9997455477714539, "sampling/importance_sampling_ratio/min": 0.7294662594795227, "sampling/sampling_logp_difference/max": 0.5086226463317871, "sampling/sampling_logp_difference/mean": 0.012046229094266891, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 175.28125, "completions/mean_terminated_length": 175.28125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.4359934329986572, "epoch": 0.033628318584070796, "frac_reward_zero_std": 0.5, "grad_norm": 1.0600244106746555, "kl": 0.00039431609911844134, "learning_rate": 1.5929203539823007e-07, "loss": 0.0539, "num_tokens": 441736.0, "reward": 0.0625, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.3352776765823364, "sampling/importance_sampling_ratio/mean": 1.0002167224884033, "sampling/importance_sampling_ratio/min": 0.6985317468643188, "sampling/sampling_logp_difference/max": 0.35877466201782227, "sampling/sampling_logp_difference/mean": 0.015411154367029667, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 130.890625, "completions/mean_terminated_length": 130.890625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.3688535988330841, "epoch": 0.035398230088495575, "frac_reward_zero_std": 0.75, "grad_norm": 1.0783491352584433, "kl": 0.0005861029494553804, "learning_rate": 1.68141592920354e-07, "loss": -0.0158, "num_tokens": 460049.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4999357461929321, "sampling/importance_sampling_ratio/mean": 1.000003695487976, "sampling/importance_sampling_ratio/min": 0.6543043851852417, "sampling/sampling_logp_difference/max": 0.424182653427124, "sampling/sampling_logp_difference/mean": 0.01564677618443966, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 192.875, "completions/mean_terminated_length": 192.875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.36204296350479126, "epoch": 0.03716814159292035, "frac_reward_zero_std": 0.75, "grad_norm": 0.8896725406897013, "kl": 0.00045308793778531253, "learning_rate": 1.7699115044247788e-07, "loss": 0.0343, "num_tokens": 482745.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.435202717781067, "sampling/importance_sampling_ratio/mean": 0.9992131590843201, "sampling/importance_sampling_ratio/min": 0.6173391342163086, "sampling/sampling_logp_difference/max": 0.48233675956726074, "sampling/sampling_logp_difference/mean": 0.014494507573544979, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 220.671875, "completions/mean_terminated_length": 220.671875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.45146217942237854, "epoch": 0.03893805309734513, "frac_reward_zero_std": 0.75, "grad_norm": 0.8407237808788562, "kl": 0.0005051965126767755, "learning_rate": 1.8584070796460178e-07, "loss": -0.0017, "num_tokens": 509556.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4402657747268677, "sampling/importance_sampling_ratio/mean": 1.0001249313354492, "sampling/importance_sampling_ratio/min": 0.6482202410697937, "sampling/sampling_logp_difference/max": 0.4335247278213501, "sampling/sampling_logp_difference/mean": 0.015440289862453938, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 199.96875, "completions/mean_terminated_length": 199.96875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.46031802892684937, "epoch": 0.04070796460176991, "frac_reward_zero_std": 0.25, "grad_norm": 1.4091022344664308, "kl": 0.0005111101781949401, "learning_rate": 1.9469026548672566e-07, "loss": 0.0234, "num_tokens": 532722.0, "reward": 0.4375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.442039966583252, "sampling/importance_sampling_ratio/mean": 1.0001800060272217, "sampling/importance_sampling_ratio/min": 0.6624577641487122, "sampling/sampling_logp_difference/max": 0.41179847717285156, "sampling/sampling_logp_difference/mean": 0.016388865187764168, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 192.15625, "completions/mean_terminated_length": 192.15625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.3538476824760437, "epoch": 0.04247787610619469, "frac_reward_zero_std": 0.75, "grad_norm": 0.8647717655608781, "kl": 0.0006137521704658866, "learning_rate": 2.0353982300884956e-07, "loss": -0.024, "num_tokens": 555900.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.5973563194274902, "sampling/importance_sampling_ratio/mean": 0.9996161460876465, "sampling/importance_sampling_ratio/min": 0.5685754418373108, "sampling/sampling_logp_difference/max": 0.5646212697029114, "sampling/sampling_logp_difference/mean": 0.015036176890134811, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 193.9375, "completions/mean_terminated_length": 193.9375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.45706120133399963, "epoch": 0.04424778761061947, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021691898248266707, "kl": 0.0005267454544082284, "learning_rate": 2.1238938053097344e-07, "loss": 0.0, "num_tokens": 580744.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5684353113174438, "sampling/importance_sampling_ratio/mean": 1.0003420114517212, "sampling/importance_sampling_ratio/min": 0.6095874309539795, "sampling/sampling_logp_difference/max": 0.49497294425964355, "sampling/sampling_logp_difference/mean": 0.015996476635336876, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 258.984375, "completions/mean_terminated_length": 258.984375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.4292888045310974, "epoch": 0.04601769911504425, "frac_reward_zero_std": 0.5, "grad_norm": 0.9510533046556038, "kl": 0.0004537611675914377, "learning_rate": 2.2123893805309735e-07, "loss": -0.0008, "num_tokens": 608535.0, "reward": 0.34375, "reward_std": 0.4597553312778473, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5802814960479736, "sampling/importance_sampling_ratio/mean": 1.0002368688583374, "sampling/importance_sampling_ratio/min": 0.6091845035552979, "sampling/sampling_logp_difference/max": 0.4956340789794922, "sampling/sampling_logp_difference/mean": 0.014350948855280876, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 153.484375, "completions/mean_terminated_length": 153.484375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.37147828936576843, "epoch": 0.047787610619469026, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026069699616681308, "kl": 0.0006182813085615635, "learning_rate": 2.3008849557522122e-07, "loss": 0.0, "num_tokens": 628438.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4643288850784302, "sampling/importance_sampling_ratio/mean": 0.999936580657959, "sampling/importance_sampling_ratio/min": 0.6089222431182861, "sampling/sampling_logp_difference/max": 0.4960646629333496, "sampling/sampling_logp_difference/mean": 0.014883618801832199, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 220.875, "completions/mean_terminated_length": 220.875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.4160059094429016, "epoch": 0.049557522123893805, "frac_reward_zero_std": 0.5, "grad_norm": 1.0446907347518073, "kl": 0.0004597888619173318, "learning_rate": 2.3893805309734513e-07, "loss": -0.0312, "num_tokens": 653774.0, "reward": 0.15625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.5278409719467163, "sampling/importance_sampling_ratio/mean": 0.9999281167984009, "sampling/importance_sampling_ratio/min": 0.6271337866783142, "sampling/sampling_logp_difference/max": 0.4665954113006592, "sampling/sampling_logp_difference/mean": 0.014859693124890327, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 130.453125, "completions/mean_terminated_length": 130.453125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.3917607367038727, "epoch": 0.05132743362831858, "frac_reward_zero_std": 0.75, "grad_norm": 1.334381952357411, "kl": 0.0006424708990380168, "learning_rate": 2.47787610619469e-07, "loss": -0.0282, "num_tokens": 672555.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.620834231376648, "sampling/importance_sampling_ratio/mean": 0.9999610185623169, "sampling/importance_sampling_ratio/min": 0.6735600233078003, "sampling/sampling_logp_difference/max": 0.4829409122467041, "sampling/sampling_logp_difference/mean": 0.01759732887148857, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 227.421875, "completions/mean_terminated_length": 227.421875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.49635761976242065, "epoch": 0.05309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 0.5970116235546846, "kl": 0.00038929813308641315, "learning_rate": 2.5663716814159294e-07, "loss": 0.0058, "num_tokens": 699190.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5870181322097778, "sampling/importance_sampling_ratio/mean": 1.0004205703735352, "sampling/importance_sampling_ratio/min": 0.6560810208320618, "sampling/sampling_logp_difference/max": 0.4618568420410156, "sampling/sampling_logp_difference/mean": 0.01632857695221901, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1233.0, "completions/max_terminated_length": 1233.0, "completions/mean_length": 328.40625, "completions/mean_terminated_length": 328.40625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.3582524061203003, "epoch": 0.05486725663716814, "frac_reward_zero_std": 0.5, "grad_norm": 0.8346609834753668, "kl": 0.0003085409989580512, "learning_rate": 2.654867256637168e-07, "loss": 0.0094, "num_tokens": 732208.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6506547927856445, "sampling/importance_sampling_ratio/mean": 1.0000078678131104, "sampling/importance_sampling_ratio/min": 0.5198546648025513, "sampling/sampling_logp_difference/max": 0.6542060375213623, "sampling/sampling_logp_difference/mean": 0.012050572782754898, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 230.21875, "completions/mean_terminated_length": 230.21875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.4924032390117645, "epoch": 0.05663716814159292, "frac_reward_zero_std": 0.5, "grad_norm": 0.9186703210868864, "kl": 0.00048273595166392624, "learning_rate": 2.743362831858407e-07, "loss": 0.0103, "num_tokens": 758846.0, "reward": 0.21875, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.5340979099273682, "sampling/importance_sampling_ratio/mean": 0.9997835159301758, "sampling/importance_sampling_ratio/min": 0.7198682427406311, "sampling/sampling_logp_difference/max": 0.42794251441955566, "sampling/sampling_logp_difference/mean": 0.016245532780885696, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 172.765625, "completions/mean_terminated_length": 172.765625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.46681106090545654, "epoch": 0.0584070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 0.92359418250283, "kl": 0.0004580853274092078, "learning_rate": 2.8318584070796457e-07, "loss": 0.0201, "num_tokens": 781055.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.3864423036575317, "sampling/importance_sampling_ratio/mean": 1.00001859664917, "sampling/importance_sampling_ratio/min": 0.6857555508613586, "sampling/sampling_logp_difference/max": 0.37723398208618164, "sampling/sampling_logp_difference/mean": 0.016717787832021713, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 191.203125, "completions/mean_terminated_length": 191.203125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.4282730221748352, "epoch": 0.06017699115044248, "frac_reward_zero_std": 0.75, "grad_norm": 0.8838626164036724, "kl": 0.0005225211498327553, "learning_rate": 2.920353982300885e-07, "loss": 0.0266, "num_tokens": 804764.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4427217245101929, "sampling/importance_sampling_ratio/mean": 1.0002740621566772, "sampling/importance_sampling_ratio/min": 0.7011700868606567, "sampling/sampling_logp_difference/max": 0.3665313720703125, "sampling/sampling_logp_difference/mean": 0.01491781510412693, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 145.171875, "completions/mean_terminated_length": 145.171875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.36615437269210815, "epoch": 0.061946902654867256, "frac_reward_zero_std": 0.75, "grad_norm": 1.1755523692983083, "kl": 0.0006177417235448956, "learning_rate": 3.008849557522124e-07, "loss": -0.0171, "num_tokens": 827495.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.485703468322754, "sampling/importance_sampling_ratio/mean": 1.0003960132598877, "sampling/importance_sampling_ratio/min": 0.31634464859962463, "sampling/sampling_logp_difference/max": 1.1509230136871338, "sampling/sampling_logp_difference/mean": 0.015536945313215256, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 103.125, "completions/mean_terminated_length": 103.125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.245346337556839, "epoch": 0.06371681415929203, "frac_reward_zero_std": 1.0, "grad_norm": 0.004783989031599875, "kl": 0.0007909094565548003, "learning_rate": 3.0973451327433626e-07, "loss": 0.0, "num_tokens": 843471.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6225351095199585, "sampling/importance_sampling_ratio/mean": 1.0003888607025146, "sampling/importance_sampling_ratio/min": 0.620156466960907, "sampling/sampling_logp_difference/max": 0.4839897155761719, "sampling/sampling_logp_difference/mean": 0.014604487456381321, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 193.90625, "completions/mean_terminated_length": 193.90625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.4967886805534363, "epoch": 0.06548672566371681, "frac_reward_zero_std": 0.75, "grad_norm": 1.105159124561485, "kl": 0.0006542068440467119, "learning_rate": 3.1858407079646014e-07, "loss": -0.0257, "num_tokens": 870345.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4062508344650269, "sampling/importance_sampling_ratio/mean": 0.9996336698532104, "sampling/importance_sampling_ratio/min": 0.5721895098686218, "sampling/sampling_logp_difference/max": 0.5582849979400635, "sampling/sampling_logp_difference/mean": 0.01660425029695034, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 145.34375, "completions/mean_terminated_length": 145.34375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.3922208547592163, "epoch": 0.06725663716814159, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038207811778539903, "kl": 0.000669948582071811, "learning_rate": 3.2743362831858407e-07, "loss": 0.0, "num_tokens": 889455.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.567585825920105, "sampling/importance_sampling_ratio/mean": 1.0002198219299316, "sampling/importance_sampling_ratio/min": 0.7432371973991394, "sampling/sampling_logp_difference/max": 0.4495368003845215, "sampling/sampling_logp_difference/mean": 0.015564528293907642, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 134.453125, "completions/mean_terminated_length": 134.453125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.2876589298248291, "epoch": 0.06902654867256637, "frac_reward_zero_std": 0.75, "grad_norm": 1.0445104539737837, "kl": 0.0006731583271175623, "learning_rate": 3.36283185840708e-07, "loss": 0.0331, "num_tokens": 907452.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.3671373128890991, "sampling/importance_sampling_ratio/mean": 0.9999994039535522, "sampling/importance_sampling_ratio/min": 0.6149933934211731, "sampling/sampling_logp_difference/max": 0.4861437678337097, "sampling/sampling_logp_difference/mean": 0.013353753834962845, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 171.296875, "completions/mean_terminated_length": 171.296875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.3427136242389679, "epoch": 0.07079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 0.9039518340255397, "kl": 0.0006144886719994247, "learning_rate": 3.451327433628318e-07, "loss": -0.0115, "num_tokens": 929663.0, "reward": -0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.3595855236053467, "sampling/importance_sampling_ratio/mean": 1.0000739097595215, "sampling/importance_sampling_ratio/min": 0.6020215153694153, "sampling/sampling_logp_difference/max": 0.5074621438980103, "sampling/sampling_logp_difference/mean": 0.015125567093491554, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 208.9375, "completions/mean_terminated_length": 208.9375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.5422006845474243, "epoch": 0.07256637168141593, "frac_reward_zero_std": 0.5, "grad_norm": 1.214409285618555, "kl": 0.0005240167956799269, "learning_rate": 3.5398230088495575e-07, "loss": -0.0523, "num_tokens": 956411.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.2793256044387817, "sampling/importance_sampling_ratio/mean": 0.9999217391014099, "sampling/importance_sampling_ratio/min": 0.6142622828483582, "sampling/sampling_logp_difference/max": 0.4873332977294922, "sampling/sampling_logp_difference/mean": 0.016808750107884407, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 219.953125, "completions/mean_terminated_length": 219.953125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.490911602973938, "epoch": 0.0743362831858407, "frac_reward_zero_std": 0.5, "grad_norm": 0.975297783397351, "kl": 0.000842518697027117, "learning_rate": 3.6283185840707963e-07, "loss": -0.0121, "num_tokens": 982088.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6122313737869263, "sampling/importance_sampling_ratio/mean": 0.9994571208953857, "sampling/importance_sampling_ratio/min": 0.6460330486297607, "sampling/sampling_logp_difference/max": 0.4776191711425781, "sampling/sampling_logp_difference/mean": 0.017350424081087112, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 198.140625, "completions/mean_terminated_length": 198.140625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.496207058429718, "epoch": 0.07610619469026549, "frac_reward_zero_std": 0.5, "grad_norm": 1.1555111520132804, "kl": 0.0008575224783271551, "learning_rate": 3.7168141592920356e-07, "loss": 0.027, "num_tokens": 1005521.0, "reward": 0.0, "reward_std": 0.4787135720252991, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3447661399841309, "sampling/importance_sampling_ratio/mean": 1.000145673751831, "sampling/importance_sampling_ratio/min": 0.6634519100189209, "sampling/sampling_logp_difference/max": 0.4102989435195923, "sampling/sampling_logp_difference/mean": 0.016248438507318497, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 172.140625, "completions/mean_terminated_length": 172.140625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.39261263608932495, "epoch": 0.07787610619469026, "frac_reward_zero_std": 1.0, "grad_norm": 0.004234071258287348, "kl": 0.0007658154936507344, "learning_rate": 3.805309734513274e-07, "loss": 0.0, "num_tokens": 1028506.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5592234134674072, "sampling/importance_sampling_ratio/mean": 1.0003376007080078, "sampling/importance_sampling_ratio/min": 0.6684509515762329, "sampling/sampling_logp_difference/max": 0.44418787956237793, "sampling/sampling_logp_difference/mean": 0.016881383955478668, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 132.15625, "completions/mean_terminated_length": 132.15625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.28721314668655396, "epoch": 0.07964601769911504, "frac_reward_zero_std": 1.0, "grad_norm": 0.005064502940089037, "kl": 0.0008006412535905838, "learning_rate": 3.893805309734513e-07, "loss": 0.0, "num_tokens": 1046020.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3631056547164917, "sampling/importance_sampling_ratio/mean": 0.9993762969970703, "sampling/importance_sampling_ratio/min": 0.6436959505081177, "sampling/sampling_logp_difference/max": 0.44052886962890625, "sampling/sampling_logp_difference/mean": 0.014048042707145214, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 255.78125, "completions/mean_terminated_length": 255.78125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "entropy": 0.3829498291015625, "epoch": 0.08141592920353982, "frac_reward_zero_std": 0.5, "grad_norm": 1.09197729542159, "kl": 0.0007642432465218008, "learning_rate": 3.982300884955752e-07, "loss": 0.0471, "num_tokens": 1073174.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4337151050567627, "sampling/importance_sampling_ratio/mean": 0.9996473789215088, "sampling/importance_sampling_ratio/min": 0.7803710103034973, "sampling/sampling_logp_difference/max": 0.36026906967163086, "sampling/sampling_logp_difference/mean": 0.013885672204196453, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 208.265625, "completions/mean_terminated_length": 208.265625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.44653117656707764, "epoch": 0.0831858407079646, "frac_reward_zero_std": 0.5, "grad_norm": 1.0658224190385082, "kl": 0.0009269802249036729, "learning_rate": 4.0707964601769913e-07, "loss": 0.0345, "num_tokens": 1097095.0, "reward": -0.34375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": -0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.3222029209136963, "sampling/importance_sampling_ratio/mean": 0.9996322393417358, "sampling/importance_sampling_ratio/min": 0.689879834651947, "sampling/sampling_logp_difference/max": 0.3712378740310669, "sampling/sampling_logp_difference/mean": 0.01496546808630228, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 212.78125, "completions/mean_terminated_length": 212.78125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.37079882621765137, "epoch": 0.08495575221238938, "frac_reward_zero_std": 0.5, "grad_norm": 1.057700328676415, "kl": 0.0008619067957624793, "learning_rate": 4.1592920353982295e-07, "loss": -0.002, "num_tokens": 1120665.0, "reward": 0.875, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.3062375783920288, "sampling/importance_sampling_ratio/mean": 0.9999436140060425, "sampling/importance_sampling_ratio/min": 0.654721736907959, "sampling/sampling_logp_difference/max": 0.4235450029373169, "sampling/sampling_logp_difference/mean": 0.014391312375664711, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 159.546875, "completions/mean_terminated_length": 159.546875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.3836078345775604, "epoch": 0.08672566371681416, "frac_reward_zero_std": 0.75, "grad_norm": 0.9627634721725993, "kl": 0.001014052890241146, "learning_rate": 4.247787610619469e-07, "loss": -0.0447, "num_tokens": 1140876.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.3341999053955078, "sampling/importance_sampling_ratio/mean": 1.000030517578125, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.45119690895080566, "sampling/sampling_logp_difference/mean": 0.015728633850812912, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 162.1875, "completions/mean_terminated_length": 162.1875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.46104708313941956, "epoch": 0.08849557522123894, "frac_reward_zero_std": 0.5, "grad_norm": 1.434499473471128, "kl": 0.0014803860103711486, "learning_rate": 4.3362831858407076e-07, "loss": 0.0053, "num_tokens": 1161736.0, "reward": 0.40625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4056953191757202, "sampling/importance_sampling_ratio/mean": 0.9999998807907104, "sampling/importance_sampling_ratio/min": 0.6213480234146118, "sampling/sampling_logp_difference/max": 0.4758639335632324, "sampling/sampling_logp_difference/mean": 0.016189994290471077, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 178.53125, "completions/mean_terminated_length": 178.53125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3214467167854309, "epoch": 0.09026548672566372, "frac_reward_zero_std": 0.75, "grad_norm": 1.1180849217318127, "kl": 0.001077151857316494, "learning_rate": 4.424778761061947e-07, "loss": 0.1117, "num_tokens": 1183514.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.4481030702590942, "sampling/importance_sampling_ratio/mean": 1.0001425743103027, "sampling/importance_sampling_ratio/min": 0.6622359156608582, "sampling/sampling_logp_difference/max": 0.41213345527648926, "sampling/sampling_logp_difference/mean": 0.013998802751302719, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 168.21875, "completions/mean_terminated_length": 168.21875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.4382091164588928, "epoch": 0.0920353982300885, "frac_reward_zero_std": 1.0, "grad_norm": 0.0049643263246611285, "kl": 0.0008880996610969305, "learning_rate": 4.5132743362831857e-07, "loss": 0.0, "num_tokens": 1208600.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4654942750930786, "sampling/importance_sampling_ratio/mean": 0.9995968341827393, "sampling/importance_sampling_ratio/min": 0.4864470064640045, "sampling/sampling_logp_difference/max": 0.7206273078918457, "sampling/sampling_logp_difference/mean": 0.015631159767508507, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 224.921875, "completions/mean_terminated_length": 224.921875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.5588756799697876, "epoch": 0.09380530973451327, "frac_reward_zero_std": 0.75, "grad_norm": 0.7168121498298733, "kl": 0.0012564393691718578, "learning_rate": 4.6017699115044245e-07, "loss": 0.0129, "num_tokens": 1235331.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.325711727142334, "sampling/importance_sampling_ratio/mean": 1.0006237030029297, "sampling/importance_sampling_ratio/min": 0.6163890361785889, "sampling/sampling_logp_difference/max": 0.48387694358825684, "sampling/sampling_logp_difference/mean": 0.017355667427182198, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 217.265625, "completions/mean_terminated_length": 217.265625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.44299280643463135, "epoch": 0.09557522123893805, "frac_reward_zero_std": 0.5, "grad_norm": 1.1516076883842818, "kl": 0.0011872118338942528, "learning_rate": 4.690265486725664e-07, "loss": -0.0125, "num_tokens": 1260852.0, "reward": 0.0625, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.3174514770507812, "sampling/importance_sampling_ratio/mean": 1.0000274181365967, "sampling/importance_sampling_ratio/min": 0.6112026572227478, "sampling/sampling_logp_difference/max": 0.4923267364501953, "sampling/sampling_logp_difference/mean": 0.015816478058695793, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 151.8125, "completions/mean_terminated_length": 151.8125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.34421470761299133, "epoch": 0.09734513274336283, "frac_reward_zero_std": 1.0, "grad_norm": 0.006991936916443441, "kl": 0.001308807055465877, "learning_rate": 4.778761061946903e-07, "loss": 0.0, "num_tokens": 1281288.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2738009691238403, "sampling/importance_sampling_ratio/mean": 0.999666690826416, "sampling/importance_sampling_ratio/min": 0.6147581934928894, "sampling/sampling_logp_difference/max": 0.4865262508392334, "sampling/sampling_logp_difference/mean": 0.014791518449783325, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 190.765625, "completions/mean_terminated_length": 190.765625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.38345327973365784, "epoch": 0.09911504424778761, "frac_reward_zero_std": 0.5, "grad_norm": 1.2417819990578034, "kl": 0.003148602321743965, "learning_rate": 4.867256637168141e-07, "loss": 0.0063, "num_tokens": 1306313.0, "reward": 0.4375, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4671752452850342, "sampling/importance_sampling_ratio/mean": 0.9994722604751587, "sampling/importance_sampling_ratio/min": 0.5983425974845886, "sampling/sampling_logp_difference/max": 0.5135917663574219, "sampling/sampling_logp_difference/mean": 0.015144657343626022, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 149.453125, "completions/mean_terminated_length": 149.453125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.3478190302848816, "epoch": 0.10088495575221239, "frac_reward_zero_std": 0.75, "grad_norm": 1.1688785025902486, "kl": 0.0018540058517828584, "learning_rate": 4.95575221238938e-07, "loss": 0.0011, "num_tokens": 1325606.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.333213448524475, "sampling/importance_sampling_ratio/mean": 1.000150203704834, "sampling/importance_sampling_ratio/min": 0.6299831867218018, "sampling/sampling_logp_difference/max": 0.46206212043762207, "sampling/sampling_logp_difference/mean": 0.014898328110575676, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 189.328125, "completions/mean_terminated_length": 189.328125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.29512307047843933, "epoch": 0.10265486725663717, "frac_reward_zero_std": 0.75, "grad_norm": 0.969598367643197, "kl": 0.0023700199089944363, "learning_rate": 5.044247787610619e-07, "loss": -0.0002, "num_tokens": 1348667.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.5592995882034302, "sampling/importance_sampling_ratio/mean": 1.0008137226104736, "sampling/importance_sampling_ratio/min": 0.609809935092926, "sampling/sampling_logp_difference/max": 0.49460792541503906, "sampling/sampling_logp_difference/mean": 0.013864312320947647, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 245.796875, "completions/mean_terminated_length": 245.796875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.6066912412643433, "epoch": 0.10442477876106195, "frac_reward_zero_std": 0.5, "grad_norm": 0.889969071338536, "kl": 0.004439334850758314, "learning_rate": 5.132743362831859e-07, "loss": -0.0074, "num_tokens": 1378366.0, "reward": 0.75, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.6305538415908813, "sampling/importance_sampling_ratio/mean": 1.0001251697540283, "sampling/importance_sampling_ratio/min": 0.6057105660438538, "sampling/sampling_logp_difference/max": 0.5013530254364014, "sampling/sampling_logp_difference/mean": 0.018475841730833054, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 290.875, "completions/mean_terminated_length": 290.875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.44423145055770874, "epoch": 0.10619469026548672, "frac_reward_zero_std": 0.25, "grad_norm": 1.1032104959320381, "kl": 0.0033573941327631474, "learning_rate": 5.221238938053097e-07, "loss": 0.0037, "num_tokens": 1407718.0, "reward": 0.0625, "reward_std": 0.6707825064659119, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.2653532028198242, "sampling/importance_sampling_ratio/mean": 0.9998553991317749, "sampling/importance_sampling_ratio/min": 0.6232948303222656, "sampling/sampling_logp_difference/max": 0.4727356433868408, "sampling/sampling_logp_difference/mean": 0.01365312747657299, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 223.703125, "completions/mean_terminated_length": 223.703125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.4510975182056427, "epoch": 0.1079646017699115, "frac_reward_zero_std": 0.25, "grad_norm": 1.3626532503704003, "kl": 0.009123655967414379, "learning_rate": 5.309734513274336e-07, "loss": 0.0482, "num_tokens": 1435811.0, "reward": 0.25, "reward_std": 0.7191373109817505, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.5612155199050903, "sampling/importance_sampling_ratio/mean": 0.9997560381889343, "sampling/importance_sampling_ratio/min": 0.6180218458175659, "sampling/sampling_logp_difference/max": 0.4812314510345459, "sampling/sampling_logp_difference/mean": 0.016338439658284187, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 106.75, "completions/mean_terminated_length": 106.75, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.3070604205131531, "epoch": 0.10973451327433628, "frac_reward_zero_std": 1.0, "grad_norm": 0.020025897738762826, "kl": 0.004569772630929947, "learning_rate": 5.398230088495575e-07, "loss": 0.0001, "num_tokens": 1452611.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4657670259475708, "sampling/importance_sampling_ratio/mean": 1.000557541847229, "sampling/importance_sampling_ratio/min": 0.6666322946548462, "sampling/sampling_logp_difference/max": 0.4055166244506836, "sampling/sampling_logp_difference/mean": 0.01649780571460724, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 188.5, "completions/mean_terminated_length": 188.5, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.3099520206451416, "epoch": 0.11150442477876106, "frac_reward_zero_std": 0.75, "grad_norm": 0.6726922824460149, "kl": 0.0026066922582685947, "learning_rate": 5.486725663716814e-07, "loss": 0.0253, "num_tokens": 1475731.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.5191065073013306, "sampling/importance_sampling_ratio/mean": 1.000030517578125, "sampling/importance_sampling_ratio/min": 0.6821218729019165, "sampling/sampling_logp_difference/max": 0.4181222915649414, "sampling/sampling_logp_difference/mean": 0.012346604838967323, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 208.984375, "completions/mean_terminated_length": 208.984375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.45609599351882935, "epoch": 0.11327433628318584, "frac_reward_zero_std": 0.5, "grad_norm": 1.3096243389823579, "kl": 0.011606791988015175, "learning_rate": 5.575221238938052e-07, "loss": 0.0161, "num_tokens": 1501154.0, "reward": 0.53125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4819767475128174, "sampling/importance_sampling_ratio/mean": 1.0002541542053223, "sampling/importance_sampling_ratio/min": 0.6301558017730713, "sampling/sampling_logp_difference/max": 0.4617881774902344, "sampling/sampling_logp_difference/mean": 0.015784818679094315, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 161.09375, "completions/mean_terminated_length": 161.09375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.3594406843185425, "epoch": 0.11504424778761062, "frac_reward_zero_std": 0.75, "grad_norm": 1.1443597720195766, "kl": 0.00754031864926219, "learning_rate": 5.663716814159291e-07, "loss": 0.0308, "num_tokens": 1521544.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4212729930877686, "sampling/importance_sampling_ratio/mean": 0.9998541474342346, "sampling/importance_sampling_ratio/min": 0.6153431534767151, "sampling/sampling_logp_difference/max": 0.48557519912719727, "sampling/sampling_logp_difference/mean": 0.014111566357314587, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 164.0, "completions/max_terminated_length": 164.0, "completions/mean_length": 101.125, "completions/mean_terminated_length": 101.125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.24357709288597107, "epoch": 0.1168141592920354, "frac_reward_zero_std": 1.0, "grad_norm": 0.024629826609836886, "kl": 0.0035986611619591713, "learning_rate": 5.752212389380531e-07, "loss": 0.0, "num_tokens": 1537696.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.377199649810791, "sampling/importance_sampling_ratio/mean": 1.0006064176559448, "sampling/importance_sampling_ratio/min": 0.637252151966095, "sampling/sampling_logp_difference/max": 0.4505898952484131, "sampling/sampling_logp_difference/mean": 0.013590476475656033, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 178.5625, "completions/mean_terminated_length": 178.5625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.3457975387573242, "epoch": 0.11858407079646018, "frac_reward_zero_std": 0.75, "grad_norm": 0.8047084844593525, "kl": 0.018204988911747932, "learning_rate": 5.84070796460177e-07, "loss": -0.0068, "num_tokens": 1558852.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.29731285572052, "sampling/importance_sampling_ratio/mean": 0.9993517398834229, "sampling/importance_sampling_ratio/min": 0.695947527885437, "sampling/sampling_logp_difference/max": 0.3624809980392456, "sampling/sampling_logp_difference/mean": 0.013106441125273705, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 194.1875, "completions/mean_terminated_length": 194.1875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.39951688051223755, "epoch": 0.12035398230088495, "frac_reward_zero_std": 0.5, "grad_norm": 1.2475307094295822, "kl": 0.00963111873716116, "learning_rate": 5.929203539823009e-07, "loss": -0.0971, "num_tokens": 1580976.0, "reward": 0.375, "reward_std": 0.4577302038669586, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.4656713008880615, "sampling/importance_sampling_ratio/mean": 0.9998877644538879, "sampling/importance_sampling_ratio/min": 0.6100970506668091, "sampling/sampling_logp_difference/max": 0.4941372871398926, "sampling/sampling_logp_difference/mean": 0.014960943721234798, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 187.609375, "completions/mean_terminated_length": 187.609375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.3969162106513977, "epoch": 0.12212389380530973, "frac_reward_zero_std": 1.0, "grad_norm": 0.026519157980291592, "kl": 0.011400602757930756, "learning_rate": 6.017699115044248e-07, "loss": 0.0001, "num_tokens": 1603575.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6854522228240967, "sampling/importance_sampling_ratio/mean": 1.000511884689331, "sampling/importance_sampling_ratio/min": 0.6474390625953674, "sampling/sampling_logp_difference/max": 0.5220339298248291, "sampling/sampling_logp_difference/mean": 0.015393278561532497, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 177.546875, "completions/mean_terminated_length": 177.546875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3258362412452698, "epoch": 0.12389380530973451, "frac_reward_zero_std": 0.75, "grad_norm": 0.9490586376721595, "kl": 0.0063695767894387245, "learning_rate": 6.106194690265486e-07, "loss": 0.0197, "num_tokens": 1626794.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.853153944015503, "sampling/importance_sampling_ratio/mean": 1.000276803970337, "sampling/importance_sampling_ratio/min": 0.5364962816238403, "sampling/sampling_logp_difference/max": 0.6226956844329834, "sampling/sampling_logp_difference/mean": 0.014140763320028782, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 157.625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.3936723470687866, "epoch": 0.1256637168141593, "frac_reward_zero_std": 0.5, "grad_norm": 1.619913369316984, "kl": 0.013661106117069721, "learning_rate": 6.194690265486725e-07, "loss": -0.04, "num_tokens": 1647106.0, "reward": 0.40625, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.536644697189331, "sampling/importance_sampling_ratio/mean": 1.000814437866211, "sampling/importance_sampling_ratio/min": 0.6920749545097351, "sampling/sampling_logp_difference/max": 0.42960119247436523, "sampling/sampling_logp_difference/mean": 0.016903996467590332, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 155.953125, "completions/mean_terminated_length": 155.953125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.38842615485191345, "epoch": 0.12743362831858407, "frac_reward_zero_std": 0.75, "grad_norm": 0.9952886581157503, "kl": 0.011372420005500317, "learning_rate": 6.283185840707964e-07, "loss": -0.0232, "num_tokens": 1667471.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.597475528717041, "sampling/importance_sampling_ratio/mean": 1.0003933906555176, "sampling/importance_sampling_ratio/min": 0.6933631300926208, "sampling/sampling_logp_difference/max": 0.46842455863952637, "sampling/sampling_logp_difference/mean": 0.015585072338581085, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 118.015625, "completions/mean_terminated_length": 118.015625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.254025399684906, "epoch": 0.12920353982300886, "frac_reward_zero_std": 1.0, "grad_norm": 0.04284108299035032, "kl": 0.01039934903383255, "learning_rate": 6.371681415929203e-07, "loss": 0.0001, "num_tokens": 1684752.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.504378080368042, "sampling/importance_sampling_ratio/mean": 0.9999960064888, "sampling/importance_sampling_ratio/min": 0.649052083492279, "sampling/sampling_logp_difference/max": 0.4322422742843628, "sampling/sampling_logp_difference/mean": 0.013781622983515263, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 201.21875, "completions/mean_terminated_length": 201.21875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.3895375430583954, "epoch": 0.13097345132743363, "frac_reward_zero_std": 0.75, "grad_norm": 0.8645019235320143, "kl": 0.01647133380174637, "learning_rate": 6.460176991150442e-07, "loss": -0.0116, "num_tokens": 1707614.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6088998317718506, "sampling/importance_sampling_ratio/mean": 1.0000102519989014, "sampling/importance_sampling_ratio/min": 0.7015503644943237, "sampling/sampling_logp_difference/max": 0.47555065155029297, "sampling/sampling_logp_difference/mean": 0.014648362062871456, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 218.59375, "completions/mean_terminated_length": 218.59375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.4297330379486084, "epoch": 0.13274336283185842, "frac_reward_zero_std": 0.5, "grad_norm": 0.9546869949633048, "kl": 0.015366164967417717, "learning_rate": 6.548672566371681e-07, "loss": 0.0099, "num_tokens": 1732084.0, "reward": 0.375, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.492048978805542, "sampling/importance_sampling_ratio/mean": 1.0000102519989014, "sampling/importance_sampling_ratio/min": 0.695496678352356, "sampling/sampling_logp_difference/max": 0.4001502990722656, "sampling/sampling_logp_difference/mean": 0.015646368265151978, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 133.046875, "completions/mean_terminated_length": 133.046875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.32887911796569824, "epoch": 0.13451327433628318, "frac_reward_zero_std": 1.0, "grad_norm": 0.02094079875903086, "kl": 0.009953726083040237, "learning_rate": 6.637168141592921e-07, "loss": 0.0001, "num_tokens": 1751687.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5874511003494263, "sampling/importance_sampling_ratio/mean": 0.9999567270278931, "sampling/importance_sampling_ratio/min": 0.7372974753379822, "sampling/sampling_logp_difference/max": 0.4621295928955078, "sampling/sampling_logp_difference/mean": 0.0152928177267313, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 146.71875, "completions/mean_terminated_length": 146.71875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.3579290211200714, "epoch": 0.13628318584070798, "frac_reward_zero_std": 0.75, "grad_norm": 0.97467148237027, "kl": 0.012682763859629631, "learning_rate": 6.72566371681416e-07, "loss": -0.0063, "num_tokens": 1770389.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.6267775297164917, "sampling/importance_sampling_ratio/mean": 1.0003312826156616, "sampling/importance_sampling_ratio/min": 0.6440848112106323, "sampling/sampling_logp_difference/max": 0.4866011142730713, "sampling/sampling_logp_difference/mean": 0.015002220869064331, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 145.59375, "completions/mean_terminated_length": 145.59375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.3425063192844391, "epoch": 0.13805309734513274, "frac_reward_zero_std": 0.75, "grad_norm": 1.1006363356593576, "kl": 0.0066282302141189575, "learning_rate": 6.814159292035397e-07, "loss": 0.0246, "num_tokens": 1794123.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5020347833633423, "sampling/importance_sampling_ratio/mean": 0.9996710419654846, "sampling/importance_sampling_ratio/min": 0.5523310303688049, "sampling/sampling_logp_difference/max": 0.5936076641082764, "sampling/sampling_logp_difference/mean": 0.014506646431982517, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 155.078125, "completions/mean_terminated_length": 155.078125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.2749665379524231, "epoch": 0.13982300884955753, "frac_reward_zero_std": 1.0, "grad_norm": 0.017866335754167833, "kl": 0.006836398039013147, "learning_rate": 6.902654867256636e-07, "loss": 0.0001, "num_tokens": 1814176.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4780030250549316, "sampling/importance_sampling_ratio/mean": 1.000427484512329, "sampling/importance_sampling_ratio/min": 0.7656050324440002, "sampling/sampling_logp_difference/max": 0.390691876411438, "sampling/sampling_logp_difference/mean": 0.012600925751030445, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 202.546875, "completions/mean_terminated_length": 202.546875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.5037556290626526, "epoch": 0.1415929203539823, "frac_reward_zero_std": 0.25, "grad_norm": 1.3805809279315995, "kl": 0.020283661782741547, "learning_rate": 6.991150442477876e-07, "loss": 0.0377, "num_tokens": 1839651.0, "reward": 0.59375, "reward_std": 0.5827301740646362, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4834307432174683, "sampling/importance_sampling_ratio/mean": 0.9995615482330322, "sampling/importance_sampling_ratio/min": 0.7031538486480713, "sampling/sampling_logp_difference/max": 0.39435744285583496, "sampling/sampling_logp_difference/mean": 0.017124194651842117, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 186.859375, "completions/mean_terminated_length": 186.859375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.37872523069381714, "epoch": 0.1433628318584071, "frac_reward_zero_std": 0.75, "grad_norm": 0.7996257439925782, "kl": 0.017041007056832314, "learning_rate": 7.079646017699115e-07, "loss": -0.0076, "num_tokens": 1862890.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.363723635673523, "sampling/importance_sampling_ratio/mean": 0.9994292259216309, "sampling/importance_sampling_ratio/min": 0.63686603307724, "sampling/sampling_logp_difference/max": 0.45119595527648926, "sampling/sampling_logp_difference/mean": 0.014645487070083618, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 142.71875, "completions/mean_terminated_length": 142.71875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.4198603928089142, "epoch": 0.14513274336283186, "frac_reward_zero_std": 0.75, "grad_norm": 1.0840679594058147, "kl": 0.007501678541302681, "learning_rate": 7.168141592920353e-07, "loss": 0.0294, "num_tokens": 1884456.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.564056158065796, "sampling/importance_sampling_ratio/mean": 1.0002379417419434, "sampling/importance_sampling_ratio/min": 0.6243125796318054, "sampling/sampling_logp_difference/max": 0.47110414505004883, "sampling/sampling_logp_difference/mean": 0.016645008698105812, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 161.421875, "completions/mean_terminated_length": 161.421875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.39928942918777466, "epoch": 0.14690265486725665, "frac_reward_zero_std": 0.5, "grad_norm": 1.5249126492123801, "kl": 0.008676768280565739, "learning_rate": 7.256637168141593e-07, "loss": 0.0494, "num_tokens": 1906963.0, "reward": 0.59375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.620729923248291, "sampling/importance_sampling_ratio/mean": 1.0003002882003784, "sampling/importance_sampling_ratio/min": 0.7047671675682068, "sampling/sampling_logp_difference/max": 0.4828765392303467, "sampling/sampling_logp_difference/mean": 0.016519783064723015, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 163.1875, "completions/mean_terminated_length": 163.1875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.4313996434211731, "epoch": 0.1486725663716814, "frac_reward_zero_std": 0.75, "grad_norm": 1.028906812237053, "kl": 0.01829027757048607, "learning_rate": 7.345132743362832e-07, "loss": -0.0102, "num_tokens": 1928543.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4684945344924927, "sampling/importance_sampling_ratio/mean": 0.9997948408126831, "sampling/importance_sampling_ratio/min": 0.685234010219574, "sampling/sampling_logp_difference/max": 0.38423776626586914, "sampling/sampling_logp_difference/mean": 0.016351807862520218, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 211.78125, "completions/mean_terminated_length": 211.78125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.4603821635246277, "epoch": 0.1504424778761062, "frac_reward_zero_std": 0.5, "grad_norm": 1.0361716117603827, "kl": 0.022909455001354218, "learning_rate": 7.433628318584071e-07, "loss": 0.0139, "num_tokens": 1952721.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.2994089126586914, "sampling/importance_sampling_ratio/mean": 0.9995492696762085, "sampling/importance_sampling_ratio/min": 0.6153885126113892, "sampling/sampling_logp_difference/max": 0.4855015277862549, "sampling/sampling_logp_difference/mean": 0.01490258239209652, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 125.78125, "completions/mean_terminated_length": 125.78125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.2793322503566742, "epoch": 0.15221238938053097, "frac_reward_zero_std": 1.0, "grad_norm": 0.02465176853858683, "kl": 0.006941329222172499, "learning_rate": 7.522123893805308e-07, "loss": 0.0001, "num_tokens": 1970899.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.367046594619751, "sampling/importance_sampling_ratio/mean": 0.9998903870582581, "sampling/importance_sampling_ratio/min": 0.6791725754737854, "sampling/sampling_logp_difference/max": 0.3868800401687622, "sampling/sampling_logp_difference/mean": 0.013056832365691662, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 139.671875, "completions/mean_terminated_length": 139.671875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.32963648438453674, "epoch": 0.15398230088495576, "frac_reward_zero_std": 0.75, "grad_norm": 0.9408148001914002, "kl": 0.011813381686806679, "learning_rate": 7.610619469026548e-07, "loss": -0.0057, "num_tokens": 1989662.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3227317333221436, "sampling/importance_sampling_ratio/mean": 0.9998810291290283, "sampling/importance_sampling_ratio/min": 0.6637370586395264, "sampling/sampling_logp_difference/max": 0.4098691940307617, "sampling/sampling_logp_difference/mean": 0.013913685455918312, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 185.25, "completions/mean_terminated_length": 185.25, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.40117347240448, "epoch": 0.15575221238938053, "frac_reward_zero_std": 0.75, "grad_norm": 1.03732217114201, "kl": 0.0204781461507082, "learning_rate": 7.699115044247787e-07, "loss": 0.0381, "num_tokens": 2012414.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.3919247388839722, "sampling/importance_sampling_ratio/mean": 0.9995261430740356, "sampling/importance_sampling_ratio/min": 0.6509130597114563, "sampling/sampling_logp_difference/max": 0.4293792247772217, "sampling/sampling_logp_difference/mean": 0.016024772077798843, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 128.25, "completions/mean_terminated_length": 128.25, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.34240853786468506, "epoch": 0.15752212389380532, "frac_reward_zero_std": 0.75, "grad_norm": 1.107699474176166, "kl": 0.01158861257135868, "learning_rate": 7.787610619469026e-07, "loss": -0.0249, "num_tokens": 2030750.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.3654366731643677, "sampling/importance_sampling_ratio/mean": 1.000300407409668, "sampling/importance_sampling_ratio/min": 0.6950644850730896, "sampling/sampling_logp_difference/max": 0.3637505769729614, "sampling/sampling_logp_difference/mean": 0.015214118175208569, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 156.1875, "completions/mean_terminated_length": 156.1875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.3586588203907013, "epoch": 0.1592920353982301, "frac_reward_zero_std": 0.75, "grad_norm": 1.050407924866668, "kl": 0.012983415275812149, "learning_rate": 7.876106194690266e-07, "loss": 0.0361, "num_tokens": 2054442.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.5333000421524048, "sampling/importance_sampling_ratio/mean": 0.9996432662010193, "sampling/importance_sampling_ratio/min": 0.7462313771247864, "sampling/sampling_logp_difference/max": 0.42742228507995605, "sampling/sampling_logp_difference/mean": 0.01414406206458807, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 175.96875, "completions/mean_terminated_length": 175.96875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.5748320817947388, "epoch": 0.16106194690265488, "frac_reward_zero_std": 0.25, "grad_norm": 1.5838592905412583, "kl": 0.03240203857421875, "learning_rate": 7.964601769911504e-07, "loss": 0.039, "num_tokens": 2082136.0, "reward": 0.25, "reward_std": 0.6285127401351929, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.5486781597137451, "sampling/importance_sampling_ratio/mean": 1.0000864267349243, "sampling/importance_sampling_ratio/min": 0.6489058136940002, "sampling/sampling_logp_difference/max": 0.43740177154541016, "sampling/sampling_logp_difference/mean": 0.01843888685107231, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 165.75, "completions/mean_terminated_length": 165.75, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.4375103712081909, "epoch": 0.16283185840707964, "frac_reward_zero_std": 0.75, "grad_norm": 0.9584750125833097, "kl": 0.019327480345964432, "learning_rate": 8.053097345132743e-07, "loss": -0.0028, "num_tokens": 2106104.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4734406471252441, "sampling/importance_sampling_ratio/mean": 0.9997900724411011, "sampling/importance_sampling_ratio/min": 0.6482194662094116, "sampling/sampling_logp_difference/max": 0.4335259199142456, "sampling/sampling_logp_difference/mean": 0.01644751988351345, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 144.203125, "completions/mean_terminated_length": 144.203125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.3018205165863037, "epoch": 0.16460176991150444, "frac_reward_zero_std": 1.0, "grad_norm": 0.03494572984307722, "kl": 0.010509281419217587, "learning_rate": 8.141592920353983e-07, "loss": 0.0001, "num_tokens": 2126389.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.394669771194458, "sampling/importance_sampling_ratio/mean": 0.9995672702789307, "sampling/importance_sampling_ratio/min": 0.611591100692749, "sampling/sampling_logp_difference/max": 0.49169135093688965, "sampling/sampling_logp_difference/mean": 0.014270318672060966, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 145.765625, "completions/mean_terminated_length": 145.765625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.348491370677948, "epoch": 0.1663716814159292, "frac_reward_zero_std": 0.75, "grad_norm": 1.2147250909188214, "kl": 0.013375984504818916, "learning_rate": 8.230088495575221e-07, "loss": -0.0192, "num_tokens": 2145574.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.6011326313018799, "sampling/importance_sampling_ratio/mean": 1.0004305839538574, "sampling/importance_sampling_ratio/min": 0.6399555802345276, "sampling/sampling_logp_difference/max": 0.47071123123168945, "sampling/sampling_logp_difference/mean": 0.013981970958411694, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 142.953125, "completions/mean_terminated_length": 142.953125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.5088372230529785, "epoch": 0.168141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 1.067385432800198, "kl": 0.031087420880794525, "learning_rate": 8.318584070796459e-07, "loss": -0.0101, "num_tokens": 2166899.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5077775716781616, "sampling/importance_sampling_ratio/mean": 1.0007333755493164, "sampling/importance_sampling_ratio/min": 0.6623175740242004, "sampling/sampling_logp_difference/max": 0.4120100736618042, "sampling/sampling_logp_difference/mean": 0.018186533823609352, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 133.59375, "completions/mean_terminated_length": 133.59375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.30815327167510986, "epoch": 0.16991150442477876, "frac_reward_zero_std": 1.0, "grad_norm": 0.027201587791542785, "kl": 0.00946621410548687, "learning_rate": 8.407079646017698e-07, "loss": 0.0001, "num_tokens": 2184889.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6088849306106567, "sampling/importance_sampling_ratio/mean": 1.0007243156433105, "sampling/importance_sampling_ratio/min": 0.7547110915184021, "sampling/sampling_logp_difference/max": 0.475541353225708, "sampling/sampling_logp_difference/mean": 0.013283636420965195, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 129.203125, "completions/mean_terminated_length": 129.203125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.32512176036834717, "epoch": 0.17168141592920355, "frac_reward_zero_std": 1.0, "grad_norm": 0.04247265994389906, "kl": 0.0198502354323864, "learning_rate": 8.495575221238938e-07, "loss": 0.0002, "num_tokens": 2202150.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4340442419052124, "sampling/importance_sampling_ratio/mean": 1.0001094341278076, "sampling/importance_sampling_ratio/min": 0.6144447922706604, "sampling/sampling_logp_difference/max": 0.48703622817993164, "sampling/sampling_logp_difference/mean": 0.01574341207742691, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 183.109375, "completions/mean_terminated_length": 183.109375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.614513635635376, "epoch": 0.17345132743362832, "frac_reward_zero_std": 0.5, "grad_norm": 1.3652957419468377, "kl": 0.042070478200912476, "learning_rate": 8.584070796460177e-07, "loss": -0.0243, "num_tokens": 2226765.0, "reward": 0.40625, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.3663080930709839, "sampling/importance_sampling_ratio/mean": 1.0011334419250488, "sampling/importance_sampling_ratio/min": 0.6851910948753357, "sampling/sampling_logp_difference/max": 0.37805747985839844, "sampling/sampling_logp_difference/mean": 0.018913637846708298, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 118.1875, "completions/mean_terminated_length": 118.1875, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.4186111390590668, "epoch": 0.1752212389380531, "frac_reward_zero_std": 0.75, "grad_norm": 1.3455830789305978, "kl": 0.037338901311159134, "learning_rate": 8.672566371681415e-07, "loss": -0.0109, "num_tokens": 2246089.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4058585166931152, "sampling/importance_sampling_ratio/mean": 0.999924898147583, "sampling/importance_sampling_ratio/min": 0.5805544257164001, "sampling/sampling_logp_difference/max": 0.5437717437744141, "sampling/sampling_logp_difference/mean": 0.01750396192073822, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 144.515625, "completions/mean_terminated_length": 144.515625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.3664934039115906, "epoch": 0.17699115044247787, "frac_reward_zero_std": 1.0, "grad_norm": 0.04605670211334364, "kl": 0.02497912012040615, "learning_rate": 8.761061946902655e-07, "loss": 0.0003, "num_tokens": 2270730.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4137628078460693, "sampling/importance_sampling_ratio/mean": 0.9990532398223877, "sampling/importance_sampling_ratio/min": 0.6056217551231384, "sampling/sampling_logp_difference/max": 0.5014996528625488, "sampling/sampling_logp_difference/mean": 0.016384344547986984, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 209.09375, "completions/mean_terminated_length": 209.09375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.5220569968223572, "epoch": 0.17876106194690267, "frac_reward_zero_std": 0.5, "grad_norm": 1.2159113837907307, "kl": 0.03031090274453163, "learning_rate": 8.849557522123894e-07, "loss": -0.0172, "num_tokens": 2295472.0, "reward": 0.53125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.475376844406128, "sampling/importance_sampling_ratio/mean": 0.9996536374092102, "sampling/importance_sampling_ratio/min": 0.6090261936187744, "sampling/sampling_logp_difference/max": 0.4958939552307129, "sampling/sampling_logp_difference/mean": 0.016138719394803047, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 140.734375, "completions/mean_terminated_length": 140.734375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.4087064266204834, "epoch": 0.18053097345132743, "frac_reward_zero_std": 1.0, "grad_norm": 0.037537008599160135, "kl": 0.029635798186063766, "learning_rate": 8.938053097345132e-07, "loss": 0.0003, "num_tokens": 2313967.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3540775775909424, "sampling/importance_sampling_ratio/mean": 0.9989540576934814, "sampling/importance_sampling_ratio/min": 0.6092417240142822, "sampling/sampling_logp_difference/max": 0.49554014205932617, "sampling/sampling_logp_difference/mean": 0.01638820394873619, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 128.625, "completions/mean_terminated_length": 128.625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.33473706245422363, "epoch": 0.18230088495575222, "frac_reward_zero_std": 0.75, "grad_norm": 1.2249577525011914, "kl": 0.014623328112065792, "learning_rate": 9.026548672566371e-07, "loss": -0.0065, "num_tokens": 2332871.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5468659400939941, "sampling/importance_sampling_ratio/mean": 1.0005148649215698, "sampling/importance_sampling_ratio/min": 0.6942964196205139, "sampling/sampling_logp_difference/max": 0.4362308979034424, "sampling/sampling_logp_difference/mean": 0.015089668333530426, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 140.796875, "completions/mean_terminated_length": 140.796875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.4268108010292053, "epoch": 0.184070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 1.2764446322401908, "kl": 0.015488414093852043, "learning_rate": 9.11504424778761e-07, "loss": -0.0196, "num_tokens": 2351674.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.5834494829177856, "sampling/importance_sampling_ratio/mean": 1.0005009174346924, "sampling/importance_sampling_ratio/min": 0.663982093334198, "sampling/sampling_logp_difference/max": 0.45960569381713867, "sampling/sampling_logp_difference/mean": 0.01634242758154869, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 174.4375, "completions/mean_terminated_length": 174.4375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.507774829864502, "epoch": 0.18584070796460178, "frac_reward_zero_std": 0.5, "grad_norm": 1.3069282947095495, "kl": 0.03400026261806488, "learning_rate": 9.203539823008849e-07, "loss": -0.0386, "num_tokens": 2377846.0, "reward": 0.15625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.294175148010254, "sampling/importance_sampling_ratio/mean": 0.99942946434021, "sampling/importance_sampling_ratio/min": 0.6771621704101562, "sampling/sampling_logp_difference/max": 0.3898444175720215, "sampling/sampling_logp_difference/mean": 0.016538560390472412, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 160.34375, "completions/mean_terminated_length": 160.34375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.5758728384971619, "epoch": 0.18761061946902655, "frac_reward_zero_std": 0.25, "grad_norm": 1.665599068136189, "kl": 0.026462240144610405, "learning_rate": 9.292035398230088e-07, "loss": -0.0373, "num_tokens": 2401244.0, "reward": 0.53125, "reward_std": 0.6331988573074341, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3962644338607788, "sampling/importance_sampling_ratio/mean": 0.9994282126426697, "sampling/importance_sampling_ratio/min": 0.6452757120132446, "sampling/sampling_logp_difference/max": 0.4380776882171631, "sampling/sampling_logp_difference/mean": 0.019216172397136688, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 138.71875, "completions/mean_terminated_length": 138.71875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.40617284178733826, "epoch": 0.18938053097345134, "frac_reward_zero_std": 0.5, "grad_norm": 1.5116903926994538, "kl": 0.015507981181144714, "learning_rate": 9.380530973451328e-07, "loss": 0.0271, "num_tokens": 2421866.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4085325002670288, "sampling/importance_sampling_ratio/mean": 0.9999234080314636, "sampling/importance_sampling_ratio/min": 0.7436591982841492, "sampling/sampling_logp_difference/max": 0.3425483703613281, "sampling/sampling_logp_difference/mean": 0.0145273357629776, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 155.375, "completions/mean_terminated_length": 155.375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.4087454676628113, "epoch": 0.1911504424778761, "frac_reward_zero_std": 1.0, "grad_norm": 0.454546734150439, "kl": 0.05462752655148506, "learning_rate": 9.469026548672566e-07, "loss": 0.0008, "num_tokens": 2442706.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4575304985046387, "sampling/importance_sampling_ratio/mean": 0.9992321729660034, "sampling/importance_sampling_ratio/min": 0.6291640996932983, "sampling/sampling_logp_difference/max": 0.4633631706237793, "sampling/sampling_logp_difference/mean": 0.01630711928009987, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 153.5625, "completions/mean_terminated_length": 153.5625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.4511426091194153, "epoch": 0.1929203539823009, "frac_reward_zero_std": 1.0, "grad_norm": 0.024483318744972813, "kl": 0.011083774268627167, "learning_rate": 9.557522123893805e-07, "loss": 0.0001, "num_tokens": 2463062.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3670463562011719, "sampling/importance_sampling_ratio/mean": 0.9995465874671936, "sampling/importance_sampling_ratio/min": 0.6154676079750061, "sampling/sampling_logp_difference/max": 0.48537302017211914, "sampling/sampling_logp_difference/mean": 0.017112568020820618, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 139.921875, "completions/mean_terminated_length": 139.921875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.4308427572250366, "epoch": 0.19469026548672566, "frac_reward_zero_std": 0.5, "grad_norm": 1.5095584664072077, "kl": 0.01657198928296566, "learning_rate": 9.646017699115042e-07, "loss": -0.0318, "num_tokens": 2482145.0, "reward": 0.5, "reward_std": 0.4472135901451111, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6176989078521729, "sampling/importance_sampling_ratio/mean": 1.0003668069839478, "sampling/importance_sampling_ratio/min": 0.459940105676651, "sampling/sampling_logp_difference/max": 0.7766590118408203, "sampling/sampling_logp_difference/mean": 0.016544148325920105, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 193.328125, "completions/mean_terminated_length": 193.328125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.5080606937408447, "epoch": 0.19646017699115045, "frac_reward_zero_std": 0.75, "grad_norm": 0.7919745122322969, "kl": 0.0173199363052845, "learning_rate": 9.734513274336282e-07, "loss": -0.0041, "num_tokens": 2508614.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4097967147827148, "sampling/importance_sampling_ratio/mean": 0.9998339414596558, "sampling/importance_sampling_ratio/min": 0.6026744246482849, "sampling/sampling_logp_difference/max": 0.506378173828125, "sampling/sampling_logp_difference/mean": 0.017503349110484123, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 164.0, "completions/mean_terminated_length": 164.0, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.49331408739089966, "epoch": 0.19823008849557522, "frac_reward_zero_std": 0.5, "grad_norm": 1.3789700100106055, "kl": 0.010955605655908585, "learning_rate": 9.82300884955752e-07, "loss": 0.018, "num_tokens": 2537686.0, "reward": 0.375, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.6250979900360107, "sampling/importance_sampling_ratio/mean": 1.0000886917114258, "sampling/importance_sampling_ratio/min": 0.7136193513870239, "sampling/sampling_logp_difference/max": 0.4855680465698242, "sampling/sampling_logp_difference/mean": 0.017850980162620544, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 142.25, "completions/mean_terminated_length": 142.25, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.4632031321525574, "epoch": 0.2, "frac_reward_zero_std": 0.75, "grad_norm": 1.1494826839551227, "kl": 0.011163798160851002, "learning_rate": 9.91150442477876e-07, "loss": -0.0053, "num_tokens": 2561142.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5633403062820435, "sampling/importance_sampling_ratio/mean": 1.000186800956726, "sampling/importance_sampling_ratio/min": 0.6731029748916626, "sampling/sampling_logp_difference/max": 0.4468247890472412, "sampling/sampling_logp_difference/mean": 0.017129603773355484, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 132.609375, "completions/mean_terminated_length": 132.609375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.5000830292701721, "epoch": 0.20176991150442478, "frac_reward_zero_std": 0.75, "grad_norm": 1.1237492563637732, "kl": 0.011804342269897461, "learning_rate": 1e-06, "loss": -0.0065, "num_tokens": 2582461.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.5152255296707153, "sampling/importance_sampling_ratio/mean": 1.000805139541626, "sampling/importance_sampling_ratio/min": 0.7269524931907654, "sampling/sampling_logp_difference/max": 0.41556429862976074, "sampling/sampling_logp_difference/mean": 0.01942405290901661, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 174.765625, "completions/mean_terminated_length": 174.765625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.42178767919540405, "epoch": 0.20353982300884957, "frac_reward_zero_std": 0.75, "grad_norm": 1.0131540511399926, "kl": 0.0098259337246418, "learning_rate": 9.99997614400677e-07, "loss": 0.012, "num_tokens": 2604750.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4035134315490723, "sampling/importance_sampling_ratio/mean": 1.0000475645065308, "sampling/importance_sampling_ratio/min": 0.6080347299575806, "sampling/sampling_logp_difference/max": 0.49752330780029297, "sampling/sampling_logp_difference/mean": 0.016035035252571106, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 172.96875, "completions/mean_terminated_length": 172.96875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.5256979465484619, "epoch": 0.20530973451327433, "frac_reward_zero_std": 0.75, "grad_norm": 0.8396764659600636, "kl": 0.01159391738474369, "learning_rate": 9.999904576254724e-07, "loss": 0.0023, "num_tokens": 2630236.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5071685314178467, "sampling/importance_sampling_ratio/mean": 1.0004163980484009, "sampling/importance_sampling_ratio/min": 0.7611066699028015, "sampling/sampling_logp_difference/max": 0.4102327823638916, "sampling/sampling_logp_difference/mean": 0.017749082297086716, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 181.6875, "completions/mean_terminated_length": 181.6875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.5166543126106262, "epoch": 0.20707964601769913, "frac_reward_zero_std": 0.5, "grad_norm": 1.1997542527548868, "kl": 0.013240222819149494, "learning_rate": 9.999785297426788e-07, "loss": 0.0383, "num_tokens": 2653560.0, "reward": 0.46875, "reward_std": 0.3723389506340027, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.3042196035385132, "sampling/importance_sampling_ratio/mean": 1.0007864236831665, "sampling/importance_sampling_ratio/min": 0.692559540271759, "sampling/sampling_logp_difference/max": 0.36736106872558594, "sampling/sampling_logp_difference/mean": 0.017134547233581543, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 158.15625, "completions/mean_terminated_length": 158.15625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.5524478554725647, "epoch": 0.2088495575221239, "frac_reward_zero_std": 0.5, "grad_norm": 1.385869401627046, "kl": 0.012404483743011951, "learning_rate": 9.999618308661168e-07, "loss": -0.0238, "num_tokens": 2674610.0, "reward": 0.71875, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.329261302947998, "sampling/importance_sampling_ratio/mean": 1.0006515979766846, "sampling/importance_sampling_ratio/min": 0.7105941772460938, "sampling/sampling_logp_difference/max": 0.34165382385253906, "sampling/sampling_logp_difference/mean": 0.01848817989230156, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 128.484375, "completions/mean_terminated_length": 128.484375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.39519980549812317, "epoch": 0.21061946902654868, "frac_reward_zero_std": 0.75, "grad_norm": 1.1890170889233096, "kl": 0.0123123899102211, "learning_rate": 9.99940361155134e-07, "loss": 0.0021, "num_tokens": 2692049.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.2881121635437012, "sampling/importance_sampling_ratio/mean": 0.9997977018356323, "sampling/importance_sampling_ratio/min": 0.743664026260376, "sampling/sampling_logp_difference/max": 0.29616594314575195, "sampling/sampling_logp_difference/mean": 0.01698697730898857, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 208.015625, "completions/mean_terminated_length": 208.015625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.6560627222061157, "epoch": 0.21238938053097345, "frac_reward_zero_std": 0.5, "grad_norm": 1.3285645147361667, "kl": 0.016548465937376022, "learning_rate": 9.999141208146027e-07, "loss": 0.0569, "num_tokens": 2717218.0, "reward": 0.0, "reward_std": 0.5123475193977356, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.531611680984497, "sampling/importance_sampling_ratio/mean": 1.0004523992538452, "sampling/importance_sampling_ratio/min": 0.7322604656219482, "sampling/sampling_logp_difference/max": 0.42632055282592773, "sampling/sampling_logp_difference/mean": 0.01894541271030903, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 158.265625, "completions/mean_terminated_length": 158.265625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.5611162185668945, "epoch": 0.21415929203539824, "frac_reward_zero_std": 0.5, "grad_norm": 1.4886446751012414, "kl": 0.016358420252799988, "learning_rate": 9.998831100949186e-07, "loss": 0.0003, "num_tokens": 2741459.0, "reward": 0.875, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.425057053565979, "sampling/importance_sampling_ratio/mean": 0.9999103546142578, "sampling/importance_sampling_ratio/min": 0.5411344170570374, "sampling/sampling_logp_difference/max": 0.6140875816345215, "sampling/sampling_logp_difference/mean": 0.018663184717297554, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 158.65625, "completions/mean_terminated_length": 158.65625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.43928951025009155, "epoch": 0.215929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.01645904357050317, "kl": 0.009950288571417332, "learning_rate": 9.998473292919985e-07, "loss": 0.0001, "num_tokens": 2764541.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3009637594223022, "sampling/importance_sampling_ratio/mean": 1.0002142190933228, "sampling/importance_sampling_ratio/min": 0.6546957492828369, "sampling/sampling_logp_difference/max": 0.4235846996307373, "sampling/sampling_logp_difference/mean": 0.015278320759534836, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 120.984375, "completions/mean_terminated_length": 120.984375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.2946837544441223, "epoch": 0.2176991150442478, "frac_reward_zero_std": 1.0, "grad_norm": 0.02478017919018473, "kl": 0.009915519505739212, "learning_rate": 9.99806778747277e-07, "loss": 0.0001, "num_tokens": 2781916.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4346206188201904, "sampling/importance_sampling_ratio/mean": 0.9996424913406372, "sampling/importance_sampling_ratio/min": 0.635346531867981, "sampling/sampling_logp_difference/max": 0.4535846710205078, "sampling/sampling_logp_difference/mean": 0.014531348831951618, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 212.0, "completions/max_terminated_length": 212.0, "completions/mean_length": 107.0, "completions/mean_terminated_length": 107.0, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.29190003871917725, "epoch": 0.21946902654867256, "frac_reward_zero_std": 1.0, "grad_norm": 0.028728227613374267, "kl": 0.010412806645035744, "learning_rate": 9.997614588477033e-07, "loss": 0.0001, "num_tokens": 2798876.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.421050786972046, "sampling/importance_sampling_ratio/mean": 1.0006955862045288, "sampling/importance_sampling_ratio/min": 0.7137504816055298, "sampling/sampling_logp_difference/max": 0.3513965606689453, "sampling/sampling_logp_difference/mean": 0.014932672493159771, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 181.015625, "completions/mean_terminated_length": 181.015625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.34129270911216736, "epoch": 0.22123893805309736, "frac_reward_zero_std": 0.75, "grad_norm": 1.0047889755198065, "kl": 0.011963277123868465, "learning_rate": 9.99711370025738e-07, "loss": 0.0003, "num_tokens": 2820333.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.2962805032730103, "sampling/importance_sampling_ratio/mean": 1.0000866651535034, "sampling/importance_sampling_ratio/min": 0.6254509687423706, "sampling/sampling_logp_difference/max": 0.4692823886871338, "sampling/sampling_logp_difference/mean": 0.012929601594805717, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 175.125, "completions/mean_terminated_length": 175.125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.37770137190818787, "epoch": 0.22300884955752212, "frac_reward_zero_std": 0.75, "grad_norm": 1.0151443363078754, "kl": 0.01518353633582592, "learning_rate": 9.996565127593489e-07, "loss": -0.0039, "num_tokens": 2841221.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5573248863220215, "sampling/importance_sampling_ratio/mean": 1.000090479850769, "sampling/importance_sampling_ratio/min": 0.6955075860023499, "sampling/sampling_logp_difference/max": 0.44296956062316895, "sampling/sampling_logp_difference/mean": 0.014279832132160664, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 125.375, "completions/mean_terminated_length": 125.375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.38288789987564087, "epoch": 0.2247787610619469, "frac_reward_zero_std": 0.75, "grad_norm": 9.291574908812418, "kl": 0.017631027847528458, "learning_rate": 9.995968875720051e-07, "loss": -0.0187, "num_tokens": 2864157.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.442091703414917, "sampling/importance_sampling_ratio/mean": 0.9988166093826294, "sampling/importance_sampling_ratio/min": 0.6271333694458008, "sampling/sampling_logp_difference/max": 0.4665961265563965, "sampling/sampling_logp_difference/mean": 0.015197532251477242, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 185.109375, "completions/mean_terminated_length": 185.109375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.3877907395362854, "epoch": 0.22654867256637168, "frac_reward_zero_std": 1.0, "grad_norm": 0.02394435841726851, "kl": 0.01672307401895523, "learning_rate": 9.995324950326745e-07, "loss": 0.0002, "num_tokens": 2887348.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6528104543685913, "sampling/importance_sampling_ratio/mean": 0.9994519352912903, "sampling/importance_sampling_ratio/min": 0.7085780501365662, "sampling/sampling_logp_difference/max": 0.5024771690368652, "sampling/sampling_logp_difference/mean": 0.01464095525443554, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 170.84375, "completions/mean_terminated_length": 170.84375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.4916332960128784, "epoch": 0.22831858407079647, "frac_reward_zero_std": 1.0, "grad_norm": 0.03916647784036259, "kl": 0.031004594638943672, "learning_rate": 9.994633357558158e-07, "loss": 0.0003, "num_tokens": 2909434.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2818790674209595, "sampling/importance_sampling_ratio/mean": 0.9994707107543945, "sampling/importance_sampling_ratio/min": 0.6706785559654236, "sampling/sampling_logp_difference/max": 0.39946532249450684, "sampling/sampling_logp_difference/mean": 0.01647157035768032, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 156.40625, "completions/mean_terminated_length": 156.40625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.46675926446914673, "epoch": 0.23008849557522124, "frac_reward_zero_std": 0.75, "grad_norm": 1.1741257957068456, "kl": 0.021152405068278313, "learning_rate": 9.993894104013746e-07, "loss": 0.0574, "num_tokens": 2929812.0, "reward": -0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.2836991548538208, "sampling/importance_sampling_ratio/mean": 0.9995951652526855, "sampling/importance_sampling_ratio/min": 0.6450011134147644, "sampling/sampling_logp_difference/max": 0.4385032653808594, "sampling/sampling_logp_difference/mean": 0.01770133152604103, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 154.4375, "completions/mean_terminated_length": 154.4375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.33106446266174316, "epoch": 0.23185840707964603, "frac_reward_zero_std": 1.0, "grad_norm": 0.027089062065130332, "kl": 0.020202994346618652, "learning_rate": 9.993107196747758e-07, "loss": 0.0002, "num_tokens": 2949408.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5536561012268066, "sampling/importance_sampling_ratio/mean": 1.0000596046447754, "sampling/importance_sampling_ratio/min": 0.7092294692993164, "sampling/sampling_logp_difference/max": 0.4406108856201172, "sampling/sampling_logp_difference/mean": 0.013444620184600353, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 189.640625, "completions/mean_terminated_length": 189.640625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.49644505977630615, "epoch": 0.2336283185840708, "frac_reward_zero_std": 0.75, "grad_norm": 0.8487458344539385, "kl": 0.02899659052491188, "learning_rate": 9.99227264326918e-07, "loss": 0.0148, "num_tokens": 2973593.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4009907245635986, "sampling/importance_sampling_ratio/mean": 1.0004339218139648, "sampling/importance_sampling_ratio/min": 0.697740912437439, "sampling/sampling_logp_difference/max": 0.3599073886871338, "sampling/sampling_logp_difference/mean": 0.015961207449436188, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 171.0625, "completions/mean_terminated_length": 171.0625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.523189902305603, "epoch": 0.23539823008849559, "frac_reward_zero_std": 0.75, "grad_norm": 0.8880329497640999, "kl": 0.025794755667448044, "learning_rate": 9.991390451541648e-07, "loss": -0.0173, "num_tokens": 2998269.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4689908027648926, "sampling/importance_sampling_ratio/mean": 0.9997839331626892, "sampling/importance_sampling_ratio/min": 0.65727299451828, "sampling/sampling_logp_difference/max": 0.4196559190750122, "sampling/sampling_logp_difference/mean": 0.017466718330979347, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 213.234375, "completions/mean_terminated_length": 213.234375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.5031307935714722, "epoch": 0.23716814159292035, "frac_reward_zero_std": 0.5, "grad_norm": 1.1444489279085417, "kl": 0.02678334340453148, "learning_rate": 9.990460629983388e-07, "loss": 0.0087, "num_tokens": 3023724.0, "reward": 0.1875, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.4857287406921387, "sampling/importance_sampling_ratio/mean": 1.0004141330718994, "sampling/importance_sampling_ratio/min": 0.6298375129699707, "sampling/sampling_logp_difference/max": 0.4622933864593506, "sampling/sampling_logp_difference/mean": 0.016972266137599945, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 162.078125, "completions/mean_terminated_length": 162.078125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.35991179943084717, "epoch": 0.23893805309734514, "frac_reward_zero_std": 1.0, "grad_norm": 0.028513651900094868, "kl": 0.02142981067299843, "learning_rate": 9.989483187467125e-07, "loss": 0.0002, "num_tokens": 3045377.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.535014033317566, "sampling/importance_sampling_ratio/mean": 1.0001251697540283, "sampling/importance_sampling_ratio/min": 0.6358522176742554, "sampling/sampling_logp_difference/max": 0.4527890682220459, "sampling/sampling_logp_difference/mean": 0.014851902611553669, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 121.625, "completions/mean_terminated_length": 121.625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.31467437744140625, "epoch": 0.2407079646017699, "frac_reward_zero_std": 1.0, "grad_norm": 0.05676225492585991, "kl": 0.025293543934822083, "learning_rate": 9.988458133320008e-07, "loss": 0.0003, "num_tokens": 3062985.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4338220357894897, "sampling/importance_sampling_ratio/mean": 1.0003433227539062, "sampling/importance_sampling_ratio/min": 0.6482194662094116, "sampling/sampling_logp_difference/max": 0.4335259199142456, "sampling/sampling_logp_difference/mean": 0.015467900782823563, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 246.1875, "completions/mean_terminated_length": 246.1875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.45606034994125366, "epoch": 0.2424778761061947, "frac_reward_zero_std": 1.0, "grad_norm": 0.027817480301703744, "kl": 0.028994332998991013, "learning_rate": 9.987385477323506e-07, "loss": 0.0003, "num_tokens": 3088773.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.558082938194275, "sampling/importance_sampling_ratio/mean": 0.999538242816925, "sampling/importance_sampling_ratio/min": 0.6333794593811035, "sampling/sampling_logp_difference/max": 0.45668554306030273, "sampling/sampling_logp_difference/mean": 0.014899208210408688, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 159.015625, "completions/mean_terminated_length": 159.015625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.31140589714050293, "epoch": 0.24424778761061947, "frac_reward_zero_std": 0.75, "grad_norm": 1.2857330417792834, "kl": 0.023789769038558006, "learning_rate": 9.98626522971333e-07, "loss": -0.012, "num_tokens": 3110774.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.6011077165603638, "sampling/importance_sampling_ratio/mean": 0.9999802708625793, "sampling/importance_sampling_ratio/min": 0.6121524572372437, "sampling/sampling_logp_difference/max": 0.49077391624450684, "sampling/sampling_logp_difference/mean": 0.01341700367629528, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 179.328125, "completions/mean_terminated_length": 179.328125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.3433701694011688, "epoch": 0.24601769911504426, "frac_reward_zero_std": 1.0, "grad_norm": 0.036298222737904895, "kl": 0.02790701389312744, "learning_rate": 9.985097401179333e-07, "loss": 0.0003, "num_tokens": 3132299.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4854481220245361, "sampling/importance_sampling_ratio/mean": 0.9998148679733276, "sampling/importance_sampling_ratio/min": 0.6218048930168152, "sampling/sampling_logp_difference/max": 0.4751288890838623, "sampling/sampling_logp_difference/mean": 0.014352495782077312, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1260.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 224.28125, "completions/mean_terminated_length": 224.28125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.3062925934791565, "epoch": 0.24778761061946902, "frac_reward_zero_std": 1.0, "grad_norm": 0.03537521382370573, "kl": 0.02674679271876812, "learning_rate": 9.98388200286539e-07, "loss": 0.0003, "num_tokens": 3156413.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4351998567581177, "sampling/importance_sampling_ratio/mean": 0.9995108842849731, "sampling/importance_sampling_ratio/min": 0.6957627534866333, "sampling/sampling_logp_difference/max": 0.3627464771270752, "sampling/sampling_logp_difference/mean": 0.012646778486669064, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 258.265625, "completions/mean_terminated_length": 258.265625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.4568946957588196, "epoch": 0.24955752212389382, "frac_reward_zero_std": 1.0, "grad_norm": 0.03188772400383102, "kl": 0.03366459906101227, "learning_rate": 9.98261904636932e-07, "loss": 0.0003, "num_tokens": 3183054.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.374427080154419, "sampling/importance_sampling_ratio/mean": 0.9999855756759644, "sampling/importance_sampling_ratio/min": 0.6346470713615417, "sampling/sampling_logp_difference/max": 0.45468616485595703, "sampling/sampling_logp_difference/mean": 0.014311404898762703, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 288.890625, "completions/mean_terminated_length": 288.890625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.5986075401306152, "epoch": 0.2513274336283186, "frac_reward_zero_std": 0.5, "grad_norm": 0.9051967590407283, "kl": 0.04546056687831879, "learning_rate": 9.981308543742756e-07, "loss": -0.0179, "num_tokens": 3213959.0, "reward": 0.0, "reward_std": 0.5, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5933138132095337, "sampling/importance_sampling_ratio/mean": 1.0001044273376465, "sampling/importance_sampling_ratio/min": 0.6030287146568298, "sampling/sampling_logp_difference/max": 0.5057904720306396, "sampling/sampling_logp_difference/mean": 0.01623942330479622, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 139.03125, "completions/mean_terminated_length": 139.03125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.3082438111305237, "epoch": 0.25309734513274335, "frac_reward_zero_std": 1.0, "grad_norm": 0.0346740695433099, "kl": 0.026018641889095306, "learning_rate": 9.979950507491033e-07, "loss": 0.0003, "num_tokens": 3232889.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4102617502212524, "sampling/importance_sampling_ratio/mean": 1.000148892402649, "sampling/importance_sampling_ratio/min": 0.7250264286994934, "sampling/sampling_logp_difference/max": 0.34377527236938477, "sampling/sampling_logp_difference/mean": 0.013314444571733475, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 154.265625, "completions/mean_terminated_length": 154.265625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.34074559807777405, "epoch": 0.25486725663716814, "frac_reward_zero_std": 1.0, "grad_norm": 0.03860775273382935, "kl": 0.03532181680202484, "learning_rate": 9.978544950573073e-07, "loss": 0.0004, "num_tokens": 3251242.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5971453189849854, "sampling/importance_sampling_ratio/mean": 1.0001463890075684, "sampling/importance_sampling_ratio/min": 0.6404089331626892, "sampling/sampling_logp_difference/max": 0.4682178497314453, "sampling/sampling_logp_difference/mean": 0.013817005790770054, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 147.515625, "completions/mean_terminated_length": 147.515625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.2873741686344147, "epoch": 0.25663716814159293, "frac_reward_zero_std": 1.0, "grad_norm": 0.04067253581570143, "kl": 0.03148189187049866, "learning_rate": 9.97709188640126e-07, "loss": 0.0003, "num_tokens": 3271099.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2917137145996094, "sampling/importance_sampling_ratio/mean": 0.9992789626121521, "sampling/importance_sampling_ratio/min": 0.616651713848114, "sampling/sampling_logp_difference/max": 0.48345088958740234, "sampling/sampling_logp_difference/mean": 0.012774511240422726, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 201.03125, "completions/mean_terminated_length": 201.03125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.4325287938117981, "epoch": 0.2584070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 0.8993299382546053, "kl": 0.03885094076395035, "learning_rate": 9.975591328841304e-07, "loss": 0.0169, "num_tokens": 3296477.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.537389874458313, "sampling/importance_sampling_ratio/mean": 0.9997760057449341, "sampling/importance_sampling_ratio/min": 0.729432225227356, "sampling/sampling_logp_difference/max": 0.4300861358642578, "sampling/sampling_logp_difference/mean": 0.014719918370246887, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 166.1875, "completions/mean_terminated_length": 166.1875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.4421290159225464, "epoch": 0.26017699115044246, "frac_reward_zero_std": 0.75, "grad_norm": 1.1663039993812925, "kl": 0.03346528485417366, "learning_rate": 9.974043292212127e-07, "loss": -0.0257, "num_tokens": 3324201.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5876429080963135, "sampling/importance_sampling_ratio/mean": 0.9990295171737671, "sampling/importance_sampling_ratio/min": 0.694132924079895, "sampling/sampling_logp_difference/max": 0.4622504711151123, "sampling/sampling_logp_difference/mean": 0.016207978129386902, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 288.78125, "completions/mean_terminated_length": 288.78125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.4689440131187439, "epoch": 0.26194690265486725, "frac_reward_zero_std": 1.0, "grad_norm": 0.027072815911706842, "kl": 0.03730308637022972, "learning_rate": 9.97244779128571e-07, "loss": 0.0004, "num_tokens": 3355563.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3829669952392578, "sampling/importance_sampling_ratio/mean": 1.0002778768539429, "sampling/importance_sampling_ratio/min": 0.6509962677955627, "sampling/sampling_logp_difference/max": 0.42925143241882324, "sampling/sampling_logp_difference/mean": 0.014328066259622574, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 225.9375, "completions/mean_terminated_length": 225.9375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.3772192597389221, "epoch": 0.26371681415929205, "frac_reward_zero_std": 1.0, "grad_norm": 0.02721464347010123, "kl": 0.027017677202820778, "learning_rate": 9.970804841286953e-07, "loss": 0.0003, "num_tokens": 3380599.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4368870258331299, "sampling/importance_sampling_ratio/mean": 1.0000723600387573, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.45119690895080566, "sampling/sampling_logp_difference/mean": 0.01404891163110733, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 250.78125, "completions/mean_terminated_length": 250.78125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.408069372177124, "epoch": 0.26548672566371684, "frac_reward_zero_std": 1.0, "grad_norm": 0.024763661217230747, "kl": 0.03320477902889252, "learning_rate": 9.969114457893539e-07, "loss": 0.0003, "num_tokens": 3409225.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5792803764343262, "sampling/importance_sampling_ratio/mean": 0.9997915029525757, "sampling/importance_sampling_ratio/min": 0.6956390738487244, "sampling/sampling_logp_difference/max": 0.4569692611694336, "sampling/sampling_logp_difference/mean": 0.013673271983861923, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 200.765625, "completions/mean_terminated_length": 200.765625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.4479539096355438, "epoch": 0.2672566371681416, "frac_reward_zero_std": 1.0, "grad_norm": 0.03492702919427098, "kl": 0.040304169058799744, "learning_rate": 9.967376657235778e-07, "loss": 0.0005, "num_tokens": 3432458.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.511489748954773, "sampling/importance_sampling_ratio/mean": 1.0001468658447266, "sampling/importance_sampling_ratio/min": 0.5509209036827087, "sampling/sampling_logp_difference/max": 0.5961639881134033, "sampling/sampling_logp_difference/mean": 0.016600418835878372, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 233.75, "completions/mean_terminated_length": 233.75, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.40500524640083313, "epoch": 0.26902654867256637, "frac_reward_zero_std": 1.0, "grad_norm": 0.03434521781959229, "kl": 0.0346105583012104, "learning_rate": 9.965591455896455e-07, "loss": 0.0004, "num_tokens": 3459130.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2980811595916748, "sampling/importance_sampling_ratio/mean": 1.000051736831665, "sampling/importance_sampling_ratio/min": 0.6824501752853394, "sampling/sampling_logp_difference/max": 0.3820657730102539, "sampling/sampling_logp_difference/mean": 0.014134477823972702, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1221.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 354.90625, "completions/mean_terminated_length": 354.90625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.3681276738643646, "epoch": 0.27079646017699116, "frac_reward_zero_std": 1.0, "grad_norm": 0.01678495946536908, "kl": 0.02303595468401909, "learning_rate": 9.96375887091067e-07, "loss": 0.0002, "num_tokens": 3494548.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.343514323234558, "sampling/importance_sampling_ratio/mean": 0.9999706149101257, "sampling/importance_sampling_ratio/min": 0.6474516987800598, "sampling/sampling_logp_difference/max": 0.4347110986709595, "sampling/sampling_logp_difference/mean": 0.011611346155405045, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 301.46875, "completions/mean_terminated_length": 301.46875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.35046854615211487, "epoch": 0.27256637168141595, "frac_reward_zero_std": 1.0, "grad_norm": 0.018301977976502796, "kl": 0.018390771001577377, "learning_rate": 9.961878919765677e-07, "loss": 0.0002, "num_tokens": 3527010.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6247203350067139, "sampling/importance_sampling_ratio/mean": 1.0001691579818726, "sampling/importance_sampling_ratio/min": 0.49258702993392944, "sampling/sampling_logp_difference/max": 0.7080841064453125, "sampling/sampling_logp_difference/mean": 0.011635717004537582, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 190.40625, "completions/mean_terminated_length": 190.40625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.3786317706108093, "epoch": 0.2743362831858407, "frac_reward_zero_std": 1.0, "grad_norm": 0.03390067600192146, "kl": 0.03351679816842079, "learning_rate": 9.959951620400718e-07, "loss": 0.0004, "num_tokens": 3548684.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3179739713668823, "sampling/importance_sampling_ratio/mean": 1.0002996921539307, "sampling/importance_sampling_ratio/min": 0.6940809488296509, "sampling/sampling_logp_difference/max": 0.36516666412353516, "sampling/sampling_logp_difference/mean": 0.014952241443097591, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 289.421875, "completions/mean_terminated_length": 289.421875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.41793277859687805, "epoch": 0.2761061946902655, "frac_reward_zero_std": 1.0, "grad_norm": 0.027633166062037313, "kl": 0.03265716880559921, "learning_rate": 9.957976991206845e-07, "loss": 0.0003, "num_tokens": 3578055.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5194100141525269, "sampling/importance_sampling_ratio/mean": 0.999330997467041, "sampling/importance_sampling_ratio/min": 0.6532416343688965, "sampling/sampling_logp_difference/max": 0.4258081912994385, "sampling/sampling_logp_difference/mean": 0.01384137012064457, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 219.671875, "completions/mean_terminated_length": 219.671875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.4456171989440918, "epoch": 0.2778761061946903, "frac_reward_zero_std": 1.0, "grad_norm": 0.03839060018604204, "kl": 0.0445060208439827, "learning_rate": 9.955955051026758e-07, "loss": 0.0005, "num_tokens": 3604290.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4353132247924805, "sampling/importance_sampling_ratio/mean": 0.9999475479125977, "sampling/importance_sampling_ratio/min": 0.6802029609680176, "sampling/sampling_logp_difference/max": 0.3853640556335449, "sampling/sampling_logp_difference/mean": 0.014434573240578175, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 235.515625, "completions/mean_terminated_length": 235.515625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.3188089430332184, "epoch": 0.27964601769911507, "frac_reward_zero_std": 1.0, "grad_norm": 0.020726454533559042, "kl": 0.020972244441509247, "learning_rate": 9.953885819154614e-07, "loss": 0.0002, "num_tokens": 3630067.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5071592330932617, "sampling/importance_sampling_ratio/mean": 1.000205159187317, "sampling/importance_sampling_ratio/min": 0.639909029006958, "sampling/sampling_logp_difference/max": 0.4464292526245117, "sampling/sampling_logp_difference/mean": 0.011311270296573639, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 239.65625, "completions/mean_terminated_length": 239.65625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.588887631893158, "epoch": 0.2814159292035398, "frac_reward_zero_std": 1.0, "grad_norm": 0.038805979135392424, "kl": 0.05581683665513992, "learning_rate": 9.951769315335843e-07, "loss": 0.0006, "num_tokens": 3656461.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.280115008354187, "sampling/importance_sampling_ratio/mean": 1.0008742809295654, "sampling/importance_sampling_ratio/min": 0.6223615407943726, "sampling/sampling_logp_difference/max": 0.47423410415649414, "sampling/sampling_logp_difference/mean": 0.01711086928844452, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 227.390625, "completions/mean_terminated_length": 227.390625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.4441239535808563, "epoch": 0.2831858407079646, "frac_reward_zero_std": 1.0, "grad_norm": 0.03132778531647549, "kl": 0.04597689211368561, "learning_rate": 9.949605559766967e-07, "loss": 0.0005, "num_tokens": 3682422.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3805807828903198, "sampling/importance_sampling_ratio/mean": 1.0000271797180176, "sampling/importance_sampling_ratio/min": 0.6719688773155212, "sampling/sampling_logp_difference/max": 0.3975433111190796, "sampling/sampling_logp_difference/mean": 0.014202705584466457, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 233.9375, "completions/mean_terminated_length": 233.9375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.4425801634788513, "epoch": 0.2849557522123894, "frac_reward_zero_std": 1.0, "grad_norm": 0.026878682207162892, "kl": 0.03890957683324814, "learning_rate": 9.947394573095402e-07, "loss": 0.0004, "num_tokens": 3708082.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2759809494018555, "sampling/importance_sampling_ratio/mean": 0.9999120235443115, "sampling/importance_sampling_ratio/min": 0.6183002591133118, "sampling/sampling_logp_difference/max": 0.48078107833862305, "sampling/sampling_logp_difference/mean": 0.014461169950664043, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 154.203125, "completions/mean_terminated_length": 154.203125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.367497980594635, "epoch": 0.2867256637168142, "frac_reward_zero_std": 1.0, "grad_norm": 0.030844164307948486, "kl": 0.031070904806256294, "learning_rate": 9.945136376419258e-07, "loss": 0.0004, "num_tokens": 3728895.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4351998567581177, "sampling/importance_sampling_ratio/mean": 1.0001007318496704, "sampling/importance_sampling_ratio/min": 0.6359556317329407, "sampling/sampling_logp_difference/max": 0.45262646675109863, "sampling/sampling_logp_difference/mean": 0.015953868627548218, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 205.703125, "completions/mean_terminated_length": 205.703125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.448856383562088, "epoch": 0.2884955752212389, "frac_reward_zero_std": 1.0, "grad_norm": 0.05840902340127153, "kl": 0.040665335953235626, "learning_rate": 9.942830991287149e-07, "loss": 0.0004, "num_tokens": 3755004.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3960000276565552, "sampling/importance_sampling_ratio/mean": 1.0006427764892578, "sampling/importance_sampling_ratio/min": 0.5062552094459534, "sampling/sampling_logp_difference/max": 0.6807143688201904, "sampling/sampling_logp_difference/mean": 0.015489346347749233, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 169.0625, "completions/mean_terminated_length": 169.0625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.36270537972450256, "epoch": 0.2902654867256637, "frac_reward_zero_std": 1.0, "grad_norm": 0.030222286248721437, "kl": 0.028047749772667885, "learning_rate": 9.940478439697972e-07, "loss": 0.0003, "num_tokens": 3775056.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3460074663162231, "sampling/importance_sampling_ratio/mean": 0.9998701810836792, "sampling/importance_sampling_ratio/min": 0.6988232135772705, "sampling/sampling_logp_difference/max": 0.35835742950439453, "sampling/sampling_logp_difference/mean": 0.014132875949144363, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 216.078125, "completions/mean_terminated_length": 216.078125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.46945494413375854, "epoch": 0.2920353982300885, "frac_reward_zero_std": 1.0, "grad_norm": 0.03360911814839701, "kl": 0.04232267290353775, "learning_rate": 9.93807874410071e-07, "loss": 0.0005, "num_tokens": 3801413.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.5725798606872559, "sampling/importance_sampling_ratio/mean": 0.9995458126068115, "sampling/importance_sampling_ratio/min": 0.6771509051322937, "sampling/sampling_logp_difference/max": 0.45271754264831543, "sampling/sampling_logp_difference/mean": 0.014892792329192162, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 176.703125, "completions/mean_terminated_length": 176.703125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.4789275527000427, "epoch": 0.2938053097345133, "frac_reward_zero_std": 1.0, "grad_norm": 0.03239150066842321, "kl": 0.04120933637022972, "learning_rate": 9.935631927394214e-07, "loss": 0.0005, "num_tokens": 3824354.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3178095817565918, "sampling/importance_sampling_ratio/mean": 0.9998352527618408, "sampling/importance_sampling_ratio/min": 0.6385133862495422, "sampling/sampling_logp_difference/max": 0.44861268997192383, "sampling/sampling_logp_difference/mean": 0.01687219925224781, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1385.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 220.671875, "completions/mean_terminated_length": 220.671875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.3237878978252411, "epoch": 0.29557522123893804, "frac_reward_zero_std": 1.0, "grad_norm": 0.020555933492004103, "kl": 0.019533012062311172, "learning_rate": 9.93313801292698e-07, "loss": 0.0002, "num_tokens": 3848093.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4228062629699707, "sampling/importance_sampling_ratio/mean": 0.999441385269165, "sampling/importance_sampling_ratio/min": 0.6771497130393982, "sampling/sampling_logp_difference/max": 0.38986289501190186, "sampling/sampling_logp_difference/mean": 0.012553451582789421, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 205.359375, "completions/mean_terminated_length": 205.359375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.26872894167900085, "epoch": 0.2973451327433628, "frac_reward_zero_std": 1.0, "grad_norm": 0.022728436780581552, "kl": 0.017619963735342026, "learning_rate": 9.93059702449693e-07, "loss": 0.0002, "num_tokens": 3870868.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5467859506607056, "sampling/importance_sampling_ratio/mean": 0.9995690584182739, "sampling/importance_sampling_ratio/min": 0.6017630100250244, "sampling/sampling_logp_difference/max": 0.5078915357589722, "sampling/sampling_logp_difference/mean": 0.012646839022636414, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 141.390625, "completions/mean_terminated_length": 141.390625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.34829992055892944, "epoch": 0.2991150442477876, "frac_reward_zero_std": 0.75, "grad_norm": 1.2453504342625805, "kl": 0.029450297355651855, "learning_rate": 9.928008986351186e-07, "loss": -0.0426, "num_tokens": 3890445.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3386385440826416, "sampling/importance_sampling_ratio/mean": 0.9994571208953857, "sampling/importance_sampling_ratio/min": 0.6298378109931946, "sampling/sampling_logp_difference/max": 0.4622929096221924, "sampling/sampling_logp_difference/mean": 0.014471322298049927, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 212.671875, "completions/mean_terminated_length": 212.671875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.5321623086929321, "epoch": 0.3008849557522124, "frac_reward_zero_std": 0.75, "grad_norm": 0.8178777238789101, "kl": 0.04331979900598526, "learning_rate": 9.925373923185834e-07, "loss": 0.0202, "num_tokens": 3918168.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.2948603630065918, "sampling/importance_sampling_ratio/mean": 1.000131368637085, "sampling/importance_sampling_ratio/min": 0.6646633148193359, "sampling/sampling_logp_difference/max": 0.4084746837615967, "sampling/sampling_logp_difference/mean": 0.01630018651485443, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1472.0, "completions/max_terminated_length": 1472.0, "completions/mean_length": 246.5625, "completions/mean_terminated_length": 246.5625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.43632301688194275, "epoch": 0.30265486725663715, "frac_reward_zero_std": 1.0, "grad_norm": 0.020989156876186344, "kl": 0.03128629922866821, "learning_rate": 9.922691860145696e-07, "loss": 0.0003, "num_tokens": 3949628.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6568189859390259, "sampling/importance_sampling_ratio/mean": 1.0005426406860352, "sampling/importance_sampling_ratio/min": 0.5483725070953369, "sampling/sampling_logp_difference/max": 0.6008005142211914, "sampling/sampling_logp_difference/mean": 0.015076635405421257, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 200.453125, "completions/mean_terminated_length": 200.453125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.4134460985660553, "epoch": 0.30442477876106194, "frac_reward_zero_std": 1.0, "grad_norm": 0.12538438339130348, "kl": 0.0488395132124424, "learning_rate": 9.919962822824083e-07, "loss": 0.0006, "num_tokens": 3978249.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5698164701461792, "sampling/importance_sampling_ratio/mean": 0.9996939897537231, "sampling/importance_sampling_ratio/min": 0.665626049041748, "sampling/sampling_logp_difference/max": 0.4509587287902832, "sampling/sampling_logp_difference/mean": 0.015557809732854366, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 106.59375, "completions/mean_terminated_length": 106.59375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.28162699937820435, "epoch": 0.30619469026548674, "frac_reward_zero_std": 1.0, "grad_norm": 0.033040203096103796, "kl": 0.01680285856127739, "learning_rate": 9.91718683726255e-07, "loss": 0.0002, "num_tokens": 3994175.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6337953805923462, "sampling/importance_sampling_ratio/mean": 0.9994305968284607, "sampling/importance_sampling_ratio/min": 0.6063137650489807, "sampling/sampling_logp_difference/max": 0.5003576278686523, "sampling/sampling_logp_difference/mean": 0.015085892751812935, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 226.046875, "completions/mean_terminated_length": 226.046875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.4104277491569519, "epoch": 0.30796460176991153, "frac_reward_zero_std": 1.0, "grad_norm": 0.023710154914067024, "kl": 0.030415624380111694, "learning_rate": 9.914363929950657e-07, "loss": 0.0004, "num_tokens": 4018834.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3802553415298462, "sampling/importance_sampling_ratio/mean": 1.0001755952835083, "sampling/importance_sampling_ratio/min": 0.48663073778152466, "sampling/sampling_logp_difference/max": 0.7202496528625488, "sampling/sampling_logp_difference/mean": 0.015630293637514114, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 131.171875, "completions/mean_terminated_length": 131.171875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.2812592387199402, "epoch": 0.30973451327433627, "frac_reward_zero_std": 1.0, "grad_norm": 0.031217325718960937, "kl": 0.019437432289123535, "learning_rate": 9.91149412782571e-07, "loss": 0.0002, "num_tokens": 4035165.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.293818473815918, "sampling/importance_sampling_ratio/mean": 1.0000535249710083, "sampling/importance_sampling_ratio/min": 0.6472192406654358, "sampling/sampling_logp_difference/max": 0.4350701570510864, "sampling/sampling_logp_difference/mean": 0.012308983132243156, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 224.953125, "completions/mean_terminated_length": 224.953125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.4822311997413635, "epoch": 0.31150442477876106, "frac_reward_zero_std": 1.0, "grad_norm": 0.030771598024009236, "kl": 0.0388898141682148, "learning_rate": 9.908577458272495e-07, "loss": 0.0004, "num_tokens": 4061178.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3889068365097046, "sampling/importance_sampling_ratio/mean": 0.9999488592147827, "sampling/importance_sampling_ratio/min": 0.6047934889793396, "sampling/sampling_logp_difference/max": 0.5028681755065918, "sampling/sampling_logp_difference/mean": 0.015290592797100544, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 146.4375, "completions/mean_terminated_length": 146.4375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.33842039108276367, "epoch": 0.31327433628318585, "frac_reward_zero_std": 1.0, "grad_norm": 0.027339872557214075, "kl": 0.017229732125997543, "learning_rate": 9.905613949123034e-07, "loss": 0.0002, "num_tokens": 4082726.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999748706817627, "sampling/importance_sampling_ratio/min": 0.7109527587890625, "sampling/sampling_logp_difference/max": 0.7022933959960938, "sampling/sampling_logp_difference/mean": 0.01365036703646183, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 228.984375, "completions/mean_terminated_length": 228.984375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.4279431700706482, "epoch": 0.31504424778761064, "frac_reward_zero_std": 1.0, "grad_norm": 0.028186699292072136, "kl": 0.03191818296909332, "learning_rate": 9.902603628656311e-07, "loss": 0.0004, "num_tokens": 4107733.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4215264320373535, "sampling/importance_sampling_ratio/mean": 0.9996159672737122, "sampling/importance_sampling_ratio/min": 0.6781244874000549, "sampling/sampling_logp_difference/max": 0.3884243965148926, "sampling/sampling_logp_difference/mean": 0.013241034932434559, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 175.734375, "completions/mean_terminated_length": 175.734375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.3895474672317505, "epoch": 0.3168141592920354, "frac_reward_zero_std": 1.0, "grad_norm": 0.04318810553249402, "kl": 0.025557177141308784, "learning_rate": 9.899546525597997e-07, "loss": 0.0003, "num_tokens": 4129508.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5860437154769897, "sampling/importance_sampling_ratio/mean": 1.000396966934204, "sampling/importance_sampling_ratio/min": 0.6368714570999146, "sampling/sampling_logp_difference/max": 0.46124267578125, "sampling/sampling_logp_difference/mean": 0.016088902950286865, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 196.296875, "completions/mean_terminated_length": 196.296875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.4673101305961609, "epoch": 0.3185840707964602, "frac_reward_zero_std": 1.0, "grad_norm": 0.042519648945525956, "kl": 0.04119706153869629, "learning_rate": 9.896442669120187e-07, "loss": 0.0004, "num_tokens": 4154183.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4919874668121338, "sampling/importance_sampling_ratio/mean": 1.0002926588058472, "sampling/importance_sampling_ratio/min": 0.6402009725570679, "sampling/sampling_logp_difference/max": 0.44597315788269043, "sampling/sampling_logp_difference/mean": 0.0154157355427742, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 168.359375, "completions/mean_terminated_length": 168.359375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.3967672884464264, "epoch": 0.32035398230088497, "frac_reward_zero_std": 1.0, "grad_norm": 0.027599792756827137, "kl": 0.0274306982755661, "learning_rate": 9.893292088841108e-07, "loss": 0.0003, "num_tokens": 4174158.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.361751914024353, "sampling/importance_sampling_ratio/mean": 1.0000026226043701, "sampling/importance_sampling_ratio/min": 0.6568920612335205, "sampling/sampling_logp_difference/max": 0.4202355146408081, "sampling/sampling_logp_difference/mean": 0.015374141745269299, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 280.921875, "completions/mean_terminated_length": 280.921875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.46510568261146545, "epoch": 0.32212389380530976, "frac_reward_zero_std": 1.0, "grad_norm": 0.11045246901719807, "kl": 0.039309095591306686, "learning_rate": 9.890094814824852e-07, "loss": 0.0004, "num_tokens": 4204857.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3266682624816895, "sampling/importance_sampling_ratio/mean": 0.9999687671661377, "sampling/importance_sampling_ratio/min": 0.6524989604949951, "sampling/sampling_logp_difference/max": 0.42694568634033203, "sampling/sampling_logp_difference/mean": 0.015028094872832298, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 119.25, "completions/mean_terminated_length": 119.25, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.3483262062072754, "epoch": 0.3238938053097345, "frac_reward_zero_std": 1.0, "grad_norm": 0.02743467795406971, "kl": 0.024161968380212784, "learning_rate": 9.886850877581078e-07, "loss": 0.0003, "num_tokens": 4223289.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6014540195465088, "sampling/importance_sampling_ratio/mean": 0.999658465385437, "sampling/importance_sampling_ratio/min": 0.623564600944519, "sampling/sampling_logp_difference/max": 0.4723029136657715, "sampling/sampling_logp_difference/mean": 0.015557544305920601, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 139.265625, "completions/mean_terminated_length": 139.265625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.36477214097976685, "epoch": 0.3256637168141593, "frac_reward_zero_std": 1.0, "grad_norm": 0.032238574152394166, "kl": 0.02369370311498642, "learning_rate": 9.883560308064722e-07, "loss": 0.0003, "num_tokens": 4242426.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6076874732971191, "sampling/importance_sampling_ratio/mean": 1.0006036758422852, "sampling/importance_sampling_ratio/min": 0.7465273141860962, "sampling/sampling_logp_difference/max": 0.47479677200317383, "sampling/sampling_logp_difference/mean": 0.015426398254930973, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 190.4375, "completions/mean_terminated_length": 190.4375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.37622520327568054, "epoch": 0.3274336283185841, "frac_reward_zero_std": 1.0, "grad_norm": 0.02247795274545629, "kl": 0.019452273845672607, "learning_rate": 9.880223137675707e-07, "loss": 0.0003, "num_tokens": 4264630.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3681427240371704, "sampling/importance_sampling_ratio/mean": 0.9998056292533875, "sampling/importance_sampling_ratio/min": 0.6489084362983704, "sampling/sampling_logp_difference/max": 0.4324636459350586, "sampling/sampling_logp_difference/mean": 0.015204913914203644, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 138.71875, "completions/mean_terminated_length": 138.71875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.28751200437545776, "epoch": 0.3292035398230089, "frac_reward_zero_std": 1.0, "grad_norm": 0.023669913508627505, "kl": 0.013852202333509922, "learning_rate": 9.876839398258639e-07, "loss": 0.0001, "num_tokens": 4284660.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.573148488998413, "sampling/importance_sampling_ratio/mean": 1.000110387802124, "sampling/importance_sampling_ratio/min": 0.6071493029594421, "sampling/sampling_logp_difference/max": 0.4989805221557617, "sampling/sampling_logp_difference/mean": 0.014854800887405872, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 146.140625, "completions/mean_terminated_length": 146.140625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.3645159900188446, "epoch": 0.3309734513274336, "frac_reward_zero_std": 1.0, "grad_norm": 0.11880412522046792, "kl": 0.027670051902532578, "learning_rate": 9.873409122102503e-07, "loss": 0.0004, "num_tokens": 4304429.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4720406532287598, "sampling/importance_sampling_ratio/mean": 0.999610185623169, "sampling/importance_sampling_ratio/min": 0.6360495090484619, "sampling/sampling_logp_difference/max": 0.45247888565063477, "sampling/sampling_logp_difference/mean": 0.01561033632606268, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 271.203125, "completions/mean_terminated_length": 271.203125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.3917270600795746, "epoch": 0.3327433628318584, "frac_reward_zero_std": 1.0, "grad_norm": 0.02590847844041066, "kl": 0.02397763356566429, "learning_rate": 9.869932341940358e-07, "loss": 0.0003, "num_tokens": 4332778.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.608748435974121, "sampling/importance_sampling_ratio/mean": 1.0001976490020752, "sampling/importance_sampling_ratio/min": 0.6262629628181458, "sampling/sampling_logp_difference/max": 0.47545647621154785, "sampling/sampling_logp_difference/mean": 0.013658429495990276, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 145.6875, "completions/mean_terminated_length": 145.6875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.23473447561264038, "epoch": 0.3345132743362832, "frac_reward_zero_std": 1.0, "grad_norm": 0.020008689326622402, "kl": 0.011209280230104923, "learning_rate": 9.86640909094902e-07, "loss": 0.0001, "num_tokens": 4352374.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3670518398284912, "sampling/importance_sampling_ratio/mean": 0.9996951818466187, "sampling/importance_sampling_ratio/min": 0.6023116111755371, "sampling/sampling_logp_difference/max": 0.5069804191589355, "sampling/sampling_logp_difference/mean": 0.011350013315677643, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 295.09375, "completions/mean_terminated_length": 295.09375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "entropy": 0.46837174892425537, "epoch": 0.336283185840708, "frac_reward_zero_std": 1.0, "grad_norm": 0.02787908959420259, "kl": 0.026032259687781334, "learning_rate": 9.862839402748753e-07, "loss": 0.0003, "num_tokens": 4382476.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.405807375907898, "sampling/importance_sampling_ratio/mean": 1.0000739097595215, "sampling/importance_sampling_ratio/min": 0.771156907081604, "sampling/sampling_logp_difference/max": 0.3406118154525757, "sampling/sampling_logp_difference/mean": 0.014217447489500046, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 170.859375, "completions/mean_terminated_length": 170.859375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.2878764271736145, "epoch": 0.3380530973451327, "frac_reward_zero_std": 1.0, "grad_norm": 0.01399220170190771, "kl": 0.010474582202732563, "learning_rate": 9.859223311402936e-07, "loss": 0.0001, "num_tokens": 4403603.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.526659607887268, "sampling/importance_sampling_ratio/mean": 0.9999752640724182, "sampling/importance_sampling_ratio/min": 0.6080242991447449, "sampling/sampling_logp_difference/max": 0.4975404739379883, "sampling/sampling_logp_difference/mean": 0.012195511721074581, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 164.671875, "completions/mean_terminated_length": 164.671875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.4259641766548157, "epoch": 0.3398230088495575, "frac_reward_zero_std": 1.0, "grad_norm": 0.02764650026709833, "kl": 0.023068472743034363, "learning_rate": 9.85556085141775e-07, "loss": 0.0003, "num_tokens": 4426350.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3869491815567017, "sampling/importance_sampling_ratio/mean": 1.0000137090682983, "sampling/importance_sampling_ratio/min": 0.6759111881256104, "sampling/sampling_logp_difference/max": 0.3916935920715332, "sampling/sampling_logp_difference/mean": 0.016428587958216667, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 178.3125, "completions/mean_terminated_length": 178.3125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.3221898674964905, "epoch": 0.3415929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.01953206362168495, "kl": 0.01120888814330101, "learning_rate": 9.851852057741844e-07, "loss": 0.0001, "num_tokens": 4449362.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.441954255104065, "sampling/importance_sampling_ratio/mean": 1.0001592636108398, "sampling/importance_sampling_ratio/min": 0.6792446970939636, "sampling/sampling_logp_difference/max": 0.38677382469177246, "sampling/sampling_logp_difference/mean": 0.013950981199741364, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 167.84375, "completions/mean_terminated_length": 167.84375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.36951327323913574, "epoch": 0.3433628318584071, "frac_reward_zero_std": 1.0, "grad_norm": 0.021651804598215342, "kl": 0.02235507220029831, "learning_rate": 9.848096965766002e-07, "loss": 0.0003, "num_tokens": 4470872.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5216445922851562, "sampling/importance_sampling_ratio/mean": 0.9995793104171753, "sampling/importance_sampling_ratio/min": 0.7712951898574829, "sampling/sampling_logp_difference/max": 0.41979169845581055, "sampling/sampling_logp_difference/mean": 0.014211702160537243, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 151.515625, "completions/mean_terminated_length": 151.515625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.3271106481552124, "epoch": 0.34513274336283184, "frac_reward_zero_std": 1.0, "grad_norm": 0.021102853279036434, "kl": 0.015833543613553047, "learning_rate": 9.844295611322803e-07, "loss": 0.0002, "num_tokens": 4490361.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2862383127212524, "sampling/importance_sampling_ratio/mean": 0.9994319677352905, "sampling/importance_sampling_ratio/min": 0.6054958701133728, "sampling/sampling_logp_difference/max": 0.5017075538635254, "sampling/sampling_logp_difference/mean": 0.013794454745948315, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 155.796875, "completions/mean_terminated_length": 155.796875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.46761173009872437, "epoch": 0.34690265486725663, "frac_reward_zero_std": 0.75, "grad_norm": 1.1197933032331047, "kl": 0.02969934791326523, "learning_rate": 9.84044803068628e-07, "loss": 0.0277, "num_tokens": 4510972.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4048992395401, "sampling/importance_sampling_ratio/mean": 1.0003435611724854, "sampling/importance_sampling_ratio/min": 0.5983700156211853, "sampling/sampling_logp_difference/max": 0.5135459899902344, "sampling/sampling_logp_difference/mean": 0.017445165663957596, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 132.328125, "completions/mean_terminated_length": 132.328125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.3574693500995636, "epoch": 0.3486725663716814, "frac_reward_zero_std": 0.75, "grad_norm": 1.4380926813243147, "kl": 0.022844795137643814, "learning_rate": 9.836554260571577e-07, "loss": -0.0285, "num_tokens": 4530481.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3352618217468262, "sampling/importance_sampling_ratio/mean": 1.001255750656128, "sampling/importance_sampling_ratio/min": 0.7138298153877258, "sampling/sampling_logp_difference/max": 0.3371107578277588, "sampling/sampling_logp_difference/mean": 0.014518964104354382, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 120.125, "completions/mean_terminated_length": 120.125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.27148181200027466, "epoch": 0.3504424778761062, "frac_reward_zero_std": 1.0, "grad_norm": 0.025780608407040082, "kl": 0.011116293258965015, "learning_rate": 9.832614338134595e-07, "loss": 0.0001, "num_tokens": 4548233.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5538471937179565, "sampling/importance_sampling_ratio/mean": 0.9993295669555664, "sampling/importance_sampling_ratio/min": 0.6070836186408997, "sampling/sampling_logp_difference/max": 0.49908876419067383, "sampling/sampling_logp_difference/mean": 0.013770409859716892, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 187.140625, "completions/mean_terminated_length": 187.140625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.4195002615451813, "epoch": 0.35221238938053095, "frac_reward_zero_std": 1.0, "grad_norm": 0.0162650481550141, "kl": 0.016213098540902138, "learning_rate": 9.828628300971638e-07, "loss": 0.0002, "num_tokens": 4571138.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3236145973205566, "sampling/importance_sampling_ratio/mean": 1.0003173351287842, "sampling/importance_sampling_ratio/min": 0.6771544814109802, "sampling/sampling_logp_difference/max": 0.38985586166381836, "sampling/sampling_logp_difference/mean": 0.01656009815633297, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 217.59375, "completions/mean_terminated_length": 217.59375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.5768882036209106, "epoch": 0.35398230088495575, "frac_reward_zero_std": 1.0, "grad_norm": 0.03671326191022061, "kl": 0.0406668446958065, "learning_rate": 9.82459618711906e-07, "loss": 0.0004, "num_tokens": 4602744.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2956786155700684, "sampling/importance_sampling_ratio/mean": 0.9998922348022461, "sampling/importance_sampling_ratio/min": 0.7278984189033508, "sampling/sampling_logp_difference/max": 0.3175938129425049, "sampling/sampling_logp_difference/mean": 0.01754450425505638, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1157.0, "completions/max_terminated_length": 1157.0, "completions/mean_length": 278.609375, "completions/mean_terminated_length": 278.609375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.5030993819236755, "epoch": 0.35575221238938054, "frac_reward_zero_std": 1.0, "grad_norm": 0.034186990638514964, "kl": 0.044528283178806305, "learning_rate": 9.820518035052889e-07, "loss": 0.0004, "num_tokens": 4630511.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.298831582069397, "sampling/importance_sampling_ratio/mean": 1.0000646114349365, "sampling/importance_sampling_ratio/min": 0.6081597208976746, "sampling/sampling_logp_difference/max": 0.4973177909851074, "sampling/sampling_logp_difference/mean": 0.014819031581282616, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 253.03125, "completions/mean_terminated_length": 253.03125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.5252817869186401, "epoch": 0.35752212389380533, "frac_reward_zero_std": 1.0, "grad_norm": 0.029911273715772218, "kl": 0.033255547285079956, "learning_rate": 9.816393883688475e-07, "loss": 0.0003, "num_tokens": 4659553.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.310223937034607, "sampling/importance_sampling_ratio/mean": 1.000415325164795, "sampling/importance_sampling_ratio/min": 0.6752868294715881, "sampling/sampling_logp_difference/max": 0.39261770248413086, "sampling/sampling_logp_difference/mean": 0.016464397311210632, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 235.78125, "completions/mean_terminated_length": 235.78125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.43605995178222656, "epoch": 0.35929203539823007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0210552935025437, "kl": 0.02404474839568138, "learning_rate": 9.812223772380105e-07, "loss": 0.0003, "num_tokens": 4684563.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5271201133728027, "sampling/importance_sampling_ratio/mean": 0.9989410042762756, "sampling/importance_sampling_ratio/min": 0.6703864932060242, "sampling/sampling_logp_difference/max": 0.4233837127685547, "sampling/sampling_logp_difference/mean": 0.015572982840240002, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 222.09375, "completions/mean_terminated_length": 222.09375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.4209005832672119, "epoch": 0.36106194690265486, "frac_reward_zero_std": 1.0, "grad_norm": 0.02194375076571999, "kl": 0.01922685280442238, "learning_rate": 9.808007740920645e-07, "loss": 0.0002, "num_tokens": 4712121.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2821297645568848, "sampling/importance_sampling_ratio/mean": 0.9998530745506287, "sampling/importance_sampling_ratio/min": 0.6066128015518188, "sampling/sampling_logp_difference/max": 0.4998645782470703, "sampling/sampling_logp_difference/mean": 0.01416803803294897, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 213.390625, "completions/mean_terminated_length": 213.390625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.4472233057022095, "epoch": 0.36283185840707965, "frac_reward_zero_std": 1.0, "grad_norm": 0.01741477807687026, "kl": 0.01929214969277382, "learning_rate": 9.803745829541137e-07, "loss": 0.0002, "num_tokens": 4738722.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.630118489265442, "sampling/importance_sampling_ratio/mean": 1.0000805854797363, "sampling/importance_sampling_ratio/min": 0.6256342530250549, "sampling/sampling_logp_difference/max": 0.48865270614624023, "sampling/sampling_logp_difference/mean": 0.01494203507900238, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 175.390625, "completions/mean_terminated_length": 175.390625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.38506031036376953, "epoch": 0.36460176991150445, "frac_reward_zero_std": 1.0, "grad_norm": 0.01890302236673626, "kl": 0.017401982098817825, "learning_rate": 9.799438078910432e-07, "loss": 0.0002, "num_tokens": 4760347.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.333805799484253, "sampling/importance_sampling_ratio/mean": 0.9999526739120483, "sampling/importance_sampling_ratio/min": 0.6824237108230591, "sampling/sampling_logp_difference/max": 0.3821045160293579, "sampling/sampling_logp_difference/mean": 0.014700263738632202, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 150.484375, "completions/mean_terminated_length": 150.484375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.43252402544021606, "epoch": 0.3663716814159292, "frac_reward_zero_std": 0.75, "grad_norm": 1.097902939114096, "kl": 0.03045620024204254, "learning_rate": 9.7950845301348e-07, "loss": -0.004, "num_tokens": 4780074.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.310570478439331, "sampling/importance_sampling_ratio/mean": 1.0005415678024292, "sampling/importance_sampling_ratio/min": 0.6203567385673523, "sampling/sampling_logp_difference/max": 0.4774606227874756, "sampling/sampling_logp_difference/mean": 0.01583504118025303, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 169.109375, "completions/mean_terminated_length": 169.109375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.4405205249786377, "epoch": 0.368141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 1.1743411438430786, "kl": 0.02028767019510269, "learning_rate": 9.790685224757532e-07, "loss": -0.0356, "num_tokens": 4801233.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.442976951599121, "sampling/importance_sampling_ratio/mean": 1.0004386901855469, "sampling/importance_sampling_ratio/min": 0.7331960797309875, "sampling/sampling_logp_difference/max": 0.36670827865600586, "sampling/sampling_logp_difference/mean": 0.015310833230614662, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 117.25, "completions/mean_terminated_length": 117.25, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.27622413635253906, "epoch": 0.36991150442477877, "frac_reward_zero_std": 1.0, "grad_norm": 0.016372609160945743, "kl": 0.007270202971994877, "learning_rate": 9.786240204758552e-07, "loss": 0.0001, "num_tokens": 4817809.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6354501247406006, "sampling/importance_sampling_ratio/mean": 1.0009384155273438, "sampling/importance_sampling_ratio/min": 0.6567115187644958, "sampling/sampling_logp_difference/max": 0.49191808700561523, "sampling/sampling_logp_difference/mean": 0.015505598857998848, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 231.4375, "completions/mean_terminated_length": 231.4375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.543701171875, "epoch": 0.37168141592920356, "frac_reward_zero_std": 0.75, "grad_norm": 0.7267322612573032, "kl": 0.03129158169031143, "learning_rate": 9.781749512553998e-07, "loss": -0.0166, "num_tokens": 4844973.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.5301965475082397, "sampling/importance_sampling_ratio/mean": 0.9999191761016846, "sampling/importance_sampling_ratio/min": 0.6165185570716858, "sampling/sampling_logp_difference/max": 0.48366689682006836, "sampling/sampling_logp_difference/mean": 0.016235269606113434, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 229.25, "completions/mean_terminated_length": 229.25, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.5884172916412354, "epoch": 0.3734513274336283, "frac_reward_zero_std": 0.5, "grad_norm": 1.061016962198772, "kl": 0.029968272894620895, "learning_rate": 9.777213190995847e-07, "loss": -0.0025, "num_tokens": 4871629.0, "reward": 0.34375, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.3039125204086304, "sampling/importance_sampling_ratio/mean": 1.0002131462097168, "sampling/importance_sampling_ratio/min": 0.6482402682304382, "sampling/sampling_logp_difference/max": 0.43349385261535645, "sampling/sampling_logp_difference/mean": 0.01739707589149475, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 180.640625, "completions/mean_terminated_length": 180.640625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.4253998398780823, "epoch": 0.3752212389380531, "frac_reward_zero_std": 0.75, "grad_norm": 1.052056436755184, "kl": 0.020112060010433197, "learning_rate": 9.77263128337148e-07, "loss": -0.0292, "num_tokens": 4895190.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.4048014879226685, "sampling/importance_sampling_ratio/mean": 0.9998510479927063, "sampling/importance_sampling_ratio/min": 0.6613737344741821, "sampling/sampling_logp_difference/max": 0.4134361743927002, "sampling/sampling_logp_difference/mean": 0.014138556085526943, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 201.40625, "completions/mean_terminated_length": 201.40625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.5402541756629944, "epoch": 0.3769911504424779, "frac_reward_zero_std": 0.75, "grad_norm": 0.8347692127920789, "kl": 0.029757630079984665, "learning_rate": 9.768003833403276e-07, "loss": -0.0094, "num_tokens": 4920736.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6207294464111328, "sampling/importance_sampling_ratio/mean": 0.9997462630271912, "sampling/importance_sampling_ratio/min": 0.7541331052780151, "sampling/sampling_logp_difference/max": 0.4828763008117676, "sampling/sampling_logp_difference/mean": 0.016633976250886917, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 106.765625, "completions/mean_terminated_length": 106.765625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.2815261483192444, "epoch": 0.3787610619469027, "frac_reward_zero_std": 1.0, "grad_norm": 0.024846509969903882, "kl": 0.010371927171945572, "learning_rate": 9.763330885248204e-07, "loss": 0.0001, "num_tokens": 4937425.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6145673990249634, "sampling/importance_sampling_ratio/mean": 1.0003533363342285, "sampling/importance_sampling_ratio/min": 0.6623780131340027, "sampling/sampling_logp_difference/max": 0.4790670871734619, "sampling/sampling_logp_difference/mean": 0.0145414462313056, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 120.28125, "completions/mean_terminated_length": 120.28125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.3886878192424774, "epoch": 0.3805309734513274, "frac_reward_zero_std": 1.0, "grad_norm": 0.023396212835991338, "kl": 0.016547974199056625, "learning_rate": 9.758612483497394e-07, "loss": 0.0002, "num_tokens": 4955539.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3992942571640015, "sampling/importance_sampling_ratio/mean": 0.9997751116752625, "sampling/importance_sampling_ratio/min": 0.5779934525489807, "sampling/sampling_logp_difference/max": 0.5481927394866943, "sampling/sampling_logp_difference/mean": 0.015263278037309647, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 162.65625, "completions/mean_terminated_length": 162.65625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.44779172539711, "epoch": 0.3823008849557522, "frac_reward_zero_std": 0.75, "grad_norm": 1.0702160813909476, "kl": 0.020305894315242767, "learning_rate": 9.753848673175707e-07, "loss": -0.018, "num_tokens": 4977373.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.3271540403366089, "sampling/importance_sampling_ratio/mean": 0.9998618960380554, "sampling/importance_sampling_ratio/min": 0.7632419466972351, "sampling/sampling_logp_difference/max": 0.28303682804107666, "sampling/sampling_logp_difference/mean": 0.015994010493159294, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 153.21875, "completions/mean_terminated_length": 153.21875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.30035877227783203, "epoch": 0.384070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.018376748811207013, "kl": 0.009180797263979912, "learning_rate": 9.74903949974131e-07, "loss": 0.0001, "num_tokens": 4997563.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3879265785217285, "sampling/importance_sampling_ratio/mean": 0.9997693300247192, "sampling/importance_sampling_ratio/min": 0.6757646203041077, "sampling/sampling_logp_difference/max": 0.3919105529785156, "sampling/sampling_logp_difference/mean": 0.01399917807430029, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 205.421875, "completions/mean_terminated_length": 205.421875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.5094939470291138, "epoch": 0.3858407079646018, "frac_reward_zero_std": 0.5, "grad_norm": 1.16903239345118, "kl": 0.031334519386291504, "learning_rate": 9.744185009085256e-07, "loss": -0.019, "num_tokens": 5020982.0, "reward": 0.40625, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.4352703094482422, "sampling/importance_sampling_ratio/mean": 0.9993390440940857, "sampling/importance_sampling_ratio/min": 0.61335289478302, "sampling/sampling_logp_difference/max": 0.4888148307800293, "sampling/sampling_logp_difference/mean": 0.016465526074171066, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 170.9375, "completions/mean_terminated_length": 170.9375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.45526376366615295, "epoch": 0.38761061946902653, "frac_reward_zero_std": 0.75, "grad_norm": 0.8736043720727277, "kl": 0.016055453568696976, "learning_rate": 9.739285247531017e-07, "loss": -0.0153, "num_tokens": 5042866.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4764906167984009, "sampling/importance_sampling_ratio/mean": 1.0001682043075562, "sampling/importance_sampling_ratio/min": 0.6263575553894043, "sampling/sampling_logp_difference/max": 0.4678339958190918, "sampling/sampling_logp_difference/mean": 0.017147481441497803, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 209.84375, "completions/mean_terminated_length": 209.84375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.6200296878814697, "epoch": 0.3893805309734513, "frac_reward_zero_std": 0.25, "grad_norm": 1.371812364956784, "kl": 0.030370794236660004, "learning_rate": 9.734340261834066e-07, "loss": 0.0509, "num_tokens": 5067256.0, "reward": 0.125, "reward_std": 0.6494960784912109, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.2872438430786133, "sampling/importance_sampling_ratio/mean": 0.9999788403511047, "sampling/importance_sampling_ratio/min": 0.7144313454627991, "sampling/sampling_logp_difference/max": 0.33626842498779297, "sampling/sampling_logp_difference/mean": 0.017782989889383316, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 248.296875, "completions/mean_terminated_length": 248.296875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.5655719041824341, "epoch": 0.3911504424778761, "frac_reward_zero_std": 0.5, "grad_norm": 1.0377346363036783, "kl": 0.021286573261022568, "learning_rate": 9.729350099181419e-07, "loss": 0.023, "num_tokens": 5094875.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.455339789390564, "sampling/importance_sampling_ratio/mean": 1.0000135898590088, "sampling/importance_sampling_ratio/min": 0.6379542946815491, "sampling/sampling_logp_difference/max": 0.44948863983154297, "sampling/sampling_logp_difference/mean": 0.017688971012830734, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 189.484375, "completions/mean_terminated_length": 189.484375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.39400508999824524, "epoch": 0.3929203539823009, "frac_reward_zero_std": 1.0, "grad_norm": 0.02233395690026934, "kl": 0.014203056693077087, "learning_rate": 9.724314807191196e-07, "loss": 0.0002, "num_tokens": 5118106.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7028963565826416, "sampling/importance_sampling_ratio/mean": 1.0008316040039062, "sampling/importance_sampling_ratio/min": 0.7066613435745239, "sampling/sampling_logp_difference/max": 0.5323305130004883, "sampling/sampling_logp_difference/mean": 0.01546834409236908, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 225.890625, "completions/mean_terminated_length": 225.890625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.6610978841781616, "epoch": 0.39469026548672564, "frac_reward_zero_std": 0.5, "grad_norm": 1.0932215259143516, "kl": 0.02542734146118164, "learning_rate": 9.719234433912146e-07, "loss": 0.0067, "num_tokens": 5145075.0, "reward": 0.5625, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.2990854978561401, "sampling/importance_sampling_ratio/mean": 0.9992256164550781, "sampling/importance_sampling_ratio/min": 0.7171126008033752, "sampling/sampling_logp_difference/max": 0.3325223922729492, "sampling/sampling_logp_difference/mean": 0.0187995582818985, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 176.578125, "completions/mean_terminated_length": 176.578125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.510978102684021, "epoch": 0.39646017699115044, "frac_reward_zero_std": 0.5, "grad_norm": 1.381758013357901, "kl": 0.021535923704504967, "learning_rate": 9.714109027823216e-07, "loss": 0.011, "num_tokens": 5167944.0, "reward": 0.15625, "reward_std": 0.42695626616477966, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.444840669631958, "sampling/importance_sampling_ratio/mean": 1.0002617835998535, "sampling/importance_sampling_ratio/min": 0.6666793823242188, "sampling/sampling_logp_difference/max": 0.40544605255126953, "sampling/sampling_logp_difference/mean": 0.016433410346508026, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 153.6875, "completions/mean_terminated_length": 153.6875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.4001522362232208, "epoch": 0.39823008849557523, "frac_reward_zero_std": 0.75, "grad_norm": 1.0646633808271555, "kl": 0.01506091095507145, "learning_rate": 9.708938637833064e-07, "loss": -0.0094, "num_tokens": 5187924.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5525848865509033, "sampling/importance_sampling_ratio/mean": 1.0004353523254395, "sampling/importance_sampling_ratio/min": 0.667425274848938, "sampling/sampling_logp_difference/max": 0.43992114067077637, "sampling/sampling_logp_difference/mean": 0.01390500832349062, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 206.1875, "completions/mean_terminated_length": 206.1875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.49817371368408203, "epoch": 0.4, "frac_reward_zero_std": 1.0, "grad_norm": 0.028349881888547018, "kl": 0.01913132332265377, "learning_rate": 9.703723313279605e-07, "loss": 0.0002, "num_tokens": 5211088.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2966861724853516, "sampling/importance_sampling_ratio/mean": 1.000023365020752, "sampling/importance_sampling_ratio/min": 0.6807541847229004, "sampling/sampling_logp_difference/max": 0.38455402851104736, "sampling/sampling_logp_difference/mean": 0.01603330299258232, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 259.0, "completions/mean_terminated_length": 259.0, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.6391530632972717, "epoch": 0.40176991150442476, "frac_reward_zero_std": 0.5, "grad_norm": 0.9967957194934567, "kl": 0.02740185335278511, "learning_rate": 9.698463103929541e-07, "loss": 0.0385, "num_tokens": 5238432.0, "reward": 0.625, "reward_std": 0.481805682182312, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.3534291982650757, "sampling/importance_sampling_ratio/mean": 0.9993962049484253, "sampling/importance_sampling_ratio/min": 0.6330663561820984, "sampling/sampling_logp_difference/max": 0.4571800231933594, "sampling/sampling_logp_difference/mean": 0.01765705645084381, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 282.03125, "completions/mean_terminated_length": 282.03125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.4961310625076294, "epoch": 0.40353982300884955, "frac_reward_zero_std": 0.5, "grad_norm": 1.0505587189342032, "kl": 0.019192662090063095, "learning_rate": 9.693158059977877e-07, "loss": 0.002, "num_tokens": 5267970.0, "reward": 0.40625, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.8243242502212524, "sampling/importance_sampling_ratio/mean": 1.0002517700195312, "sampling/importance_sampling_ratio/min": 0.7685901522636414, "sampling/sampling_logp_difference/max": 0.6012096405029297, "sampling/sampling_logp_difference/mean": 0.014218084514141083, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 108.84375, "completions/mean_terminated_length": 108.84375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.2912164628505707, "epoch": 0.40530973451327434, "frac_reward_zero_std": 1.0, "grad_norm": 0.03102505553320011, "kl": 0.010266855359077454, "learning_rate": 9.68780823204745e-07, "loss": 0.0001, "num_tokens": 5284424.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4392528533935547, "sampling/importance_sampling_ratio/mean": 0.9990299940109253, "sampling/importance_sampling_ratio/min": 0.6255216598510742, "sampling/sampling_logp_difference/max": 0.46916937828063965, "sampling/sampling_logp_difference/mean": 0.013452749699354172, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 176.546875, "completions/mean_terminated_length": 176.546875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.37197035551071167, "epoch": 0.40707964601769914, "frac_reward_zero_std": 1.0, "grad_norm": 0.01896041032859841, "kl": 0.011751978658139706, "learning_rate": 9.682413671188444e-07, "loss": 0.0001, "num_tokens": 5305819.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4121108055114746, "sampling/importance_sampling_ratio/mean": 1.0002362728118896, "sampling/importance_sampling_ratio/min": 0.7776780724525452, "sampling/sampling_logp_difference/max": 0.34508562088012695, "sampling/sampling_logp_difference/mean": 0.01318573672324419, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 269.203125, "completions/mean_terminated_length": 269.203125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.5372804403305054, "epoch": 0.4088495575221239, "frac_reward_zero_std": 0.5, "grad_norm": 1.132039308346563, "kl": 0.016992956399917603, "learning_rate": 9.6769744288779e-07, "loss": 0.0479, "num_tokens": 5334968.0, "reward": 0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.5277607440948486, "sampling/importance_sampling_ratio/mean": 0.9999881982803345, "sampling/importance_sampling_ratio/min": 0.6976944804191589, "sampling/sampling_logp_difference/max": 0.42380309104919434, "sampling/sampling_logp_difference/mean": 0.015516486018896103, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 209.28125, "completions/mean_terminated_length": 209.28125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.5396798253059387, "epoch": 0.41061946902654867, "frac_reward_zero_std": 0.5, "grad_norm": 1.209918698640942, "kl": 0.026270480826497078, "learning_rate": 9.671490557019233e-07, "loss": -0.0009, "num_tokens": 5360170.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.2983312606811523, "sampling/importance_sampling_ratio/mean": 1.000602126121521, "sampling/importance_sampling_ratio/min": 0.7697843909263611, "sampling/sampling_logp_difference/max": 0.2616448402404785, "sampling/sampling_logp_difference/mean": 0.016278911381959915, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 182.0625, "completions/mean_terminated_length": 182.0625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.41270995140075684, "epoch": 0.41238938053097346, "frac_reward_zero_std": 1.0, "grad_norm": 0.025328679594359024, "kl": 0.016738593578338623, "learning_rate": 9.665962107941724e-07, "loss": 0.0002, "num_tokens": 5381534.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6272636651992798, "sampling/importance_sampling_ratio/mean": 0.9995510578155518, "sampling/importance_sampling_ratio/min": 0.6961075067520142, "sampling/sampling_logp_difference/max": 0.48689985275268555, "sampling/sampling_logp_difference/mean": 0.013693327084183693, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 243.28125, "completions/mean_terminated_length": 243.28125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.5390169620513916, "epoch": 0.41415929203539825, "frac_reward_zero_std": 0.75, "grad_norm": 0.7296044095280566, "kl": 0.025505583733320236, "learning_rate": 9.660389134400033e-07, "loss": 0.0166, "num_tokens": 5408832.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3929054737091064, "sampling/importance_sampling_ratio/mean": 1.0000765323638916, "sampling/importance_sampling_ratio/min": 0.7323424816131592, "sampling/sampling_logp_difference/max": 0.3313918113708496, "sampling/sampling_logp_difference/mean": 0.015481807291507721, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 180.078125, "completions/mean_terminated_length": 180.078125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.4292844533920288, "epoch": 0.415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 0.9506863313827874, "kl": 0.022041702643036842, "learning_rate": 9.654771689573684e-07, "loss": 0.0058, "num_tokens": 5431029.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.275795340538025, "sampling/importance_sampling_ratio/mean": 0.9993537664413452, "sampling/importance_sampling_ratio/min": 0.695496678352356, "sampling/sampling_logp_difference/max": 0.36312901973724365, "sampling/sampling_logp_difference/mean": 0.013931536115705967, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 155.3125, "completions/mean_terminated_length": 155.3125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.3744187355041504, "epoch": 0.4176991150442478, "frac_reward_zero_std": 0.75, "grad_norm": 1.0355842923875715, "kl": 0.016530301421880722, "learning_rate": 9.64910982706657e-07, "loss": 0.0521, "num_tokens": 5452345.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.53571355342865, "sampling/importance_sampling_ratio/mean": 1.000366449356079, "sampling/importance_sampling_ratio/min": 0.5138673186302185, "sampling/sampling_logp_difference/max": 0.6657902002334595, "sampling/sampling_logp_difference/mean": 0.01374361664056778, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 155.4375, "completions/mean_terminated_length": 155.4375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.5032491087913513, "epoch": 0.4194690265486726, "frac_reward_zero_std": 0.75, "grad_norm": 1.0318982236311647, "kl": 0.025443322956562042, "learning_rate": 9.643403600906432e-07, "loss": -0.0148, "num_tokens": 5471269.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.303486704826355, "sampling/importance_sampling_ratio/mean": 1.0002458095550537, "sampling/importance_sampling_ratio/min": 0.6822929978370667, "sampling/sampling_logp_difference/max": 0.382296085357666, "sampling/sampling_logp_difference/mean": 0.01743306592106819, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 126.453125, "completions/mean_terminated_length": 126.453125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.4431364834308624, "epoch": 0.42123893805309737, "frac_reward_zero_std": 0.75, "grad_norm": 1.3168846370828566, "kl": 0.028479289263486862, "learning_rate": 9.637653065544349e-07, "loss": 0.0066, "num_tokens": 5490434.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.4940837621688843, "sampling/importance_sampling_ratio/mean": 1.00003981590271, "sampling/importance_sampling_ratio/min": 0.7341160178184509, "sampling/sampling_logp_difference/max": 0.40151309967041016, "sampling/sampling_logp_difference/mean": 0.016477826982736588, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 174.71875, "completions/mean_terminated_length": 174.71875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.48424744606018066, "epoch": 0.4230088495575221, "frac_reward_zero_std": 0.75, "grad_norm": 1.0173106400671965, "kl": 0.022911589592695236, "learning_rate": 9.63185827585421e-07, "loss": 0.005, "num_tokens": 5511840.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.3995966911315918, "sampling/importance_sampling_ratio/mean": 1.000849723815918, "sampling/importance_sampling_ratio/min": 0.7160124182701111, "sampling/sampling_logp_difference/max": 0.33618414402008057, "sampling/sampling_logp_difference/mean": 0.015440179035067558, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 206.125, "completions/mean_terminated_length": 206.125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.6587193608283997, "epoch": 0.4247787610619469, "frac_reward_zero_std": 0.5, "grad_norm": 1.191138425928516, "kl": 0.03645382449030876, "learning_rate": 9.6260192871322e-07, "loss": -0.0358, "num_tokens": 5537272.0, "reward": 0.5625, "reward_std": 0.5081988573074341, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.2708524465560913, "sampling/importance_sampling_ratio/mean": 0.9996212124824524, "sampling/importance_sampling_ratio/min": 0.645904004573822, "sampling/sampling_logp_difference/max": 0.4371044635772705, "sampling/sampling_logp_difference/mean": 0.018731512129306793, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 229.765625, "completions/mean_terminated_length": 229.765625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.7126834392547607, "epoch": 0.4265486725663717, "frac_reward_zero_std": 0.25, "grad_norm": 1.2622471061996008, "kl": 0.043451737612485886, "learning_rate": 9.620136155096275e-07, "loss": -0.0345, "num_tokens": 5562217.0, "reward": -0.03125, "reward_std": 0.6683381795883179, "rewards/decision_reward_func/mean": -0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.3758001327514648, "sampling/importance_sampling_ratio/mean": 1.0001277923583984, "sampling/importance_sampling_ratio/min": 0.7748174071311951, "sampling/sampling_logp_difference/max": 0.31903553009033203, "sampling/sampling_logp_difference/mean": 0.01953180506825447, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 206.3125, "completions/mean_terminated_length": 206.3125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.5569666624069214, "epoch": 0.4283185840707965, "frac_reward_zero_std": 0.5, "grad_norm": 1.2751258361314115, "kl": 0.035392675548791885, "learning_rate": 9.614208935885614e-07, "loss": -0.0336, "num_tokens": 5589597.0, "reward": 0.34375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.5744760036468506, "sampling/importance_sampling_ratio/mean": 1.0000823736190796, "sampling/importance_sampling_ratio/min": 0.7300400137901306, "sampling/sampling_logp_difference/max": 0.4539225101470947, "sampling/sampling_logp_difference/mean": 0.016382835805416107, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 120.4375, "completions/mean_terminated_length": 120.4375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.4012613892555237, "epoch": 0.4300884955752212, "frac_reward_zero_std": 1.0, "grad_norm": 0.0384114085462503, "kl": 0.0251857191324234, "learning_rate": 9.608237686060097e-07, "loss": 0.0003, "num_tokens": 5608681.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3672583103179932, "sampling/importance_sampling_ratio/mean": 0.9998194575309753, "sampling/importance_sampling_ratio/min": 0.6546944975852966, "sampling/sampling_logp_difference/max": 0.4235866069793701, "sampling/sampling_logp_difference/mean": 0.01646578684449196, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 175.6875, "completions/mean_terminated_length": 175.6875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.39096441864967346, "epoch": 0.431858407079646, "frac_reward_zero_std": 1.0, "grad_norm": 0.026902015466041054, "kl": 0.022231586277484894, "learning_rate": 9.602222462599766e-07, "loss": 0.0002, "num_tokens": 5634789.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4702367782592773, "sampling/importance_sampling_ratio/mean": 0.9999848008155823, "sampling/importance_sampling_ratio/min": 0.6614747047424316, "sampling/sampling_logp_difference/max": 0.4132835865020752, "sampling/sampling_logp_difference/mean": 0.015026605688035488, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 134.453125, "completions/mean_terminated_length": 134.453125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.30047836899757385, "epoch": 0.4336283185840708, "frac_reward_zero_std": 1.0, "grad_norm": 0.03992031362694212, "kl": 0.017428256571292877, "learning_rate": 9.596163322904269e-07, "loss": 0.0002, "num_tokens": 5652482.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3753678798675537, "sampling/importance_sampling_ratio/mean": 0.999406099319458, "sampling/importance_sampling_ratio/min": 0.6152034997940063, "sampling/sampling_logp_difference/max": 0.48580217361450195, "sampling/sampling_logp_difference/mean": 0.013884510844945908, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 193.453125, "completions/mean_terminated_length": 193.453125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.4609413743019104, "epoch": 0.4353982300884956, "frac_reward_zero_std": 0.75, "grad_norm": 1.0185164908305393, "kl": 0.02759983018040657, "learning_rate": 9.590060324792325e-07, "loss": 0.0043, "num_tokens": 5675487.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4057984352111816, "sampling/importance_sampling_ratio/mean": 1.0006136894226074, "sampling/importance_sampling_ratio/min": 0.6176036596298218, "sampling/sampling_logp_difference/max": 0.48190832138061523, "sampling/sampling_logp_difference/mean": 0.016024740412831306, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 162.203125, "completions/mean_terminated_length": 162.203125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.4550291895866394, "epoch": 0.43716814159292033, "frac_reward_zero_std": 0.75, "grad_norm": 1.167626541533283, "kl": 0.028106512501835823, "learning_rate": 9.58391352650117e-07, "loss": -0.0043, "num_tokens": 5695932.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.2885998487472534, "sampling/importance_sampling_ratio/mean": 1.0006885528564453, "sampling/importance_sampling_ratio/min": 0.6220961213111877, "sampling/sampling_logp_difference/max": 0.47466063499450684, "sampling/sampling_logp_difference/mean": 0.016126839444041252, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 168.71875, "completions/mean_terminated_length": 168.71875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.3887510299682617, "epoch": 0.4389380530973451, "frac_reward_zero_std": 1.0, "grad_norm": 0.03398823727925125, "kl": 0.026438862085342407, "learning_rate": 9.57772298668599e-07, "loss": 0.0003, "num_tokens": 5717514.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.409028172492981, "sampling/importance_sampling_ratio/mean": 1.0000361204147339, "sampling/importance_sampling_ratio/min": 0.6638216972351074, "sampling/sampling_logp_difference/max": 0.4097416400909424, "sampling/sampling_logp_difference/mean": 0.01466484647244215, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 258.90625, "completions/mean_terminated_length": 258.90625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.4102955162525177, "epoch": 0.4407079646017699, "frac_reward_zero_std": 0.75, "grad_norm": 0.8652263137420526, "kl": 0.026828434318304062, "learning_rate": 9.57148876441938e-07, "loss": -0.002, "num_tokens": 5745332.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6272636651992798, "sampling/importance_sampling_ratio/mean": 0.9999905824661255, "sampling/importance_sampling_ratio/min": 0.6772826910018921, "sampling/sampling_logp_difference/max": 0.48689985275268555, "sampling/sampling_logp_difference/mean": 0.014956308528780937, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 189.125, "completions/mean_terminated_length": 189.125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.46813422441482544, "epoch": 0.4424778761061947, "frac_reward_zero_std": 1.0, "grad_norm": 0.07185935544295184, "kl": 0.04196876287460327, "learning_rate": 9.565210919190763e-07, "loss": 0.0004, "num_tokens": 5773644.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3978229761123657, "sampling/importance_sampling_ratio/mean": 0.9999117255210876, "sampling/importance_sampling_ratio/min": 0.550255537033081, "sampling/sampling_logp_difference/max": 0.5973725318908691, "sampling/sampling_logp_difference/mean": 0.016827870160341263, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 113.625, "completions/mean_terminated_length": 113.625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.3235929012298584, "epoch": 0.44424778761061945, "frac_reward_zero_std": 1.0, "grad_norm": 0.06097395303517226, "kl": 0.023939600214362144, "learning_rate": 9.558889510905835e-07, "loss": 0.0003, "num_tokens": 5794180.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6908003091812134, "sampling/importance_sampling_ratio/mean": 1.0010063648223877, "sampling/importance_sampling_ratio/min": 0.6060614585876465, "sampling/sampling_logp_difference/max": 0.5252020359039307, "sampling/sampling_logp_difference/mean": 0.01488159503787756, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 123.203125, "completions/mean_terminated_length": 123.203125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.214959979057312, "epoch": 0.44601769911504424, "frac_reward_zero_std": 1.0, "grad_norm": 0.04782722132271782, "kl": 0.01646348088979721, "learning_rate": 9.55252459988598e-07, "loss": 0.0002, "num_tokens": 5813233.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5760674476623535, "sampling/importance_sampling_ratio/mean": 0.9988939166069031, "sampling/importance_sampling_ratio/min": 0.5484436750411987, "sampling/sampling_logp_difference/max": 0.6006706953048706, "sampling/sampling_logp_difference/mean": 0.012746063992381096, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 164.203125, "completions/mean_terminated_length": 164.203125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.32349807024002075, "epoch": 0.44778761061946903, "frac_reward_zero_std": 1.0, "grad_norm": 0.04171575141316936, "kl": 0.028249233961105347, "learning_rate": 9.546116246867713e-07, "loss": 0.0003, "num_tokens": 5834734.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5744246244430542, "sampling/importance_sampling_ratio/mean": 0.9997957944869995, "sampling/importance_sampling_ratio/min": 0.6231405735015869, "sampling/sampling_logp_difference/max": 0.47298312187194824, "sampling/sampling_logp_difference/mean": 0.013606461696326733, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 137.8125, "completions/mean_terminated_length": 137.8125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.32213008403778076, "epoch": 0.4495575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 1.004911966993259, "kl": 0.03752923011779785, "learning_rate": 9.539664513002084e-07, "loss": -0.0278, "num_tokens": 5854130.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3999723196029663, "sampling/importance_sampling_ratio/mean": 0.9996242523193359, "sampling/importance_sampling_ratio/min": 0.6813591122627258, "sampling/sampling_logp_difference/max": 0.3836658000946045, "sampling/sampling_logp_difference/mean": 0.014559498056769371, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 165.109375, "completions/mean_terminated_length": 165.109375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.3359453082084656, "epoch": 0.45132743362831856, "frac_reward_zero_std": 0.75, "grad_norm": 1.3108660255354827, "kl": 0.031047837808728218, "learning_rate": 9.533169459854098e-07, "loss": -0.0249, "num_tokens": 5874457.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6089236736297607, "sampling/importance_sampling_ratio/mean": 0.9999319314956665, "sampling/importance_sampling_ratio/min": 0.6133982539176941, "sampling/sampling_logp_difference/max": 0.4887409210205078, "sampling/sampling_logp_difference/mean": 0.014974427409470081, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 159.296875, "completions/mean_terminated_length": 159.296875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.29909294843673706, "epoch": 0.45309734513274336, "frac_reward_zero_std": 1.0, "grad_norm": 0.04973117091467245, "kl": 0.029443148523569107, "learning_rate": 9.526631149402134e-07, "loss": 0.0003, "num_tokens": 5894844.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4723871946334839, "sampling/importance_sampling_ratio/mean": 1.0001782178878784, "sampling/importance_sampling_ratio/min": 0.6331398487091064, "sampling/sampling_logp_difference/max": 0.4570639133453369, "sampling/sampling_logp_difference/mean": 0.01476360484957695, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 180.1875, "completions/mean_terminated_length": 180.1875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.33986639976501465, "epoch": 0.45486725663716815, "frac_reward_zero_std": 1.0, "grad_norm": 0.05368425239332466, "kl": 0.032071635127067566, "learning_rate": 9.520049644037347e-07, "loss": 0.0003, "num_tokens": 5916376.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5971448421478271, "sampling/importance_sampling_ratio/mean": 0.9999678134918213, "sampling/importance_sampling_ratio/min": 0.641612708568573, "sampling/sampling_logp_difference/max": 0.4682176113128662, "sampling/sampling_logp_difference/mean": 0.014039294794201851, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 150.65625, "completions/mean_terminated_length": 150.65625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.33248865604400635, "epoch": 0.45663716814159294, "frac_reward_zero_std": 1.0, "grad_norm": 0.05925151073793785, "kl": 0.031930696219205856, "learning_rate": 9.513425006563078e-07, "loss": 0.0004, "num_tokens": 5936962.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.7888113260269165, "sampling/importance_sampling_ratio/mean": 0.9999220371246338, "sampling/importance_sampling_ratio/min": 0.652288019657135, "sampling/sampling_logp_difference/max": 0.5815513134002686, "sampling/sampling_logp_difference/mean": 0.01445157453417778, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 160.984375, "completions/mean_terminated_length": 160.984375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.33986949920654297, "epoch": 0.4584070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.08418805660577101, "kl": 0.04477149248123169, "learning_rate": 9.506757300194248e-07, "loss": 0.0005, "num_tokens": 5970113.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4855422973632812, "sampling/importance_sampling_ratio/mean": 1.0002464056015015, "sampling/importance_sampling_ratio/min": 0.547514796257019, "sampling/sampling_logp_difference/max": 0.6023657917976379, "sampling/sampling_logp_difference/mean": 0.014705209992825985, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 192.90625, "completions/mean_terminated_length": 192.90625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.3955267071723938, "epoch": 0.46017699115044247, "frac_reward_zero_std": 1.0, "grad_norm": 0.053332440143998176, "kl": 0.039812974631786346, "learning_rate": 9.500046588556761e-07, "loss": 0.0004, "num_tokens": 5993611.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4954570531845093, "sampling/importance_sampling_ratio/mean": 0.9997931122779846, "sampling/importance_sampling_ratio/min": 0.7331216931343079, "sampling/sampling_logp_difference/max": 0.402431845664978, "sampling/sampling_logp_difference/mean": 0.015132974833250046, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 167.609375, "completions/mean_terminated_length": 167.609375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.3669849634170532, "epoch": 0.46194690265486726, "frac_reward_zero_std": 1.0, "grad_norm": 0.0387119018057199, "kl": 0.029329845681786537, "learning_rate": 9.493292935686894e-07, "loss": 0.0003, "num_tokens": 6014706.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6007599830627441, "sampling/importance_sampling_ratio/mean": 1.000851035118103, "sampling/importance_sampling_ratio/min": 0.6125900745391846, "sampling/sampling_logp_difference/max": 0.49005937576293945, "sampling/sampling_logp_difference/mean": 0.015224466100335121, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1115.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 231.703125, "completions/mean_terminated_length": 231.703125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.31691044569015503, "epoch": 0.46371681415929206, "frac_reward_zero_std": 1.0, "grad_norm": 0.04087936515212419, "kl": 0.02219213917851448, "learning_rate": 9.486496406030685e-07, "loss": 0.0003, "num_tokens": 6041631.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.333465576171875, "sampling/importance_sampling_ratio/mean": 1.0001709461212158, "sampling/importance_sampling_ratio/min": 0.6453405618667603, "sampling/sampling_logp_difference/max": 0.4379770755767822, "sampling/sampling_logp_difference/mean": 0.013479698449373245, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 200.1875, "completions/mean_terminated_length": 200.1875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "entropy": 0.45341458916664124, "epoch": 0.4654867256637168, "frac_reward_zero_std": 1.0, "grad_norm": 0.03696931697805135, "kl": 0.041830264031887054, "learning_rate": 9.479657064443321e-07, "loss": 0.0004, "num_tokens": 6066843.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3260716199874878, "sampling/importance_sampling_ratio/mean": 0.9999263286590576, "sampling/importance_sampling_ratio/min": 0.684958815574646, "sampling/sampling_logp_difference/max": 0.37839651107788086, "sampling/sampling_logp_difference/mean": 0.016536952927708626, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 253.03125, "completions/mean_terminated_length": 253.03125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "entropy": 0.34183597564697266, "epoch": 0.4672566371681416, "frac_reward_zero_std": 1.0, "grad_norm": 0.0236482152955727, "kl": 0.02626795694231987, "learning_rate": 9.472774976188513e-07, "loss": 0.0003, "num_tokens": 6094653.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3353275060653687, "sampling/importance_sampling_ratio/mean": 1.0005013942718506, "sampling/importance_sampling_ratio/min": 0.7414844036102295, "sampling/sampling_logp_difference/max": 0.2991011142730713, "sampling/sampling_logp_difference/mean": 0.012045430950820446, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 192.5625, "completions/mean_terminated_length": 192.5625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.27034690976142883, "epoch": 0.4690265486725664, "frac_reward_zero_std": 1.0, "grad_norm": 0.03322347176542136, "kl": 0.02082662284374237, "learning_rate": 9.465850206937887e-07, "loss": 0.0002, "num_tokens": 6117521.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3415220975875854, "sampling/importance_sampling_ratio/mean": 0.9999597072601318, "sampling/importance_sampling_ratio/min": 0.6079509258270264, "sampling/sampling_logp_difference/max": 0.49766111373901367, "sampling/sampling_logp_difference/mean": 0.013065982609987259, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 136.28125, "completions/mean_terminated_length": 136.28125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.20872843265533447, "epoch": 0.47079646017699117, "frac_reward_zero_std": 1.0, "grad_norm": 0.042854545735950086, "kl": 0.01808926649391651, "learning_rate": 9.45888282277034e-07, "loss": 0.0002, "num_tokens": 6136003.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3818767070770264, "sampling/importance_sampling_ratio/mean": 0.9991451501846313, "sampling/importance_sampling_ratio/min": 0.3858293294906616, "sampling/sampling_logp_difference/max": 0.9523601531982422, "sampling/sampling_logp_difference/mean": 0.012654677964746952, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 231.828125, "completions/mean_terminated_length": 231.828125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.3623976707458496, "epoch": 0.4725663716814159, "frac_reward_zero_std": 1.0, "grad_norm": 0.03701270794177717, "kl": 0.03097117319703102, "learning_rate": 9.451872890171419e-07, "loss": 0.0003, "num_tokens": 6161704.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4505499601364136, "sampling/importance_sampling_ratio/mean": 1.0002344846725464, "sampling/importance_sampling_ratio/min": 0.7210965752601624, "sampling/sampling_logp_difference/max": 0.37194275856018066, "sampling/sampling_logp_difference/mean": 0.013468829914927483, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 296.28125, "completions/mean_terminated_length": 296.28125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.5455514192581177, "epoch": 0.4743362831858407, "frac_reward_zero_std": 0.75, "grad_norm": 0.659294837514523, "kl": 0.041745543479919434, "learning_rate": 9.444820476032685e-07, "loss": -0.0268, "num_tokens": 6194010.0, "reward": -0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.4530071020126343, "sampling/importance_sampling_ratio/mean": 1.0003100633621216, "sampling/importance_sampling_ratio/min": 0.6299753785133362, "sampling/sampling_logp_difference/max": 0.46207451820373535, "sampling/sampling_logp_difference/mean": 0.016169343143701553, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 178.109375, "completions/mean_terminated_length": 178.109375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.32269760966300964, "epoch": 0.4761061946902655, "frac_reward_zero_std": 1.0, "grad_norm": 0.035257563406750556, "kl": 0.01920374296605587, "learning_rate": 9.437725647651078e-07, "loss": 0.0002, "num_tokens": 6218465.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6268205642700195, "sampling/importance_sampling_ratio/mean": 1.0004115104675293, "sampling/importance_sampling_ratio/min": 0.6369473934173584, "sampling/sampling_logp_difference/max": 0.48662757873535156, "sampling/sampling_logp_difference/mean": 0.015678048133850098, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 190.265625, "completions/mean_terminated_length": 190.265625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.37728631496429443, "epoch": 0.4778761061946903, "frac_reward_zero_std": 1.0, "grad_norm": 0.06004114459586334, "kl": 0.03412202000617981, "learning_rate": 9.430588472728269e-07, "loss": 0.0004, "num_tokens": 6239938.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4148181676864624, "sampling/importance_sampling_ratio/mean": 0.9995939135551453, "sampling/importance_sampling_ratio/min": 0.6361550688743591, "sampling/sampling_logp_difference/max": 0.4523129463195801, "sampling/sampling_logp_difference/mean": 0.013349948450922966, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 228.203125, "completions/mean_terminated_length": 228.203125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.311359167098999, "epoch": 0.479646017699115, "frac_reward_zero_std": 1.0, "grad_norm": 0.023194905350027488, "kl": 0.01969325914978981, "learning_rate": 9.423409019370014e-07, "loss": 0.0002, "num_tokens": 6264255.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.578434944152832, "sampling/importance_sampling_ratio/mean": 0.999988317489624, "sampling/importance_sampling_ratio/min": 0.6299833655357361, "sampling/sampling_logp_difference/max": 0.46206188201904297, "sampling/sampling_logp_difference/mean": 0.014301535673439503, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 153.890625, "completions/mean_terminated_length": 153.890625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.39541131258010864, "epoch": 0.4814159292035398, "frac_reward_zero_std": 1.0, "grad_norm": 0.036914813075702546, "kl": 0.031673021614551544, "learning_rate": 9.416187356085512e-07, "loss": 0.0004, "num_tokens": 6288504.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.6045210361480713, "sampling/importance_sampling_ratio/mean": 0.999925971031189, "sampling/importance_sampling_ratio/min": 0.634973406791687, "sampling/sampling_logp_difference/max": 0.472825288772583, "sampling/sampling_logp_difference/mean": 0.01636761799454689, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 154.640625, "completions/mean_terminated_length": 154.640625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.27730217576026917, "epoch": 0.4831858407079646, "frac_reward_zero_std": 1.0, "grad_norm": 0.019819123880060466, "kl": 0.015538809821009636, "learning_rate": 9.408923551786742e-07, "loss": 0.0002, "num_tokens": 6310129.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.28533136844635, "sampling/importance_sampling_ratio/mean": 1.0004087686538696, "sampling/importance_sampling_ratio/min": 0.7605655193328857, "sampling/sampling_logp_difference/max": 0.2736930251121521, "sampling/sampling_logp_difference/mean": 0.013058988377451897, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 148.1875, "completions/mean_terminated_length": 148.1875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.33199161291122437, "epoch": 0.4849557522123894, "frac_reward_zero_std": 1.0, "grad_norm": 0.025359344994080425, "kl": 0.019676754251122475, "learning_rate": 9.40161767578781e-07, "loss": 0.0002, "num_tokens": 6330909.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6070911884307861, "sampling/importance_sampling_ratio/mean": 0.9996774196624756, "sampling/importance_sampling_ratio/min": 0.6098532676696777, "sampling/sampling_logp_difference/max": 0.4945368766784668, "sampling/sampling_logp_difference/mean": 0.013872501440346241, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 119.03125, "completions/mean_terminated_length": 119.03125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.25699901580810547, "epoch": 0.48672566371681414, "frac_reward_zero_std": 1.0, "grad_norm": 0.030136405749096117, "kl": 0.013796394690871239, "learning_rate": 9.394269797804288e-07, "loss": 0.0001, "num_tokens": 6348767.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6629855632781982, "sampling/importance_sampling_ratio/mean": 1.0000479221343994, "sampling/importance_sampling_ratio/min": 0.49850544333457947, "sampling/sampling_logp_difference/max": 0.6961407661437988, "sampling/sampling_logp_difference/mean": 0.013275440782308578, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 195.703125, "completions/mean_terminated_length": 195.703125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.4806135892868042, "epoch": 0.48849557522123893, "frac_reward_zero_std": 1.0, "grad_norm": 0.027145594527740468, "kl": 0.03496091812849045, "learning_rate": 9.386879987952549e-07, "loss": 0.0004, "num_tokens": 6380412.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4901208877563477, "sampling/importance_sampling_ratio/mean": 1.0001537799835205, "sampling/importance_sampling_ratio/min": 0.6103525161743164, "sampling/sampling_logp_difference/max": 0.49371862411499023, "sampling/sampling_logp_difference/mean": 0.01681288704276085, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 120.90625, "completions/mean_terminated_length": 120.90625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.27076271176338196, "epoch": 0.4902654867256637, "frac_reward_zero_std": 1.0, "grad_norm": 0.027781639445950742, "kl": 0.016093842685222626, "learning_rate": 9.37944831674909e-07, "loss": 0.0002, "num_tokens": 6396918.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.40231192111969, "sampling/importance_sampling_ratio/mean": 1.0001559257507324, "sampling/importance_sampling_ratio/min": 0.6182749271392822, "sampling/sampling_logp_difference/max": 0.4808220863342285, "sampling/sampling_logp_difference/mean": 0.013468633405864239, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 185.421875, "completions/mean_terminated_length": 185.421875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.3222809135913849, "epoch": 0.4920353982300885, "frac_reward_zero_std": 1.0, "grad_norm": 0.02109532999968717, "kl": 0.021185191348195076, "learning_rate": 9.371974855109874e-07, "loss": 0.0002, "num_tokens": 6420369.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.443979024887085, "sampling/importance_sampling_ratio/mean": 1.0000238418579102, "sampling/importance_sampling_ratio/min": 0.6533827781677246, "sampling/sampling_logp_difference/max": 0.42559218406677246, "sampling/sampling_logp_difference/mean": 0.012732608243823051, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 167.0625, "completions/mean_terminated_length": 167.0625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.3757314085960388, "epoch": 0.49380530973451325, "frac_reward_zero_std": 1.0, "grad_norm": 0.02220712023150318, "kl": 0.02302708476781845, "learning_rate": 9.36445967434964e-07, "loss": 0.0003, "num_tokens": 6440965.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3643826246261597, "sampling/importance_sampling_ratio/mean": 0.9994344115257263, "sampling/importance_sampling_ratio/min": 0.5260617733001709, "sampling/sampling_logp_difference/max": 0.6423366069793701, "sampling/sampling_logp_difference/mean": 0.015564961358904839, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 101.328125, "completions/mean_terminated_length": 101.328125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.2622968852519989, "epoch": 0.49557522123893805, "frac_reward_zero_std": 1.0, "grad_norm": 0.024359950973413567, "kl": 0.01559748686850071, "learning_rate": 9.356902846181228e-07, "loss": 0.0002, "num_tokens": 6456506.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.435205340385437, "sampling/importance_sampling_ratio/mean": 1.0001575946807861, "sampling/importance_sampling_ratio/min": 0.6743903756141663, "sampling/sampling_logp_difference/max": 0.39394617080688477, "sampling/sampling_logp_difference/mean": 0.013536439277231693, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 154.34375, "completions/mean_terminated_length": 154.34375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.36557501554489136, "epoch": 0.49734513274336284, "frac_reward_zero_std": 1.0, "grad_norm": 0.01883840913930749, "kl": 0.021856961771845818, "learning_rate": 9.349304442714895e-07, "loss": 0.0002, "num_tokens": 6476640.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7520533800125122, "sampling/importance_sampling_ratio/mean": 1.000203251838684, "sampling/importance_sampling_ratio/min": 0.7620862126350403, "sampling/sampling_logp_difference/max": 0.5607883930206299, "sampling/sampling_logp_difference/mean": 0.015660658478736877, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 220.4375, "completions/mean_terminated_length": 220.4375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.519733190536499, "epoch": 0.49911504424778763, "frac_reward_zero_std": 1.0, "grad_norm": 0.02446708363363976, "kl": 0.03840360417962074, "learning_rate": 9.341664536457625e-07, "loss": 0.0004, "num_tokens": 6510732.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2997714281082153, "sampling/importance_sampling_ratio/mean": 1.0001707077026367, "sampling/importance_sampling_ratio/min": 0.6407306790351868, "sampling/sampling_logp_difference/max": 0.4451460838317871, "sampling/sampling_logp_difference/mean": 0.01690497063100338, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 234.578125, "completions/mean_terminated_length": 234.578125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.5728236436843872, "epoch": 0.5008849557522124, "frac_reward_zero_std": 1.0, "grad_norm": 0.04478807109354219, "kl": 0.04850665107369423, "learning_rate": 9.33398320031244e-07, "loss": 0.0005, "num_tokens": 6539425.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3944982290267944, "sampling/importance_sampling_ratio/mean": 0.9997686743736267, "sampling/importance_sampling_ratio/min": 0.7058351635932922, "sampling/sampling_logp_difference/max": 0.34837353229522705, "sampling/sampling_logp_difference/mean": 0.01805371418595314, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 180.921875, "completions/mean_terminated_length": 180.921875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.4373040795326233, "epoch": 0.5026548672566372, "frac_reward_zero_std": 1.0, "grad_norm": 0.02192972516307828, "kl": 0.02788272686302662, "learning_rate": 9.3262605075777e-07, "loss": 0.0004, "num_tokens": 6563452.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4271368980407715, "sampling/importance_sampling_ratio/mean": 0.9991888403892517, "sampling/importance_sampling_ratio/min": 0.7124289274215698, "sampling/sampling_logp_difference/max": 0.3556702136993408, "sampling/sampling_logp_difference/mean": 0.016525931656360626, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 197.21875, "completions/mean_terminated_length": 197.21875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.5013217329978943, "epoch": 0.504424778761062, "frac_reward_zero_std": 0.75, "grad_norm": 0.9948517252301762, "kl": 0.042211972177028656, "learning_rate": 9.318496531946409e-07, "loss": -0.0194, "num_tokens": 6591754.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.4786263704299927, "sampling/importance_sampling_ratio/mean": 0.9997700452804565, "sampling/importance_sampling_ratio/min": 0.6789397597312927, "sampling/sampling_logp_difference/max": 0.3911135196685791, "sampling/sampling_logp_difference/mean": 0.01624986156821251, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 132.8125, "completions/mean_terminated_length": 132.8125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.4127649962902069, "epoch": 0.5061946902654867, "frac_reward_zero_std": 1.0, "grad_norm": 0.021955724850407192, "kl": 0.02443903684616089, "learning_rate": 9.310691347505505e-07, "loss": 0.0003, "num_tokens": 6611518.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3266682624816895, "sampling/importance_sampling_ratio/mean": 1.0003299713134766, "sampling/importance_sampling_ratio/min": 0.7654657363891602, "sampling/sampling_logp_difference/max": 0.2826707363128662, "sampling/sampling_logp_difference/mean": 0.015683164820075035, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 187.703125, "completions/mean_terminated_length": 187.703125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.3855421543121338, "epoch": 0.5079646017699115, "frac_reward_zero_std": 1.0, "grad_norm": 0.025761441639133714, "kl": 0.03089805133640766, "learning_rate": 9.30284502873516e-07, "loss": 0.0003, "num_tokens": 6633019.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4548776149749756, "sampling/importance_sampling_ratio/mean": 0.9999523758888245, "sampling/importance_sampling_ratio/min": 0.6919315457344055, "sampling/sampling_logp_difference/max": 0.3749217987060547, "sampling/sampling_logp_difference/mean": 0.013852078467607498, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 167.265625, "completions/mean_terminated_length": 167.265625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.28932544589042664, "epoch": 0.5097345132743363, "frac_reward_zero_std": 1.0, "grad_norm": 0.020763578839902066, "kl": 0.017403192818164825, "learning_rate": 9.294957650508064e-07, "loss": 0.0002, "num_tokens": 6653372.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3269850015640259, "sampling/importance_sampling_ratio/mean": 1.000016689300537, "sampling/importance_sampling_ratio/min": 0.7129645347595215, "sampling/sampling_logp_difference/max": 0.3383236527442932, "sampling/sampling_logp_difference/mean": 0.012063910253345966, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 184.140625, "completions/mean_terminated_length": 184.140625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.3914494514465332, "epoch": 0.511504424778761, "frac_reward_zero_std": 1.0, "grad_norm": 0.02080754388776366, "kl": 0.020681608468294144, "learning_rate": 9.287029288088716e-07, "loss": 0.0003, "num_tokens": 6677653.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4709348678588867, "sampling/importance_sampling_ratio/mean": 0.9999104738235474, "sampling/importance_sampling_ratio/min": 0.6171886920928955, "sampling/sampling_logp_difference/max": 0.48258042335510254, "sampling/sampling_logp_difference/mean": 0.014956386759877205, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 175.484375, "completions/mean_terminated_length": 175.484375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.5061272382736206, "epoch": 0.5132743362831859, "frac_reward_zero_std": 1.0, "grad_norm": 0.03207714367449626, "kl": 0.03939015045762062, "learning_rate": 9.279060017132697e-07, "loss": 0.0005, "num_tokens": 6705876.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.293717384338379, "sampling/importance_sampling_ratio/mean": 0.9991098642349243, "sampling/importance_sampling_ratio/min": 0.7004491090774536, "sampling/sampling_logp_difference/max": 0.3560335636138916, "sampling/sampling_logp_difference/mean": 0.017809201031923294, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 210.8125, "completions/mean_terminated_length": 210.8125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.41961824893951416, "epoch": 0.5150442477876106, "frac_reward_zero_std": 1.0, "grad_norm": 0.017195812824318982, "kl": 0.02614808827638626, "learning_rate": 9.271049913685959e-07, "loss": 0.0003, "num_tokens": 6730776.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3036280870437622, "sampling/importance_sampling_ratio/mean": 0.9997357130050659, "sampling/importance_sampling_ratio/min": 0.6956069469451904, "sampling/sampling_logp_difference/max": 0.3629704713821411, "sampling/sampling_logp_difference/mean": 0.015091790817677975, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 243.984375, "completions/mean_terminated_length": 243.984375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.513262152671814, "epoch": 0.5168141592920354, "frac_reward_zero_std": 1.0, "grad_norm": 0.02756305466878137, "kl": 0.04059458523988724, "learning_rate": 9.262999054184091e-07, "loss": 0.0004, "num_tokens": 6757991.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5239235162734985, "sampling/importance_sampling_ratio/mean": 1.0002270936965942, "sampling/importance_sampling_ratio/min": 0.7383068799972534, "sampling/sampling_logp_difference/max": 0.42128825187683105, "sampling/sampling_logp_difference/mean": 0.015886511653661728, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 170.0, "completions/mean_terminated_length": 170.0, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.44291773438453674, "epoch": 0.5185840707964602, "frac_reward_zero_std": 1.0, "grad_norm": 0.023537025702967722, "kl": 0.025342687964439392, "learning_rate": 9.254907515451591e-07, "loss": 0.0003, "num_tokens": 6779511.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4934041500091553, "sampling/importance_sampling_ratio/mean": 1.000349521636963, "sampling/importance_sampling_ratio/min": 0.6546944975852966, "sampling/sampling_logp_difference/max": 0.4235866069793701, "sampling/sampling_logp_difference/mean": 0.015553380362689495, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 270.359375, "completions/mean_terminated_length": 270.359375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.6321618556976318, "epoch": 0.5203539823008849, "frac_reward_zero_std": 0.75, "grad_norm": 0.6718029852191244, "kl": 0.05273823440074921, "learning_rate": 9.246775374701138e-07, "loss": 0.0073, "num_tokens": 6830126.0, "reward": -0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": -0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.437933325767517, "sampling/importance_sampling_ratio/mean": 0.999913215637207, "sampling/importance_sampling_ratio/min": 0.6157731413841248, "sampling/sampling_logp_difference/max": 0.4848766326904297, "sampling/sampling_logp_difference/mean": 0.018297888338565826, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 113.25, "completions/mean_terminated_length": 113.25, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.30457520484924316, "epoch": 0.5221238938053098, "frac_reward_zero_std": 1.0, "grad_norm": 0.014786051129669832, "kl": 0.011042892932891846, "learning_rate": 9.23860270953285e-07, "loss": 0.0001, "num_tokens": 6846686.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.297924518585205, "sampling/importance_sampling_ratio/mean": 0.9996802806854248, "sampling/importance_sampling_ratio/min": 0.5011434555053711, "sampling/sampling_logp_difference/max": 0.6908628940582275, "sampling/sampling_logp_difference/mean": 0.013864563778042793, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 182.171875, "completions/mean_terminated_length": 182.171875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.34369033575057983, "epoch": 0.5238938053097345, "frac_reward_zero_std": 1.0, "grad_norm": 0.029699791224600255, "kl": 0.023969898000359535, "learning_rate": 9.230389597933543e-07, "loss": 0.0003, "num_tokens": 6868745.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3634611368179321, "sampling/importance_sampling_ratio/mean": 0.999811589717865, "sampling/importance_sampling_ratio/min": 0.6313598155975342, "sampling/sampling_logp_difference/max": 0.45987939834594727, "sampling/sampling_logp_difference/mean": 0.013427523896098137, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 164.265625, "completions/mean_terminated_length": 164.265625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.43866345286369324, "epoch": 0.5256637168141592, "frac_reward_zero_std": 1.0, "grad_norm": 0.035562733657815176, "kl": 0.03041796386241913, "learning_rate": 9.222136118275995e-07, "loss": 0.0004, "num_tokens": 6890698.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.515989065170288, "sampling/importance_sampling_ratio/mean": 1.0003479719161987, "sampling/importance_sampling_ratio/min": 0.662236213684082, "sampling/sampling_logp_difference/max": 0.41606807708740234, "sampling/sampling_logp_difference/mean": 0.015883635729551315, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 112.890625, "completions/mean_terminated_length": 112.890625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.27075040340423584, "epoch": 0.5274336283185841, "frac_reward_zero_std": 1.0, "grad_norm": 0.016403369626088807, "kl": 0.012253494933247566, "learning_rate": 9.213842349318184e-07, "loss": 0.0001, "num_tokens": 6906963.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3670459985733032, "sampling/importance_sampling_ratio/mean": 0.9997448921203613, "sampling/importance_sampling_ratio/min": 0.5639070868492126, "sampling/sampling_logp_difference/max": 0.5728658437728882, "sampling/sampling_logp_difference/mean": 0.013015816919505596, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 234.375, "completions/mean_terminated_length": 234.375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.5180535316467285, "epoch": 0.5292035398230088, "frac_reward_zero_std": 0.75, "grad_norm": 0.7780875731753081, "kl": 0.039922792464494705, "learning_rate": 9.205508370202551e-07, "loss": 0.0211, "num_tokens": 6934955.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.3283700942993164, "sampling/importance_sampling_ratio/mean": 1.0002131462097168, "sampling/importance_sampling_ratio/min": 0.751277506351471, "sampling/sampling_logp_difference/max": 0.285980224609375, "sampling/sampling_logp_difference/mean": 0.016268664970993996, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 190.5625, "completions/mean_terminated_length": 190.5625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.49133166670799255, "epoch": 0.5309734513274337, "frac_reward_zero_std": 1.0, "grad_norm": 0.023107569257414747, "kl": 0.03035806678235531, "learning_rate": 9.197134260455233e-07, "loss": 0.0003, "num_tokens": 6961407.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3000757694244385, "sampling/importance_sampling_ratio/mean": 0.9997725486755371, "sampling/importance_sampling_ratio/min": 0.7143741250038147, "sampling/sampling_logp_difference/max": 0.3363485336303711, "sampling/sampling_logp_difference/mean": 0.015370436944067478, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 162.84375, "completions/mean_terminated_length": 162.84375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.4394567608833313, "epoch": 0.5327433628318584, "frac_reward_zero_std": 0.75, "grad_norm": 1.3449971981015074, "kl": 0.025277450680732727, "learning_rate": 9.188720099985315e-07, "loss": -0.0636, "num_tokens": 6984389.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.3278049230575562, "sampling/importance_sampling_ratio/mean": 1.0007739067077637, "sampling/importance_sampling_ratio/min": 0.719417929649353, "sampling/sampling_logp_difference/max": 0.329312801361084, "sampling/sampling_logp_difference/mean": 0.015836309641599655, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 190.59375, "completions/mean_terminated_length": 190.59375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.4766071140766144, "epoch": 0.5345132743362832, "frac_reward_zero_std": 0.75, "grad_norm": 0.9641606035355126, "kl": 0.029689954593777657, "learning_rate": 9.180265969084056e-07, "loss": -0.0054, "num_tokens": 7008811.0, "reward": 0.03125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.03125, "rewards/decision_reward_func/std": 1.0074130296707153, "sampling/importance_sampling_ratio/max": 1.44169020652771, "sampling/importance_sampling_ratio/mean": 1.0000590085983276, "sampling/importance_sampling_ratio/min": 0.7592349052429199, "sampling/sampling_logp_difference/max": 0.3658161163330078, "sampling/sampling_logp_difference/mean": 0.01634790375828743, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 207.09375, "completions/mean_terminated_length": 207.09375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.4701969027519226, "epoch": 0.536283185840708, "frac_reward_zero_std": 1.0, "grad_norm": 0.023617708610028605, "kl": 0.029484564438462257, "learning_rate": 9.171771948424136e-07, "loss": 0.0003, "num_tokens": 7034529.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4577841758728027, "sampling/importance_sampling_ratio/mean": 1.0001473426818848, "sampling/importance_sampling_ratio/min": 0.6138359308242798, "sampling/sampling_logp_difference/max": 0.48802757263183594, "sampling/sampling_logp_difference/mean": 0.016000833362340927, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 123.984375, "completions/mean_terminated_length": 123.984375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.4232381582260132, "epoch": 0.5380530973451327, "frac_reward_zero_std": 1.0, "grad_norm": 0.017957166653677577, "kl": 0.017481127753853798, "learning_rate": 9.163238119058871e-07, "loss": 0.0002, "num_tokens": 7052288.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4640995264053345, "sampling/importance_sampling_ratio/mean": 0.9997575879096985, "sampling/importance_sampling_ratio/min": 0.7615348696708679, "sampling/sampling_logp_difference/max": 0.3812403678894043, "sampling/sampling_logp_difference/mean": 0.01718355156481266, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 124.765625, "completions/mean_terminated_length": 124.765625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.34873998165130615, "epoch": 0.5398230088495575, "frac_reward_zero_std": 1.0, "grad_norm": 0.01970275105905271, "kl": 0.01254827156662941, "learning_rate": 9.154664562421453e-07, "loss": 0.0001, "num_tokens": 7070497.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5928269624710083, "sampling/importance_sampling_ratio/mean": 0.9998602271080017, "sampling/importance_sampling_ratio/min": 0.7438759207725525, "sampling/sampling_logp_difference/max": 0.46551036834716797, "sampling/sampling_logp_difference/mean": 0.01575476862490177, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 146.6875, "completions/mean_terminated_length": 146.6875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.45946380496025085, "epoch": 0.5415929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.027331330504074755, "kl": 0.025959448888897896, "learning_rate": 9.146051360324165e-07, "loss": 0.0003, "num_tokens": 7090989.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4168685674667358, "sampling/importance_sampling_ratio/mean": 1.0000064373016357, "sampling/importance_sampling_ratio/min": 0.610821545124054, "sampling/sampling_logp_difference/max": 0.492950439453125, "sampling/sampling_logp_difference/mean": 0.01741579920053482, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 179.34375, "completions/mean_terminated_length": 179.34375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.3625985383987427, "epoch": 0.5433628318584071, "frac_reward_zero_std": 1.0, "grad_norm": 0.02555287441167141, "kl": 0.022509675472974777, "learning_rate": 9.137398594957603e-07, "loss": 0.0003, "num_tokens": 7112979.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3600614070892334, "sampling/importance_sampling_ratio/mean": 1.0002243518829346, "sampling/importance_sampling_ratio/min": 0.645759105682373, "sampling/sampling_logp_difference/max": 0.4373287856578827, "sampling/sampling_logp_difference/mean": 0.014308637008070946, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 201.90625, "completions/mean_terminated_length": 201.90625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.4243144094944, "epoch": 0.5451327433628319, "frac_reward_zero_std": 1.0, "grad_norm": 0.020160252089109758, "kl": 0.019518980756402016, "learning_rate": 9.128706348889894e-07, "loss": 0.0002, "num_tokens": 7136429.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4510719776153564, "sampling/importance_sampling_ratio/mean": 0.9998438358306885, "sampling/importance_sampling_ratio/min": 0.7054146528244019, "sampling/sampling_logp_difference/max": 0.3723025321960449, "sampling/sampling_logp_difference/mean": 0.01521989330649376, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 171.28125, "completions/mean_terminated_length": 171.28125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.4476414620876312, "epoch": 0.5469026548672566, "frac_reward_zero_std": 0.75, "grad_norm": 1.1123146738256926, "kl": 0.02239086851477623, "learning_rate": 9.1199747050659e-07, "loss": -0.0148, "num_tokens": 7159679.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.2853026390075684, "sampling/importance_sampling_ratio/mean": 0.9998779296875, "sampling/importance_sampling_ratio/min": 0.6334834098815918, "sampling/sampling_logp_difference/max": 0.45652151107788086, "sampling/sampling_logp_difference/mean": 0.01625833287835121, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 192.46875, "completions/mean_terminated_length": 192.46875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.3643852472305298, "epoch": 0.5486725663716814, "frac_reward_zero_std": 1.0, "grad_norm": 0.017628714255359525, "kl": 0.01792171411216259, "learning_rate": 9.111203746806439e-07, "loss": 0.0002, "num_tokens": 7182349.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4224846363067627, "sampling/importance_sampling_ratio/mean": 1.000328540802002, "sampling/importance_sampling_ratio/min": 0.6199836730957031, "sampling/sampling_logp_difference/max": 0.47806215286254883, "sampling/sampling_logp_difference/mean": 0.013797037303447723, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 213.234375, "completions/mean_terminated_length": 213.234375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.5292292833328247, "epoch": 0.5504424778761062, "frac_reward_zero_std": 0.5, "grad_norm": 1.3727710535787903, "kl": 0.03132692724466324, "learning_rate": 9.102393557807476e-07, "loss": -0.0237, "num_tokens": 7208476.0, "reward": 0.15625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.4008958339691162, "sampling/importance_sampling_ratio/mean": 1.0000406503677368, "sampling/importance_sampling_ratio/min": 0.6428192853927612, "sampling/sampling_logp_difference/max": 0.4418916702270508, "sampling/sampling_logp_difference/mean": 0.01771797239780426, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 141.9375, "completions/mean_terminated_length": 141.9375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.4951961934566498, "epoch": 0.552212389380531, "frac_reward_zero_std": 0.75, "grad_norm": 1.1948294530666905, "kl": 0.02944597788155079, "learning_rate": 9.093544222139337e-07, "loss": -0.0143, "num_tokens": 7227896.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.632109522819519, "sampling/importance_sampling_ratio/mean": 0.9997971057891846, "sampling/importance_sampling_ratio/min": 0.6881688237190247, "sampling/sampling_logp_difference/max": 0.48987340927124023, "sampling/sampling_logp_difference/mean": 0.016983861103653908, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 113.109375, "completions/mean_terminated_length": 113.109375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.3199790418148041, "epoch": 0.5539823008849557, "frac_reward_zero_std": 1.0, "grad_norm": 0.022995973721823926, "kl": 0.013230049051344395, "learning_rate": 9.084655824245897e-07, "loss": 0.0001, "num_tokens": 7245567.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2884671688079834, "sampling/importance_sampling_ratio/mean": 0.9997365474700928, "sampling/importance_sampling_ratio/min": 0.6622399091720581, "sampling/sampling_logp_difference/max": 0.41212737560272217, "sampling/sampling_logp_difference/mean": 0.01478345226496458, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 179.875, "completions/mean_terminated_length": 179.875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.4696625769138336, "epoch": 0.5557522123893806, "frac_reward_zero_std": 0.5, "grad_norm": 1.1544001954538277, "kl": 0.03679889440536499, "learning_rate": 9.075728448943781e-07, "loss": 0.0253, "num_tokens": 7268551.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.5777137279510498, "sampling/importance_sampling_ratio/mean": 0.999577522277832, "sampling/importance_sampling_ratio/min": 0.6172122955322266, "sampling/sampling_logp_difference/max": 0.4825422763824463, "sampling/sampling_logp_difference/mean": 0.015795571729540825, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 280.515625, "completions/mean_terminated_length": 280.515625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.7630300521850586, "epoch": 0.5575221238938053, "frac_reward_zero_std": 0.25, "grad_norm": 1.219075142426945, "kl": 0.047929853200912476, "learning_rate": 9.066762181421552e-07, "loss": -0.0221, "num_tokens": 7306632.0, "reward": 0.34375, "reward_std": 0.6205305457115173, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.2905977964401245, "sampling/importance_sampling_ratio/mean": 0.9999635219573975, "sampling/importance_sampling_ratio/min": 0.69340980052948, "sampling/sampling_logp_difference/max": 0.3661341667175293, "sampling/sampling_logp_difference/mean": 0.01918640546500683, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 166.75, "completions/mean_terminated_length": 166.75, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.4249437749385834, "epoch": 0.5592920353982301, "frac_reward_zero_std": 0.75, "grad_norm": 1.0349278536279007, "kl": 0.0165907870978117, "learning_rate": 9.057757107238894e-07, "loss": -0.0517, "num_tokens": 7327704.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.4352306127548218, "sampling/importance_sampling_ratio/mean": 0.9991052150726318, "sampling/importance_sampling_ratio/min": 0.6202441453933716, "sampling/sampling_logp_difference/max": 0.4776420593261719, "sampling/sampling_logp_difference/mean": 0.016329392790794373, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 109.09375, "completions/mean_terminated_length": 109.09375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.34463635087013245, "epoch": 0.5610619469026549, "frac_reward_zero_std": 1.0, "grad_norm": 0.03992014966658588, "kl": 0.014742556028068066, "learning_rate": 9.048713312325804e-07, "loss": 0.0001, "num_tokens": 7344350.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4840832948684692, "sampling/importance_sampling_ratio/mean": 0.9999971985816956, "sampling/importance_sampling_ratio/min": 0.61069256067276, "sampling/sampling_logp_difference/max": 0.493161678314209, "sampling/sampling_logp_difference/mean": 0.017180006951093674, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 153.765625, "completions/mean_terminated_length": 153.765625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.45609360933303833, "epoch": 0.5628318584070796, "frac_reward_zero_std": 0.75, "grad_norm": 1.1303939749009286, "kl": 0.04069334641098976, "learning_rate": 9.039630882981768e-07, "loss": 0.0028, "num_tokens": 7365327.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.4444701671600342, "sampling/importance_sampling_ratio/mean": 1.0001757144927979, "sampling/importance_sampling_ratio/min": 0.7404763698577881, "sampling/sampling_logp_difference/max": 0.36774253845214844, "sampling/sampling_logp_difference/mean": 0.016026653349399567, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 204.34375, "completions/mean_terminated_length": 204.34375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.512562096118927, "epoch": 0.5646017699115045, "frac_reward_zero_std": 0.75, "grad_norm": 0.8567449108608022, "kl": 0.024660678580403328, "learning_rate": 9.030509905874932e-07, "loss": 0.0184, "num_tokens": 7391493.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.388451099395752, "sampling/importance_sampling_ratio/mean": 1.0002787113189697, "sampling/importance_sampling_ratio/min": 0.6836274862289429, "sampling/sampling_logp_difference/max": 0.38034212589263916, "sampling/sampling_logp_difference/mean": 0.017032500356435776, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 200.265625, "completions/mean_terminated_length": 200.265625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.4987943470478058, "epoch": 0.5663716814159292, "frac_reward_zero_std": 1.0, "grad_norm": 0.02476306652548267, "kl": 0.027325624600052834, "learning_rate": 9.021350468041287e-07, "loss": 0.0003, "num_tokens": 7416406.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3389766216278076, "sampling/importance_sampling_ratio/mean": 1.0004692077636719, "sampling/importance_sampling_ratio/min": 0.6792095303535461, "sampling/sampling_logp_difference/max": 0.3868255615234375, "sampling/sampling_logp_difference/mean": 0.01691562309861183, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 175.34375, "completions/mean_terminated_length": 175.34375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.5358196496963501, "epoch": 0.5681415929203539, "frac_reward_zero_std": 0.5, "grad_norm": 1.197771469521168, "kl": 0.03339190408587456, "learning_rate": 9.012152656883822e-07, "loss": -0.0202, "num_tokens": 7442124.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.3223949670791626, "sampling/importance_sampling_ratio/mean": 0.999968409538269, "sampling/importance_sampling_ratio/min": 0.6171384453773499, "sampling/sampling_logp_difference/max": 0.4826619625091553, "sampling/sampling_logp_difference/mean": 0.01843232661485672, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 140.234375, "completions/mean_terminated_length": 140.234375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.24724894762039185, "epoch": 0.5699115044247788, "frac_reward_zero_std": 1.0, "grad_norm": 0.037065985528952475, "kl": 0.01480955071747303, "learning_rate": 9.002916560171712e-07, "loss": 0.0001, "num_tokens": 7460347.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.3267300128936768, "sampling/importance_sampling_ratio/mean": 1.0000386238098145, "sampling/importance_sampling_ratio/min": 0.6084439158439636, "sampling/sampling_logp_difference/max": 0.49685049057006836, "sampling/sampling_logp_difference/mean": 0.01217254064977169, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 254.9375, "completions/mean_terminated_length": 254.9375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.4838610589504242, "epoch": 0.5716814159292035, "frac_reward_zero_std": 0.25, "grad_norm": 1.4102688538404478, "kl": 0.02883777767419815, "learning_rate": 8.993642266039456e-07, "loss": 0.1031, "num_tokens": 7489527.0, "reward": 0.8125, "reward_std": 0.4973389506340027, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4121158123016357, "sampling/importance_sampling_ratio/mean": 1.000025749206543, "sampling/importance_sampling_ratio/min": 0.5279315114021301, "sampling/sampling_logp_difference/max": 0.6387887001037598, "sampling/sampling_logp_difference/mean": 0.01626652479171753, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 176.375, "completions/mean_terminated_length": 176.375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.580811083316803, "epoch": 0.5734513274336284, "frac_reward_zero_std": 0.25, "grad_norm": 1.7990374585024924, "kl": 0.0485653281211853, "learning_rate": 8.984329862986055e-07, "loss": -0.0115, "num_tokens": 7514879.0, "reward": 0.15625, "reward_std": 0.5827301740646362, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6363136768341064, "sampling/importance_sampling_ratio/mean": 0.9998959898948669, "sampling/importance_sampling_ratio/min": 0.6743044853210449, "sampling/sampling_logp_difference/max": 0.4924459457397461, "sampling/sampling_logp_difference/mean": 0.01841394044458866, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 161.28125, "completions/mean_terminated_length": 161.28125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.4510684609413147, "epoch": 0.5752212389380531, "frac_reward_zero_std": 1.0, "grad_norm": 0.03822921565943506, "kl": 0.02629006840288639, "learning_rate": 8.97497943987416e-07, "loss": 0.0003, "num_tokens": 7535505.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.311832070350647, "sampling/importance_sampling_ratio/mean": 0.9999359250068665, "sampling/importance_sampling_ratio/min": 0.644705593585968, "sampling/sampling_logp_difference/max": 0.4389615058898926, "sampling/sampling_logp_difference/mean": 0.017147788777947426, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 99.234375, "completions/mean_terminated_length": 99.234375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.231645405292511, "epoch": 0.5769911504424778, "frac_reward_zero_std": 1.0, "grad_norm": 0.04359184759156683, "kl": 0.016105014830827713, "learning_rate": 8.96559108592922e-07, "loss": 0.0002, "num_tokens": 7550720.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.451245903968811, "sampling/importance_sampling_ratio/mean": 0.9995250105857849, "sampling/importance_sampling_ratio/min": 0.6763100028038025, "sampling/sampling_logp_difference/max": 0.39110374450683594, "sampling/sampling_logp_difference/mean": 0.01318065170198679, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 199.8125, "completions/mean_terminated_length": 199.8125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.43389004468917847, "epoch": 0.5787610619469027, "frac_reward_zero_std": 0.75, "grad_norm": 0.8302330875336456, "kl": 0.034609388560056686, "learning_rate": 8.956164890738642e-07, "loss": 0.0015, "num_tokens": 7575060.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.3011858463287354, "sampling/importance_sampling_ratio/mean": 1.000328540802002, "sampling/importance_sampling_ratio/min": 0.7293118834495544, "sampling/sampling_logp_difference/max": 0.31565380096435547, "sampling/sampling_logp_difference/mean": 0.0162799172103405, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 155.09375, "completions/mean_terminated_length": 155.09375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.27465489506721497, "epoch": 0.5805309734513274, "frac_reward_zero_std": 1.0, "grad_norm": 0.035554886971738475, "kl": 0.021240783855319023, "learning_rate": 8.946700944250924e-07, "loss": 0.0002, "num_tokens": 7594906.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2974066734313965, "sampling/importance_sampling_ratio/mean": 0.9997029304504395, "sampling/importance_sampling_ratio/min": 0.6262715458869934, "sampling/sampling_logp_difference/max": 0.4679713249206543, "sampling/sampling_logp_difference/mean": 0.013311211951076984, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 229.59375, "completions/mean_terminated_length": 229.59375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.4771236479282379, "epoch": 0.5823008849557522, "frac_reward_zero_std": 0.75, "grad_norm": 0.8169328181258667, "kl": 0.023737013339996338, "learning_rate": 8.937199336774804e-07, "loss": -0.0056, "num_tokens": 7621680.0, "reward": 0.3125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.3591655492782593, "sampling/importance_sampling_ratio/mean": 1.00018310546875, "sampling/importance_sampling_ratio/min": 0.6493704915046692, "sampling/sampling_logp_difference/max": 0.4317518472671509, "sampling/sampling_logp_difference/mean": 0.015672830864787102, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1040.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 243.203125, "completions/mean_terminated_length": 243.203125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.5708993077278137, "epoch": 0.584070796460177, "frac_reward_zero_std": 0.25, "grad_norm": 1.5306653694538026, "kl": 0.036455169320106506, "learning_rate": 8.927660158978392e-07, "loss": -0.1243, "num_tokens": 7650397.0, "reward": 0.15625, "reward_std": 0.6505630612373352, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.6598302125930786, "sampling/importance_sampling_ratio/mean": 1.0003546476364136, "sampling/importance_sampling_ratio/min": 0.7050821185112, "sampling/sampling_logp_difference/max": 0.5067152976989746, "sampling/sampling_logp_difference/mean": 0.017485732212662697, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 174.65625, "completions/mean_terminated_length": 174.65625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.39911821484565735, "epoch": 0.5858407079646017, "frac_reward_zero_std": 1.0, "grad_norm": 0.027272171052515736, "kl": 0.017386943101882935, "learning_rate": 8.918083501888316e-07, "loss": 0.0002, "num_tokens": 7673063.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4191452264785767, "sampling/importance_sampling_ratio/mean": 1.0000393390655518, "sampling/importance_sampling_ratio/min": 0.6963819861412048, "sampling/sampling_logp_difference/max": 0.36185693740844727, "sampling/sampling_logp_difference/mean": 0.015225267969071865, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 204.546875, "completions/mean_terminated_length": 204.546875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.5385278463363647, "epoch": 0.5876106194690266, "frac_reward_zero_std": 0.5, "grad_norm": 1.1898785104977088, "kl": 0.04100334271788597, "learning_rate": 8.908469456888843e-07, "loss": 0.0137, "num_tokens": 7700042.0, "reward": 0.59375, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.2864842414855957, "sampling/importance_sampling_ratio/mean": 0.999769926071167, "sampling/importance_sampling_ratio/min": 0.7454503178596497, "sampling/sampling_logp_difference/max": 0.2937668561935425, "sampling/sampling_logp_difference/mean": 0.016970038414001465, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 203.734375, "completions/mean_terminated_length": 203.734375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.40644773840904236, "epoch": 0.5893805309734513, "frac_reward_zero_std": 0.75, "grad_norm": 0.8706282295748837, "kl": 0.021324289962649345, "learning_rate": 8.898818115721007e-07, "loss": -0.0221, "num_tokens": 7723001.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.377809762954712, "sampling/importance_sampling_ratio/mean": 0.9999908208847046, "sampling/importance_sampling_ratio/min": 0.6941059231758118, "sampling/sampling_logp_difference/max": 0.3651306629180908, "sampling/sampling_logp_difference/mean": 0.014977142214775085, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 152.953125, "completions/mean_terminated_length": 152.953125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.4270297884941101, "epoch": 0.5911504424778761, "frac_reward_zero_std": 1.0, "grad_norm": 0.026822486395567488, "kl": 0.02106567844748497, "learning_rate": 8.889129570481741e-07, "loss": 0.0002, "num_tokens": 7743942.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3990821838378906, "sampling/importance_sampling_ratio/mean": 0.9998767375946045, "sampling/importance_sampling_ratio/min": 0.6077051162719727, "sampling/sampling_logp_difference/max": 0.4980654716491699, "sampling/sampling_logp_difference/mean": 0.015651099383831024, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 198.546875, "completions/mean_terminated_length": 198.546875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.325345516204834, "epoch": 0.5929203539823009, "frac_reward_zero_std": 0.75, "grad_norm": 0.9778925706608806, "kl": 0.013278895057737827, "learning_rate": 8.879403913622996e-07, "loss": -0.023, "num_tokens": 7766473.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4353985786437988, "sampling/importance_sampling_ratio/mean": 1.0006654262542725, "sampling/importance_sampling_ratio/min": 0.7160588502883911, "sampling/sampling_logp_difference/max": 0.36144256591796875, "sampling/sampling_logp_difference/mean": 0.013793321326375008, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 165.609375, "completions/mean_terminated_length": 165.609375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.4705030024051666, "epoch": 0.5946902654867257, "frac_reward_zero_std": 0.75, "grad_norm": 1.1162423522001792, "kl": 0.031908683478832245, "learning_rate": 8.869641237950849e-07, "loss": -0.0083, "num_tokens": 7788256.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.6634169816970825, "sampling/importance_sampling_ratio/mean": 1.000626564025879, "sampling/importance_sampling_ratio/min": 0.6571769714355469, "sampling/sampling_logp_difference/max": 0.5088739395141602, "sampling/sampling_logp_difference/mean": 0.0166355911642313, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 192.59375, "completions/mean_terminated_length": 192.59375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.5487221479415894, "epoch": 0.5964601769911504, "frac_reward_zero_std": 0.5, "grad_norm": 1.3387937200408704, "kl": 0.03449319303035736, "learning_rate": 8.859841636624631e-07, "loss": 0.0164, "num_tokens": 7811094.0, "reward": 0.59375, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.4217135906219482, "sampling/importance_sampling_ratio/mean": 1.0003790855407715, "sampling/importance_sampling_ratio/min": 0.6393097639083862, "sampling/sampling_logp_difference/max": 0.44736623764038086, "sampling/sampling_logp_difference/mean": 0.01947716996073723, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 115.78125, "completions/mean_terminated_length": 115.78125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.296596884727478, "epoch": 0.5982300884955752, "frac_reward_zero_std": 1.0, "grad_norm": 0.033744789320011105, "kl": 0.018130596727132797, "learning_rate": 8.850005203156034e-07, "loss": 0.0002, "num_tokens": 7828952.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6452924013137817, "sampling/importance_sampling_ratio/mean": 0.9998478293418884, "sampling/importance_sampling_ratio/min": 0.6643782258033752, "sampling/sampling_logp_difference/max": 0.49791812896728516, "sampling/sampling_logp_difference/mean": 0.014338867738842964, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 138.28125, "completions/mean_terminated_length": 138.28125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.23987334966659546, "epoch": 0.6, "frac_reward_zero_std": 1.0, "grad_norm": 0.028806517551722206, "kl": 0.017187688499689102, "learning_rate": 8.84013203140821e-07, "loss": 0.0002, "num_tokens": 7847242.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5104879140853882, "sampling/importance_sampling_ratio/mean": 1.0002355575561523, "sampling/importance_sampling_ratio/min": 0.6625238060951233, "sampling/sampling_logp_difference/max": 0.4124326705932617, "sampling/sampling_logp_difference/mean": 0.011877824552357197, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 118.0, "completions/mean_terminated_length": 118.0, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.32074832916259766, "epoch": 0.6017699115044248, "frac_reward_zero_std": 1.0, "grad_norm": 0.028997403607390542, "kl": 0.016999173909425735, "learning_rate": 8.83022221559489e-07, "loss": 0.0002, "num_tokens": 7865306.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.600737452507019, "sampling/importance_sampling_ratio/mean": 0.9993951320648193, "sampling/importance_sampling_ratio/min": 0.6845306158065796, "sampling/sampling_logp_difference/max": 0.47046446800231934, "sampling/sampling_logp_difference/mean": 0.015037331730127335, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 201.984375, "completions/mean_terminated_length": 201.984375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.4528687596321106, "epoch": 0.6035398230088496, "frac_reward_zero_std": 0.75, "grad_norm": 0.8374241068347515, "kl": 0.03223993629217148, "learning_rate": 8.820275850279472e-07, "loss": -0.0105, "num_tokens": 7889417.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.418492317199707, "sampling/importance_sampling_ratio/mean": 0.9998888969421387, "sampling/importance_sampling_ratio/min": 0.7561723589897156, "sampling/sampling_logp_difference/max": 0.3495945930480957, "sampling/sampling_logp_difference/mean": 0.016138648614287376, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 216.40625, "completions/mean_terminated_length": 216.40625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.5420851111412048, "epoch": 0.6053097345132743, "frac_reward_zero_std": 0.5, "grad_norm": 1.2654400146455447, "kl": 0.03524719923734665, "learning_rate": 8.810293030374125e-07, "loss": 0.0027, "num_tokens": 7914755.0, "reward": 0.8125, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6463134288787842, "sampling/importance_sampling_ratio/mean": 1.0001258850097656, "sampling/importance_sampling_ratio/min": 0.6932322382926941, "sampling/sampling_logp_difference/max": 0.4985384941101074, "sampling/sampling_logp_difference/mean": 0.017083358019590378, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 154.421875, "completions/mean_terminated_length": 154.421875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.3786289691925049, "epoch": 0.6070796460176991, "frac_reward_zero_std": 1.0, "grad_norm": 0.03575700654546547, "kl": 0.025098759680986404, "learning_rate": 8.800273851138882e-07, "loss": 0.0003, "num_tokens": 7934798.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3124302625656128, "sampling/importance_sampling_ratio/mean": 1.0006170272827148, "sampling/importance_sampling_ratio/min": 0.6380122303962708, "sampling/sampling_logp_difference/max": 0.4493978023529053, "sampling/sampling_logp_difference/mean": 0.01563396491110325, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 195.8125, "completions/mean_terminated_length": 195.8125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.43428149819374084, "epoch": 0.6088495575221239, "frac_reward_zero_std": 0.75, "grad_norm": 0.9633281384458705, "kl": 0.022706853225827217, "learning_rate": 8.790218408180734e-07, "loss": -0.0124, "num_tokens": 7959218.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.5093923807144165, "sampling/importance_sampling_ratio/mean": 0.9997909665107727, "sampling/importance_sampling_ratio/min": 0.6833602786064148, "sampling/sampling_logp_difference/max": 0.41170716285705566, "sampling/sampling_logp_difference/mean": 0.015266085043549538, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 230.4375, "completions/mean_terminated_length": 230.4375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.5112296342849731, "epoch": 0.6106194690265486, "frac_reward_zero_std": 1.0, "grad_norm": 0.028634278924611033, "kl": 0.032195795327425, "learning_rate": 8.780126797452712e-07, "loss": 0.0003, "num_tokens": 7986686.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3169021606445312, "sampling/importance_sampling_ratio/mean": 0.9998841881752014, "sampling/importance_sampling_ratio/min": 0.6206711530685425, "sampling/sampling_logp_difference/max": 0.47695398330688477, "sampling/sampling_logp_difference/mean": 0.016089120879769325, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 277.65625, "completions/mean_terminated_length": 277.65625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.48142367601394653, "epoch": 0.6123893805309735, "frac_reward_zero_std": 0.5, "grad_norm": 1.0819649635422508, "kl": 0.030770402401685715, "learning_rate": 8.769999115252975e-07, "loss": 0.009, "num_tokens": 8015784.0, "reward": 0.53125, "reward_std": 0.48935678601264954, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.424164056777954, "sampling/importance_sampling_ratio/mean": 0.9999499320983887, "sampling/importance_sampling_ratio/min": 0.6622363924980164, "sampling/sampling_logp_difference/max": 0.41213274002075195, "sampling/sampling_logp_difference/mean": 0.014971626922488213, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 173.34375, "completions/mean_terminated_length": 173.34375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.43033796548843384, "epoch": 0.6141592920353982, "frac_reward_zero_std": 0.75, "grad_norm": 1.1118008381723494, "kl": 0.036823272705078125, "learning_rate": 8.759835458223887e-07, "loss": 0.0226, "num_tokens": 8037438.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.3416407108306885, "sampling/importance_sampling_ratio/mean": 0.9997933506965637, "sampling/importance_sampling_ratio/min": 0.6783764958381653, "sampling/sampling_logp_difference/max": 0.3880528211593628, "sampling/sampling_logp_difference/mean": 0.01579831913113594, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 225.734375, "completions/mean_terminated_length": 225.734375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.528110682964325, "epoch": 0.6159292035398231, "frac_reward_zero_std": 1.0, "grad_norm": 0.03858983418164625, "kl": 0.038228943943977356, "learning_rate": 8.749635923351106e-07, "loss": 0.0004, "num_tokens": 8063229.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4603391885757446, "sampling/importance_sampling_ratio/mean": 1.0001890659332275, "sampling/importance_sampling_ratio/min": 0.6262895464897156, "sampling/sampling_logp_difference/max": 0.467942476272583, "sampling/sampling_logp_difference/mean": 0.01644430309534073, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 226.9375, "completions/mean_terminated_length": 226.9375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.4355280101299286, "epoch": 0.6176991150442478, "frac_reward_zero_std": 0.75, "grad_norm": 0.8883931500274885, "kl": 0.03403376042842865, "learning_rate": 8.739400607962644e-07, "loss": 0.0072, "num_tokens": 8089033.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.3030188083648682, "sampling/importance_sampling_ratio/mean": 1.000458002090454, "sampling/importance_sampling_ratio/min": 0.6413252353668213, "sampling/sampling_logp_difference/max": 0.44421863555908203, "sampling/sampling_logp_difference/mean": 0.013998275622725487, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 177.265625, "completions/mean_terminated_length": 177.265625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.2729702591896057, "epoch": 0.6194690265486725, "frac_reward_zero_std": 1.0, "grad_norm": 0.027398615108238537, "kl": 0.02054606005549431, "learning_rate": 8.729129609727946e-07, "loss": 0.0002, "num_tokens": 8108938.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4834272861480713, "sampling/importance_sampling_ratio/mean": 1.0010185241699219, "sampling/importance_sampling_ratio/min": 0.5620219707489014, "sampling/sampling_logp_difference/max": 0.5762143135070801, "sampling/sampling_logp_difference/mean": 0.01200198009610176, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 213.515625, "completions/mean_terminated_length": 213.515625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.5209338665008545, "epoch": 0.6212389380530974, "frac_reward_zero_std": 0.5, "grad_norm": 1.2635862005627876, "kl": 0.04583410546183586, "learning_rate": 8.718823026656958e-07, "loss": -0.0236, "num_tokens": 8134235.0, "reward": 0.65625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.6144993305206299, "sampling/importance_sampling_ratio/mean": 0.9988173246383667, "sampling/importance_sampling_ratio/min": 0.63479083776474, "sampling/sampling_logp_difference/max": 0.47902488708496094, "sampling/sampling_logp_difference/mean": 0.01699173077940941, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 215.1875, "completions/mean_terminated_length": 215.1875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.46796682476997375, "epoch": 0.6230088495575221, "frac_reward_zero_std": 0.75, "grad_norm": 0.9188342619639853, "kl": 0.0415828675031662, "learning_rate": 8.708480957099193e-07, "loss": -0.0114, "num_tokens": 8157383.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.3758742809295654, "sampling/importance_sampling_ratio/mean": 1.0000108480453491, "sampling/importance_sampling_ratio/min": 0.6447747945785522, "sampling/sampling_logp_difference/max": 0.4388542175292969, "sampling/sampling_logp_difference/mean": 0.015339828096330166, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 221.234375, "completions/mean_terminated_length": 221.234375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.4628385305404663, "epoch": 0.6247787610619469, "frac_reward_zero_std": 0.5, "grad_norm": 1.1133548655824281, "kl": 0.040711820125579834, "learning_rate": 8.698103499742783e-07, "loss": -0.0059, "num_tokens": 8181526.0, "reward": 0.4375, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4460031986236572, "sampling/importance_sampling_ratio/mean": 1.0004066228866577, "sampling/importance_sampling_ratio/min": 0.6387195587158203, "sampling/sampling_logp_difference/max": 0.4482898712158203, "sampling/sampling_logp_difference/mean": 0.01546501275151968, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 201.96875, "completions/mean_terminated_length": 201.96875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.39816412329673767, "epoch": 0.6265486725663717, "frac_reward_zero_std": 0.75, "grad_norm": 1.0763734372573512, "kl": 0.029497426003217697, "learning_rate": 8.687690753613554e-07, "loss": -0.0271, "num_tokens": 8206212.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.560969352722168, "sampling/importance_sampling_ratio/mean": 1.0002760887145996, "sampling/importance_sampling_ratio/min": 0.6801750063896179, "sampling/sampling_logp_difference/max": 0.44530701637268066, "sampling/sampling_logp_difference/mean": 0.014909004792571068, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 134.390625, "completions/mean_terminated_length": 134.390625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.262448251247406, "epoch": 0.6283185840707964, "frac_reward_zero_std": 1.0, "grad_norm": 0.05111182214123538, "kl": 0.025020107626914978, "learning_rate": 8.677242818074062e-07, "loss": 0.0003, "num_tokens": 8224557.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3412717580795288, "sampling/importance_sampling_ratio/mean": 0.9999301433563232, "sampling/importance_sampling_ratio/min": 0.6426113247871399, "sampling/sampling_logp_difference/max": 0.4422152042388916, "sampling/sampling_logp_difference/mean": 0.0129863191395998, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 217.0625, "completions/mean_terminated_length": 217.0625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.4070281386375427, "epoch": 0.6300884955752213, "frac_reward_zero_std": 0.75, "grad_norm": 1.1382826206720111, "kl": 0.02475142851471901, "learning_rate": 8.666759792822661e-07, "loss": -0.0316, "num_tokens": 8261377.0, "reward": -0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.553475260734558, "sampling/importance_sampling_ratio/mean": 1.000364065170288, "sampling/importance_sampling_ratio/min": 0.6571546792984009, "sampling/sampling_logp_difference/max": 0.4404945373535156, "sampling/sampling_logp_difference/mean": 0.01452541071921587, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 331.96875, "completions/mean_terminated_length": 331.96875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.39330345392227173, "epoch": 0.631858407079646, "frac_reward_zero_std": 1.0, "grad_norm": 0.02467953743492263, "kl": 0.027912279590964317, "learning_rate": 8.656241777892542e-07, "loss": 0.0003, "num_tokens": 8293039.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.547163963317871, "sampling/importance_sampling_ratio/mean": 0.9996451139450073, "sampling/importance_sampling_ratio/min": 0.636603593826294, "sampling/sampling_logp_difference/max": 0.45160818099975586, "sampling/sampling_logp_difference/mean": 0.011738335713744164, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 155.1875, "completions/mean_terminated_length": 155.1875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.27595287561416626, "epoch": 0.6336283185840708, "frac_reward_zero_std": 1.0, "grad_norm": 0.026142118969928976, "kl": 0.018282443284988403, "learning_rate": 8.645688873650784e-07, "loss": 0.0002, "num_tokens": 8312731.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6021919250488281, "sampling/importance_sampling_ratio/mean": 1.0000102519989014, "sampling/importance_sampling_ratio/min": 0.6061299443244934, "sampling/sampling_logp_difference/max": 0.5006608963012695, "sampling/sampling_logp_difference/mean": 0.012841126881539822, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 172.21875, "completions/mean_terminated_length": 172.21875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.4368191361427307, "epoch": 0.6353982300884956, "frac_reward_zero_std": 0.75, "grad_norm": 1.0767510877429516, "kl": 0.0403921864926815, "learning_rate": 8.63510118079739e-07, "loss": -0.0141, "num_tokens": 8333945.0, "reward": 0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.2823578119277954, "sampling/importance_sampling_ratio/mean": 0.9997091293334961, "sampling/importance_sampling_ratio/min": 0.6186152696609497, "sampling/sampling_logp_difference/max": 0.4802718162536621, "sampling/sampling_logp_difference/mean": 0.015755271539092064, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 181.296875, "completions/mean_terminated_length": 181.296875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.4091697931289673, "epoch": 0.6371681415929203, "frac_reward_zero_std": 0.75, "grad_norm": 1.0322824050142376, "kl": 0.03155152499675751, "learning_rate": 8.624478800364331e-07, "loss": -0.0314, "num_tokens": 8358892.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5558624267578125, "sampling/importance_sampling_ratio/mean": 0.9999942779541016, "sampling/importance_sampling_ratio/min": 0.6341093182563782, "sampling/sampling_logp_difference/max": 0.4555339813232422, "sampling/sampling_logp_difference/mean": 0.015035311691462994, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 264.453125, "completions/mean_terminated_length": 264.453125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.4108230769634247, "epoch": 0.6389380530973451, "frac_reward_zero_std": 0.75, "grad_norm": 0.6428188955893065, "kl": 0.03412599489092827, "learning_rate": 8.613821833714583e-07, "loss": -0.0075, "num_tokens": 8387337.0, "reward": 0.375, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.4704291820526123, "sampling/importance_sampling_ratio/mean": 1.0001084804534912, "sampling/importance_sampling_ratio/min": 0.2541913092136383, "sampling/sampling_logp_difference/max": 1.3696681261062622, "sampling/sampling_logp_difference/mean": 0.013900469988584518, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1369.0, "completions/max_terminated_length": 1369.0, "completions/mean_length": 317.78125, "completions/mean_terminated_length": 317.78125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.5605416297912598, "epoch": 0.6407079646017699, "frac_reward_zero_std": 0.5, "grad_norm": 0.8708143773037155, "kl": 0.04219198226928711, "learning_rate": 8.603130382541155e-07, "loss": 0.0043, "num_tokens": 8421947.0, "reward": 0.21875, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.4345859289169312, "sampling/importance_sampling_ratio/mean": 0.9998886585235596, "sampling/importance_sampling_ratio/min": 0.7422106862068176, "sampling/sampling_logp_difference/max": 0.360876202583313, "sampling/sampling_logp_difference/mean": 0.017081569880247116, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 313.640625, "completions/mean_terminated_length": 313.640625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.43918853998184204, "epoch": 0.6424778761061947, "frac_reward_zero_std": 0.75, "grad_norm": 0.6168597521682825, "kl": 0.0383281372487545, "learning_rate": 8.592404548866122e-07, "loss": 0.0008, "num_tokens": 8453940.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.4403674602508545, "sampling/importance_sampling_ratio/mean": 1.0000804662704468, "sampling/importance_sampling_ratio/min": 0.6487054824829102, "sampling/sampling_logp_difference/max": 0.43277645111083984, "sampling/sampling_logp_difference/mean": 0.013403661549091339, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 181.125, "completions/mean_terminated_length": 181.125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.39058035612106323, "epoch": 0.6442477876106195, "frac_reward_zero_std": 1.0, "grad_norm": 0.036823206988136556, "kl": 0.033125072717666626, "learning_rate": 8.58164443503965e-07, "loss": 0.0004, "num_tokens": 8477036.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4265888929367065, "sampling/importance_sampling_ratio/mean": 0.9999474883079529, "sampling/importance_sampling_ratio/min": 0.6711859703063965, "sampling/sampling_logp_difference/max": 0.3987090587615967, "sampling/sampling_logp_difference/mean": 0.014679424464702606, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 244.421875, "completions/mean_terminated_length": 244.421875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.6227244734764099, "epoch": 0.6460176991150443, "frac_reward_zero_std": 0.5, "grad_norm": 1.1229622924470637, "kl": 0.05889035388827324, "learning_rate": 8.570850143739021e-07, "loss": -0.0174, "num_tokens": 8505047.0, "reward": -0.0625, "reward_std": 0.5081988573074341, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.4740734100341797, "sampling/importance_sampling_ratio/mean": 1.0005154609680176, "sampling/importance_sampling_ratio/min": 0.7484645843505859, "sampling/sampling_logp_difference/max": 0.3880295753479004, "sampling/sampling_logp_difference/mean": 0.01888107880949974, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 169.1875, "completions/mean_terminated_length": 169.1875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.4301868677139282, "epoch": 0.647787610619469, "frac_reward_zero_std": 0.75, "grad_norm": 1.2027250960354323, "kl": 0.042081568390131, "learning_rate": 8.560021777967648e-07, "loss": 0.0043, "num_tokens": 8525699.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.6007353067398071, "sampling/importance_sampling_ratio/mean": 1.000962495803833, "sampling/importance_sampling_ratio/min": 0.6136767864227295, "sampling/sampling_logp_difference/max": 0.48828697204589844, "sampling/sampling_logp_difference/mean": 0.01758481003344059, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 286.9375, "completions/mean_terminated_length": 286.9375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.4721408784389496, "epoch": 0.6495575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 0.8144029201387767, "kl": 0.03806294500827789, "learning_rate": 8.549159441054104e-07, "loss": 0.0139, "num_tokens": 8556655.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.3278887271881104, "sampling/importance_sampling_ratio/mean": 1.0000855922698975, "sampling/importance_sampling_ratio/min": 0.6211404800415039, "sampling/sampling_logp_difference/max": 0.4761979579925537, "sampling/sampling_logp_difference/mean": 0.015013427473604679, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 286.0, "completions/mean_terminated_length": 286.0, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "entropy": 0.42947787046432495, "epoch": 0.6513274336283186, "frac_reward_zero_std": 0.5, "grad_norm": 0.9964883662934144, "kl": 0.03491639345884323, "learning_rate": 8.538263236651117e-07, "loss": 0.047, "num_tokens": 8586303.0, "reward": 0.40625, "reward_std": 0.4515564441680908, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.2862749099731445, "sampling/importance_sampling_ratio/mean": 0.9998199939727783, "sampling/importance_sampling_ratio/min": 0.6631476879119873, "sampling/sampling_logp_difference/max": 0.41075754165649414, "sampling/sampling_logp_difference/mean": 0.013806729577481747, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 273.84375, "completions/mean_terminated_length": 273.84375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.5510585308074951, "epoch": 0.6530973451327433, "frac_reward_zero_std": 0.5, "grad_norm": 1.0826589295928573, "kl": 0.04940249025821686, "learning_rate": 8.527333268734606e-07, "loss": -0.0047, "num_tokens": 8615221.0, "reward": 0.75, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.3706663846969604, "sampling/importance_sampling_ratio/mean": 1.0001425743103027, "sampling/importance_sampling_ratio/min": 0.7628521919250488, "sampling/sampling_logp_difference/max": 0.31529712677001953, "sampling/sampling_logp_difference/mean": 0.01648496463894844, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 367.125, "completions/mean_terminated_length": 367.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "entropy": 0.5697993636131287, "epoch": 0.6548672566371682, "frac_reward_zero_std": 0.5, "grad_norm": 0.7974082551907156, "kl": 0.044508419930934906, "learning_rate": 8.516369641602661e-07, "loss": 0.0134, "num_tokens": 8654493.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.354619026184082, "sampling/importance_sampling_ratio/mean": 1.0000743865966797, "sampling/importance_sampling_ratio/min": 0.6262628436088562, "sampling/sampling_logp_difference/max": 0.4679851531982422, "sampling/sampling_logp_difference/mean": 0.016434911638498306, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 352.453125, "completions/mean_terminated_length": 352.453125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.38086485862731934, "epoch": 0.6566371681415929, "frac_reward_zero_std": 0.75, "grad_norm": 0.6697272735790052, "kl": 0.027844320982694626, "learning_rate": 8.505372459874571e-07, "loss": -0.0038, "num_tokens": 8688570.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5612245798110962, "sampling/importance_sampling_ratio/mean": 0.9996849894523621, "sampling/importance_sampling_ratio/min": 0.7282742857933044, "sampling/sampling_logp_difference/max": 0.4454704523086548, "sampling/sampling_logp_difference/mean": 0.01266162283718586, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 197.625, "completions/mean_terminated_length": 197.625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.4918649196624756, "epoch": 0.6584070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 0.9628604908198412, "kl": 0.041082967072725296, "learning_rate": 8.494341828489812e-07, "loss": 0.0088, "num_tokens": 8711906.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4009339809417725, "sampling/importance_sampling_ratio/mean": 0.9996036291122437, "sampling/importance_sampling_ratio/min": 0.6651098132133484, "sampling/sampling_logp_difference/max": 0.4078030586242676, "sampling/sampling_logp_difference/mean": 0.01707690767943859, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 299.40625, "completions/mean_terminated_length": 299.40625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.663341760635376, "epoch": 0.6601769911504425, "frac_reward_zero_std": 0.25, "grad_norm": 1.1956210092052122, "kl": 0.05895734950900078, "learning_rate": 8.483277852707052e-07, "loss": 0.0174, "num_tokens": 8744108.0, "reward": -0.375, "reward_std": 0.7581988573074341, "rewards/decision_reward_func/mean": -0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.3005774021148682, "sampling/importance_sampling_ratio/mean": 0.9997754096984863, "sampling/importance_sampling_ratio/min": 0.7420384883880615, "sampling/sampling_logp_difference/max": 0.2983541488647461, "sampling/sampling_logp_difference/mean": 0.018573878332972527, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 219.6875, "completions/mean_terminated_length": 219.6875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.549524188041687, "epoch": 0.6619469026548672, "frac_reward_zero_std": 0.5, "grad_norm": 1.2155712285760218, "kl": 0.04660700261592865, "learning_rate": 8.472180638103143e-07, "loss": 0.0298, "num_tokens": 8771112.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.4421229362487793, "sampling/importance_sampling_ratio/mean": 1.0000449419021606, "sampling/importance_sampling_ratio/min": 0.7006356716156006, "sampling/sampling_logp_difference/max": 0.3661162853240967, "sampling/sampling_logp_difference/mean": 0.017872437834739685, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 262.203125, "completions/mean_terminated_length": 262.203125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "entropy": 0.5852094292640686, "epoch": 0.6637168141592921, "frac_reward_zero_std": 0.25, "grad_norm": 1.3117610178807275, "kl": 0.05262136831879616, "learning_rate": 8.461050290572113e-07, "loss": 0.0303, "num_tokens": 8800069.0, "reward": 0.78125, "reward_std": 0.5281128883361816, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.3352786302566528, "sampling/importance_sampling_ratio/mean": 0.9999169111251831, "sampling/importance_sampling_ratio/min": 0.6927358508110046, "sampling/sampling_logp_difference/max": 0.367106556892395, "sampling/sampling_logp_difference/mean": 0.017396733164787292, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 246.453125, "completions/mean_terminated_length": 246.453125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "entropy": 0.38401997089385986, "epoch": 0.6654867256637168, "frac_reward_zero_std": 0.75, "grad_norm": 0.7449451717953398, "kl": 0.03761235252022743, "learning_rate": 8.449886916324166e-07, "loss": 0.0005, "num_tokens": 8826594.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.6086597442626953, "sampling/importance_sampling_ratio/mean": 1.000878930091858, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.4754014015197754, "sampling/sampling_logp_difference/mean": 0.013770446181297302, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 143.421875, "completions/mean_terminated_length": 143.421875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.3362015187740326, "epoch": 0.6672566371681415, "frac_reward_zero_std": 0.75, "grad_norm": 1.2989116729881696, "kl": 0.03373100236058235, "learning_rate": 8.438690621884649e-07, "loss": -0.0125, "num_tokens": 8844765.0, "reward": 0.21875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.4606289863586426, "sampling/importance_sampling_ratio/mean": 0.9999160170555115, "sampling/importance_sampling_ratio/min": 0.6257045269012451, "sampling/sampling_logp_difference/max": 0.46887707710266113, "sampling/sampling_logp_difference/mean": 0.01427266001701355, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 163.3125, "completions/mean_terminated_length": 163.3125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.3471616804599762, "epoch": 0.6690265486725664, "frac_reward_zero_std": 0.75, "grad_norm": 1.1550938859904358, "kl": 0.03041689656674862, "learning_rate": 8.427461514093055e-07, "loss": -0.0106, "num_tokens": 8865377.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.313628911972046, "sampling/importance_sampling_ratio/mean": 0.9998461604118347, "sampling/importance_sampling_ratio/min": 0.6218625903129578, "sampling/sampling_logp_difference/max": 0.4750361442565918, "sampling/sampling_logp_difference/mean": 0.014026266522705555, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 230.375, "completions/mean_terminated_length": 230.375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.45404207706451416, "epoch": 0.6707964601769911, "frac_reward_zero_std": 0.5, "grad_norm": 1.1282184976345067, "kl": 0.028066381812095642, "learning_rate": 8.41619970010199e-07, "loss": -0.0127, "num_tokens": 8890969.0, "reward": 0.28125, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.2957441806793213, "sampling/importance_sampling_ratio/mean": 1.0000749826431274, "sampling/importance_sampling_ratio/min": 0.6950652003288269, "sampling/sampling_logp_difference/max": 0.363749623298645, "sampling/sampling_logp_difference/mean": 0.015464250929653645, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 166.359375, "completions/mean_terminated_length": 166.359375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.4227294921875, "epoch": 0.672566371681416, "frac_reward_zero_std": 0.75, "grad_norm": 0.9049689389200966, "kl": 0.03660879656672478, "learning_rate": 8.404905287376157e-07, "loss": 0.0145, "num_tokens": 8912064.0, "reward": 0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.3383833169937134, "sampling/importance_sampling_ratio/mean": 0.9998734593391418, "sampling/importance_sampling_ratio/min": 0.6398648023605347, "sampling/sampling_logp_difference/max": 0.44649839401245117, "sampling/sampling_logp_difference/mean": 0.016387417912483215, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 158.6875, "completions/mean_terminated_length": 158.6875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.37693119049072266, "epoch": 0.6743362831858407, "frac_reward_zero_std": 1.0, "grad_norm": 0.02550021586580856, "kl": 0.0251535065472126, "learning_rate": 8.393578383691328e-07, "loss": 0.0003, "num_tokens": 8933004.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3655242919921875, "sampling/importance_sampling_ratio/mean": 0.999870777130127, "sampling/importance_sampling_ratio/min": 0.7550402879714966, "sampling/sampling_logp_difference/max": 0.3115384578704834, "sampling/sampling_logp_difference/mean": 0.014674804173409939, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 148.234375, "completions/mean_terminated_length": 148.234375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.45803314447402954, "epoch": 0.6761061946902654, "frac_reward_zero_std": 0.75, "grad_norm": 0.9489645812081653, "kl": 0.03578249737620354, "learning_rate": 8.382219097133323e-07, "loss": 0.0026, "num_tokens": 8953259.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.621260404586792, "sampling/importance_sampling_ratio/mean": 0.9990768432617188, "sampling/importance_sampling_ratio/min": 0.714861273765564, "sampling/sampling_logp_difference/max": 0.4832038879394531, "sampling/sampling_logp_difference/mean": 0.01736932247877121, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 182.34375, "completions/mean_terminated_length": 182.34375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.40728506445884705, "epoch": 0.6778761061946903, "frac_reward_zero_std": 1.0, "grad_norm": 0.02369537274113039, "kl": 0.024733269587159157, "learning_rate": 8.370827536096964e-07, "loss": 0.0003, "num_tokens": 8975265.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5361980199813843, "sampling/importance_sampling_ratio/mean": 0.9993815422058105, "sampling/importance_sampling_ratio/min": 0.6953165531158447, "sampling/sampling_logp_difference/max": 0.42931056022644043, "sampling/sampling_logp_difference/mean": 0.015199595130980015, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 121.671875, "completions/mean_terminated_length": 121.671875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.4286119341850281, "epoch": 0.679646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 1.3224448549975067, "kl": 0.03849584236741066, "learning_rate": 8.359403809285053e-07, "loss": -0.0127, "num_tokens": 8994108.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.4355621337890625, "sampling/importance_sampling_ratio/mean": 0.9998904466629028, "sampling/importance_sampling_ratio/min": 0.6399005651473999, "sampling/sampling_logp_difference/max": 0.44644248485565186, "sampling/sampling_logp_difference/mean": 0.017456576228141785, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 107.328125, "completions/mean_terminated_length": 107.328125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.4205324053764343, "epoch": 0.6814159292035398, "frac_reward_zero_std": 1.0, "grad_norm": 0.035414373865660514, "kl": 0.032268770039081573, "learning_rate": 8.347948025707329e-07, "loss": 0.0003, "num_tokens": 9013777.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3150197267532349, "sampling/importance_sampling_ratio/mean": 0.9994502663612366, "sampling/importance_sampling_ratio/min": 0.6689095497131348, "sampling/sampling_logp_difference/max": 0.4021064043045044, "sampling/sampling_logp_difference/mean": 0.017202619463205338, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 190.765625, "completions/mean_terminated_length": 190.765625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.46052759885787964, "epoch": 0.6831858407079646, "frac_reward_zero_std": 0.75, "grad_norm": 0.9175729001782233, "kl": 0.035584956407547, "learning_rate": 8.336460294679431e-07, "loss": 0.1104, "num_tokens": 9036690.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.8059618473052979, "sampling/importance_sampling_ratio/mean": 1.0006980895996094, "sampling/importance_sampling_ratio/min": 0.7699272036552429, "sampling/sampling_logp_difference/max": 0.5910933017730713, "sampling/sampling_logp_difference/mean": 0.016931939870119095, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 174.765625, "completions/mean_terminated_length": 174.765625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.5552585124969482, "epoch": 0.6849557522123894, "frac_reward_zero_std": 0.25, "grad_norm": 1.788329312938343, "kl": 0.04846896976232529, "learning_rate": 8.324940725821852e-07, "loss": -0.0461, "num_tokens": 9059203.0, "reward": 0.125, "reward_std": 0.5351393222808838, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.4157276153564453, "sampling/importance_sampling_ratio/mean": 1.0004475116729736, "sampling/importance_sampling_ratio/min": 0.7033500671386719, "sampling/sampling_logp_difference/max": 0.351900577545166, "sampling/sampling_logp_difference/mean": 0.018314823508262634, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 138.09375, "completions/mean_terminated_length": 138.09375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.45556119084358215, "epoch": 0.6867256637168142, "frac_reward_zero_std": 1.0, "grad_norm": 0.039107353994337235, "kl": 0.03828766942024231, "learning_rate": 8.313389429058895e-07, "loss": 0.0005, "num_tokens": 9079785.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4468210935592651, "sampling/importance_sampling_ratio/mean": 0.9999643564224243, "sampling/importance_sampling_ratio/min": 0.6927420496940613, "sampling/sampling_logp_difference/max": 0.3693687915802002, "sampling/sampling_logp_difference/mean": 0.017646033316850662, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 146.265625, "completions/mean_terminated_length": 146.265625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.5707560777664185, "epoch": 0.6884955752212389, "frac_reward_zero_std": 0.5, "grad_norm": 1.4928868726004139, "kl": 0.05419418215751648, "learning_rate": 8.30180651461762e-07, "loss": -0.0125, "num_tokens": 9103706.0, "reward": -0.0625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": -0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.440291166305542, "sampling/importance_sampling_ratio/mean": 1.0000207424163818, "sampling/importance_sampling_ratio/min": 0.7370258569717407, "sampling/sampling_logp_difference/max": 0.36484527587890625, "sampling/sampling_logp_difference/mean": 0.018560871481895447, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 154.09375, "completions/mean_terminated_length": 154.09375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.5176286101341248, "epoch": 0.6902654867256637, "frac_reward_zero_std": 0.75, "grad_norm": 1.1155849207511874, "kl": 0.04577094689011574, "learning_rate": 8.290192093026805e-07, "loss": -0.0055, "num_tokens": 9126128.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4447511434555054, "sampling/importance_sampling_ratio/mean": 1.0000211000442505, "sampling/importance_sampling_ratio/min": 0.6860484480857849, "sampling/sampling_logp_difference/max": 0.37680697441101074, "sampling/sampling_logp_difference/mean": 0.016872867941856384, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 141.9375, "completions/mean_terminated_length": 141.9375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.4314468502998352, "epoch": 0.6920353982300885, "frac_reward_zero_std": 1.0, "grad_norm": 0.03694870082996628, "kl": 0.036875467747449875, "learning_rate": 8.278546275115869e-07, "loss": 0.0004, "num_tokens": 9145948.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5289959907531738, "sampling/importance_sampling_ratio/mean": 0.9996025562286377, "sampling/importance_sampling_ratio/min": 0.5001392960548401, "sampling/sampling_logp_difference/max": 0.692868709564209, "sampling/sampling_logp_difference/mean": 0.01830694079399109, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 196.015625, "completions/mean_terminated_length": 196.015625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.5099686980247498, "epoch": 0.6938053097345133, "frac_reward_zero_std": 0.25, "grad_norm": 1.4299765430530396, "kl": 0.04674728214740753, "learning_rate": 8.266869172013835e-07, "loss": 0.0203, "num_tokens": 9168909.0, "reward": 0.78125, "reward_std": 0.5281128883361816, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4223401546478271, "sampling/importance_sampling_ratio/mean": 0.9999047517776489, "sampling/importance_sampling_ratio/min": 0.7822982668876648, "sampling/sampling_logp_difference/max": 0.35230350494384766, "sampling/sampling_logp_difference/mean": 0.016416629776358604, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 118.421875, "completions/mean_terminated_length": 118.421875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.43823981285095215, "epoch": 0.695575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 1.3434754243547569, "kl": 0.05030140280723572, "learning_rate": 8.255160895148262e-07, "loss": -0.002, "num_tokens": 9186840.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.331215262413025, "sampling/importance_sampling_ratio/mean": 0.9995092153549194, "sampling/importance_sampling_ratio/min": 0.7431110143661499, "sampling/sampling_logp_difference/max": 0.29690980911254883, "sampling/sampling_logp_difference/mean": 0.01721595600247383, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 126.15625, "completions/mean_terminated_length": 126.15625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.36454033851623535, "epoch": 0.6973451327433628, "frac_reward_zero_std": 0.75, "grad_norm": 1.2443335047884587, "kl": 0.04676324874162674, "learning_rate": 8.243421556244178e-07, "loss": -0.0285, "num_tokens": 9205954.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.3202030658721924, "sampling/importance_sampling_ratio/mean": 1.0000194311141968, "sampling/importance_sampling_ratio/min": 0.7287810444831848, "sampling/sampling_logp_difference/max": 0.31638193130493164, "sampling/sampling_logp_difference/mean": 0.014255122281610966, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 132.3125, "completions/mean_terminated_length": 132.3125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.3857053518295288, "epoch": 0.6991150442477876, "frac_reward_zero_std": 1.0, "grad_norm": 0.036256310135995104, "kl": 0.0411846861243248, "learning_rate": 8.231651267323018e-07, "loss": 0.0004, "num_tokens": 9223254.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4263430833816528, "sampling/importance_sampling_ratio/mean": 1.0001716613769531, "sampling/importance_sampling_ratio/min": 0.7204696536064148, "sampling/sampling_logp_difference/max": 0.3551138639450073, "sampling/sampling_logp_difference/mean": 0.014991642907261848, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 99.78125, "completions/mean_terminated_length": 99.78125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.29990869760513306, "epoch": 0.7008849557522124, "frac_reward_zero_std": 1.0, "grad_norm": 0.047019865250950525, "kl": 0.03183341026306152, "learning_rate": 8.219850140701556e-07, "loss": 0.0003, "num_tokens": 9239544.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5087276697158813, "sampling/importance_sampling_ratio/mean": 1.000213623046875, "sampling/importance_sampling_ratio/min": 0.626640260219574, "sampling/sampling_logp_difference/max": 0.46738266944885254, "sampling/sampling_logp_difference/mean": 0.015097476541996002, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 202.0, "completions/max_terminated_length": 202.0, "completions/mean_length": 109.75, "completions/mean_terminated_length": 109.75, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.4533824026584625, "epoch": 0.7026548672566372, "frac_reward_zero_std": 0.75, "grad_norm": 1.4732923471056585, "kl": 0.04384038597345352, "learning_rate": 8.208018288990831e-07, "loss": 0.0008, "num_tokens": 9257400.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.7252179384231567, "sampling/importance_sampling_ratio/mean": 0.9997067451477051, "sampling/importance_sampling_ratio/min": 0.7307789325714111, "sampling/sampling_logp_difference/max": 0.5453534126281738, "sampling/sampling_logp_difference/mean": 0.016809294000267982, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 160.46875, "completions/mean_terminated_length": 160.46875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.46921366453170776, "epoch": 0.7044247787610619, "frac_reward_zero_std": 0.5, "grad_norm": 1.357055443061283, "kl": 0.04166853800415993, "learning_rate": 8.196155825095072e-07, "loss": 0.0085, "num_tokens": 9278518.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.536620020866394, "sampling/importance_sampling_ratio/mean": 1.0007810592651367, "sampling/importance_sampling_ratio/min": 0.697200357913971, "sampling/sampling_logp_difference/max": 0.42958521842956543, "sampling/sampling_logp_difference/mean": 0.016781406477093697, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 160.96875, "completions/mean_terminated_length": 160.96875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.3876239061355591, "epoch": 0.7061946902654868, "frac_reward_zero_std": 0.5, "grad_norm": 1.4759216717531352, "kl": 0.04244627803564072, "learning_rate": 8.184262862210624e-07, "loss": 0.0385, "num_tokens": 9299732.0, "reward": 0.90625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5744906663894653, "sampling/importance_sampling_ratio/mean": 0.9994622468948364, "sampling/importance_sampling_ratio/min": 0.6394615769386292, "sampling/sampling_logp_difference/max": 0.4539318084716797, "sampling/sampling_logp_difference/mean": 0.015496197156608105, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 125.984375, "completions/mean_terminated_length": 125.984375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.3927186131477356, "epoch": 0.7079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 1.5864410065005021, "kl": 0.037036724388599396, "learning_rate": 8.172339513824862e-07, "loss": 0.0137, "num_tokens": 9322035.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.3844969272613525, "sampling/importance_sampling_ratio/mean": 1.0005230903625488, "sampling/importance_sampling_ratio/min": 0.7229098677635193, "sampling/sampling_logp_difference/max": 0.3253368139266968, "sampling/sampling_logp_difference/mean": 0.014183755964040756, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 167.375, "completions/mean_terminated_length": 167.375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.5327609181404114, "epoch": 0.7097345132743362, "frac_reward_zero_std": 0.5, "grad_norm": 1.5360476310515012, "kl": 0.04352780431509018, "learning_rate": 8.160385893715112e-07, "loss": 0.0377, "num_tokens": 9343595.0, "reward": 0.375, "reward_std": 0.34156501293182373, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5233222246170044, "sampling/importance_sampling_ratio/mean": 1.0008280277252197, "sampling/importance_sampling_ratio/min": 0.6530730128288269, "sampling/sampling_logp_difference/max": 0.42606639862060547, "sampling/sampling_logp_difference/mean": 0.01846783608198166, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 124.078125, "completions/mean_terminated_length": 124.078125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.3296247124671936, "epoch": 0.7115044247787611, "frac_reward_zero_std": 0.75, "grad_norm": 1.1383720765457108, "kl": 0.03181477636098862, "learning_rate": 8.14840211594757e-07, "loss": 0.0082, "num_tokens": 9361040.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.298300862312317, "sampling/importance_sampling_ratio/mean": 0.9994966983795166, "sampling/importance_sampling_ratio/min": 0.7810361981391907, "sampling/sampling_logp_difference/max": 0.26105642318725586, "sampling/sampling_logp_difference/mean": 0.013512702658772469, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 133.34375, "completions/mean_terminated_length": 133.34375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.4810064435005188, "epoch": 0.7132743362831858, "frac_reward_zero_std": 0.75, "grad_norm": 1.3724334344488918, "kl": 0.05646929517388344, "learning_rate": 8.136388294876202e-07, "loss": 0.004, "num_tokens": 9380198.0, "reward": 0.65625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.4132767915725708, "sampling/importance_sampling_ratio/mean": 0.9991341233253479, "sampling/importance_sampling_ratio/min": 0.643557071685791, "sampling/sampling_logp_difference/max": 0.44074463844299316, "sampling/sampling_logp_difference/mean": 0.017204541712999344, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 119.796875, "completions/mean_terminated_length": 119.796875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.40257346630096436, "epoch": 0.7150442477876107, "frac_reward_zero_std": 0.75, "grad_norm": 1.5563109733512384, "kl": 0.03649323433637619, "learning_rate": 8.124344545141661e-07, "loss": 0.0446, "num_tokens": 9403401.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.6273202896118164, "sampling/importance_sampling_ratio/mean": 0.9993991255760193, "sampling/importance_sampling_ratio/min": 0.6329724788665771, "sampling/sampling_logp_difference/max": 0.4869346618652344, "sampling/sampling_logp_difference/mean": 0.01654677465558052, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 182.921875, "completions/mean_terminated_length": 182.921875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.5903284549713135, "epoch": 0.7168141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 0.9608264057391309, "kl": 0.05380180850625038, "learning_rate": 8.112270981670195e-07, "loss": 0.0267, "num_tokens": 9430356.0, "reward": 0.4375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.3267014026641846, "sampling/importance_sampling_ratio/mean": 0.9999366998672485, "sampling/importance_sampling_ratio/min": 0.6254509687423706, "sampling/sampling_logp_difference/max": 0.4692823886871338, "sampling/sampling_logp_difference/mean": 0.018537428230047226, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 117.609375, "completions/mean_terminated_length": 117.609375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.3920556604862213, "epoch": 0.7185840707964601, "frac_reward_zero_std": 1.0, "grad_norm": 0.03612522798777008, "kl": 0.04312152415513992, "learning_rate": 8.10016771967254e-07, "loss": 0.0005, "num_tokens": 9448315.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3156917095184326, "sampling/importance_sampling_ratio/mean": 0.9998875260353088, "sampling/importance_sampling_ratio/min": 0.6056219339370728, "sampling/sampling_logp_difference/max": 0.5014994144439697, "sampling/sampling_logp_difference/mean": 0.016677189618349075, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 187.65625, "completions/mean_terminated_length": 187.65625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.48553669452667236, "epoch": 0.720353982300885, "frac_reward_zero_std": 0.5, "grad_norm": 1.222016051787163, "kl": 0.04862046241760254, "learning_rate": 8.088034874642833e-07, "loss": -0.0122, "num_tokens": 9470517.0, "reward": -0.65625, "reward_std": 0.375, "rewards/decision_reward_func/mean": -0.65625, "rewards/decision_reward_func/std": 0.7605084180831909, "sampling/importance_sampling_ratio/max": 1.3227328062057495, "sampling/importance_sampling_ratio/mean": 1.000306248664856, "sampling/importance_sampling_ratio/min": 0.7772423028945923, "sampling/sampling_logp_difference/max": 0.2796999216079712, "sampling/sampling_logp_difference/mean": 0.016164880245923996, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 121.640625, "completions/mean_terminated_length": 121.640625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.41761600971221924, "epoch": 0.7221238938053097, "frac_reward_zero_std": 1.0, "grad_norm": 0.03810339140122863, "kl": 0.04997194930911064, "learning_rate": 8.0758725623575e-07, "loss": 0.0005, "num_tokens": 9488990.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2934409379959106, "sampling/importance_sampling_ratio/mean": 1.0001788139343262, "sampling/importance_sampling_ratio/min": 0.735434353351593, "sampling/sampling_logp_difference/max": 0.30729401111602783, "sampling/sampling_logp_difference/mean": 0.015452738851308823, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 160.609375, "completions/mean_terminated_length": 160.609375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.49578094482421875, "epoch": 0.7238938053097345, "frac_reward_zero_std": 0.5, "grad_norm": 1.4227949776008897, "kl": 0.057519737631082535, "learning_rate": 8.063680898874157e-07, "loss": -0.0002, "num_tokens": 9511509.0, "reward": 0.9375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.2921876907348633, "sampling/importance_sampling_ratio/mean": 0.9999642372131348, "sampling/importance_sampling_ratio/min": 0.7029215693473816, "sampling/sampling_logp_difference/max": 0.3525099754333496, "sampling/sampling_logp_difference/mean": 0.015130658634006977, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 113.078125, "completions/mean_terminated_length": 113.078125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.3672482967376709, "epoch": 0.7256637168141593, "frac_reward_zero_std": 1.0, "grad_norm": 0.04104218369204801, "kl": 0.039082761853933334, "learning_rate": 8.051460000530501e-07, "loss": 0.0004, "num_tokens": 9528218.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.295780897140503, "sampling/importance_sampling_ratio/mean": 0.999136745929718, "sampling/importance_sampling_ratio/min": 0.6808854341506958, "sampling/sampling_logp_difference/max": 0.38436126708984375, "sampling/sampling_logp_difference/mean": 0.016387619078159332, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 144.671875, "completions/mean_terminated_length": 144.671875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.2750190794467926, "epoch": 0.727433628318584, "frac_reward_zero_std": 1.0, "grad_norm": 0.02738721989729678, "kl": 0.03227146714925766, "learning_rate": 8.039209983943201e-07, "loss": 0.0003, "num_tokens": 9547013.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3791532516479492, "sampling/importance_sampling_ratio/mean": 0.9994186162948608, "sampling/importance_sampling_ratio/min": 0.7481702566146851, "sampling/sampling_logp_difference/max": 0.32146966457366943, "sampling/sampling_logp_difference/mean": 0.012029212899506092, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 126.3125, "completions/mean_terminated_length": 126.3125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.5413132905960083, "epoch": 0.7292035398230089, "frac_reward_zero_std": 1.0, "grad_norm": 0.04237971922838883, "kl": 0.07601577043533325, "learning_rate": 8.026930966006778e-07, "loss": 0.0008, "num_tokens": 9567369.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5532801151275635, "sampling/importance_sampling_ratio/mean": 0.9999905824661255, "sampling/importance_sampling_ratio/min": 0.6942715644836426, "sampling/sampling_logp_difference/max": 0.4403688907623291, "sampling/sampling_logp_difference/mean": 0.017333954572677612, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 132.078125, "completions/mean_terminated_length": 132.078125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.46055909991264343, "epoch": 0.7309734513274336, "frac_reward_zero_std": 0.75, "grad_norm": 1.2854979780959792, "kl": 0.05208270251750946, "learning_rate": 8.014623063892503e-07, "loss": -0.0066, "num_tokens": 9588606.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.306519627571106, "sampling/importance_sampling_ratio/mean": 0.999424934387207, "sampling/importance_sampling_ratio/min": 0.608730673789978, "sampling/sampling_logp_difference/max": 0.49637937545776367, "sampling/sampling_logp_difference/mean": 0.01695268228650093, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 180.5, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.6001425981521606, "epoch": 0.7327433628318584, "frac_reward_zero_std": 0.25, "grad_norm": 1.606530499810133, "kl": 0.06114194542169571, "learning_rate": 8.002286395047266e-07, "loss": -0.0075, "num_tokens": 9617982.0, "reward": 0.28125, "reward_std": 0.5457825064659119, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.288370132446289, "sampling/importance_sampling_ratio/mean": 0.9996581673622131, "sampling/importance_sampling_ratio/min": 0.7591173052787781, "sampling/sampling_logp_difference/max": 0.27559900283813477, "sampling/sampling_logp_difference/mean": 0.01883193850517273, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 147.359375, "completions/mean_terminated_length": 147.359375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.5782200694084167, "epoch": 0.7345132743362832, "frac_reward_zero_std": 0.5, "grad_norm": 1.6151609782878316, "kl": 0.061108771711587906, "learning_rate": 7.989921077192463e-07, "loss": -0.0003, "num_tokens": 9644149.0, "reward": 0.53125, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5092495679855347, "sampling/importance_sampling_ratio/mean": 0.9999904632568359, "sampling/importance_sampling_ratio/min": 0.6828128099441528, "sampling/sampling_logp_difference/max": 0.41161251068115234, "sampling/sampling_logp_difference/mean": 0.018274560570716858, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 171.65625, "completions/mean_terminated_length": 171.65625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.37550604343414307, "epoch": 0.736283185840708, "frac_reward_zero_std": 0.75, "grad_norm": 1.0364107658945485, "kl": 0.05150478333234787, "learning_rate": 7.97752722832287e-07, "loss": 0.0202, "num_tokens": 9667007.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.2921267747879028, "sampling/importance_sampling_ratio/mean": 1.0000576972961426, "sampling/importance_sampling_ratio/min": 0.7077885866165161, "sampling/sampling_logp_difference/max": 0.3456099033355713, "sampling/sampling_logp_difference/mean": 0.013839101418852806, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 129.40625, "completions/mean_terminated_length": 129.40625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.48952698707580566, "epoch": 0.7380530973451327, "frac_reward_zero_std": 0.75, "grad_norm": 1.3150585729508277, "kl": 0.05840027332305908, "learning_rate": 7.965104966705517e-07, "loss": 0.0002, "num_tokens": 9687401.0, "reward": 0.75, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4359703063964844, "sampling/importance_sampling_ratio/mean": 1.0009379386901855, "sampling/importance_sampling_ratio/min": 0.771267294883728, "sampling/sampling_logp_difference/max": 0.36184072494506836, "sampling/sampling_logp_difference/mean": 0.017398536205291748, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 133.34375, "completions/mean_terminated_length": 133.34375, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.5486599802970886, "epoch": 0.7398230088495575, "frac_reward_zero_std": 0.75, "grad_norm": 1.1852547391700583, "kl": 0.0636061578989029, "learning_rate": 7.952654410878558e-07, "loss": -0.0088, "num_tokens": 9708095.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5659292936325073, "sampling/importance_sampling_ratio/mean": 1.0008774995803833, "sampling/importance_sampling_ratio/min": 0.7863955497741699, "sampling/sampling_logp_difference/max": 0.44847941398620605, "sampling/sampling_logp_difference/mean": 0.016996072605252266, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 133.0625, "completions/mean_terminated_length": 133.0625, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.5455623865127563, "epoch": 0.7415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 1.1828438991449177, "kl": 0.07124529778957367, "learning_rate": 7.940175679650145e-07, "loss": 0.0085, "num_tokens": 9729603.0, "reward": 0.0625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.0625, "rewards/decision_reward_func/std": 1.0059348344802856, "sampling/importance_sampling_ratio/max": 1.2635778188705444, "sampling/importance_sampling_ratio/mean": 0.9996267557144165, "sampling/importance_sampling_ratio/min": 0.778712272644043, "sampling/sampling_logp_difference/max": 0.25011372566223145, "sampling/sampling_logp_difference/mean": 0.016824547201395035, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 145.5, "completions/mean_terminated_length": 145.5, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.34660637378692627, "epoch": 0.7433628318584071, "frac_reward_zero_std": 1.0, "grad_norm": 0.0312409799423157, "kl": 0.038545429706573486, "learning_rate": 7.927668892097288e-07, "loss": 0.0003, "num_tokens": 9748435.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3718312978744507, "sampling/importance_sampling_ratio/mean": 0.9999306201934814, "sampling/importance_sampling_ratio/min": 0.7005653381347656, "sampling/sampling_logp_difference/max": 0.3558676242828369, "sampling/sampling_logp_difference/mean": 0.014107013121247292, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 98.859375, "completions/mean_terminated_length": 98.859375, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.2662732005119324, "epoch": 0.7451327433628319, "frac_reward_zero_std": 1.0, "grad_norm": 0.040150729530619926, "kl": 0.03974389284849167, "learning_rate": 7.915134167564723e-07, "loss": 0.0004, "num_tokens": 9764202.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.362242579460144, "sampling/importance_sampling_ratio/mean": 0.9992560148239136, "sampling/importance_sampling_ratio/min": 0.36203813552856445, "sampling/sampling_logp_difference/max": 1.0160057544708252, "sampling/sampling_logp_difference/mean": 0.013069668784737587, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 99.25, "completions/mean_terminated_length": 99.25, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.3316885828971863, "epoch": 0.7469026548672566, "frac_reward_zero_std": 1.0, "grad_norm": 0.05377375733084318, "kl": 0.038888368755578995, "learning_rate": 7.902571625663772e-07, "loss": 0.0004, "num_tokens": 9780138.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3458665609359741, "sampling/importance_sampling_ratio/mean": 1.0000932216644287, "sampling/importance_sampling_ratio/min": 0.660549521446228, "sampling/sampling_logp_difference/max": 0.4146832227706909, "sampling/sampling_logp_difference/mean": 0.015012386254966259, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 136.453125, "completions/mean_terminated_length": 136.453125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.2856818437576294, "epoch": 0.7486725663716814, "frac_reward_zero_std": 1.0, "grad_norm": 0.05363430229119572, "kl": 0.03845176845788956, "learning_rate": 7.8899813862712e-07, "loss": 0.0004, "num_tokens": 9797879.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4129092693328857, "sampling/importance_sampling_ratio/mean": 0.999391496181488, "sampling/importance_sampling_ratio/min": 0.6080278754234314, "sampling/sampling_logp_difference/max": 0.49753451347351074, "sampling/sampling_logp_difference/mean": 0.011145094409584999, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 121.828125, "completions/mean_terminated_length": 121.828125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.4224778413772583, "epoch": 0.7504424778761062, "frac_reward_zero_std": 1.0, "grad_norm": 0.038346174136042306, "kl": 0.05765840411186218, "learning_rate": 7.877363569528075e-07, "loss": 0.0005, "num_tokens": 9815260.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.384250521659851, "sampling/importance_sampling_ratio/mean": 1.0004199743270874, "sampling/importance_sampling_ratio/min": 0.7318248152732849, "sampling/sampling_logp_difference/max": 0.32515883445739746, "sampling/sampling_logp_difference/mean": 0.015470411628484726, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 188.0, "completions/max_terminated_length": 188.0, "completions/mean_length": 113.9375, "completions/mean_terminated_length": 113.9375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.35458582639694214, "epoch": 0.7522123893805309, "frac_reward_zero_std": 1.0, "grad_norm": 0.03858518652079706, "kl": 0.042834073305130005, "learning_rate": 7.864718295838614e-07, "loss": 0.0004, "num_tokens": 9833256.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.295134425163269, "sampling/importance_sampling_ratio/mean": 1.0006794929504395, "sampling/importance_sampling_ratio/min": 0.6788650155067444, "sampling/sampling_logp_difference/max": 0.3873330354690552, "sampling/sampling_logp_difference/mean": 0.013968059793114662, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 196.34375, "completions/mean_terminated_length": 196.34375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.5816829204559326, "epoch": 0.7539823008849558, "frac_reward_zero_std": 0.5, "grad_norm": 1.251531006828788, "kl": 0.061315182596445084, "learning_rate": 7.852045685869044e-07, "loss": 0.0473, "num_tokens": 9860062.0, "reward": 0.8125, "reward_std": 0.3943893015384674, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0004279613494873, "sampling/importance_sampling_ratio/min": 0.6298375129699707, "sampling/sampling_logp_difference/max": 0.8718147277832031, "sampling/sampling_logp_difference/mean": 0.017862189561128616, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 91.1875, "completions/mean_terminated_length": 91.1875, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.3042639493942261, "epoch": 0.7557522123893805, "frac_reward_zero_std": 1.0, "grad_norm": 0.051794052447220504, "kl": 0.0483333058655262, "learning_rate": 7.839345860546447e-07, "loss": 0.0005, "num_tokens": 9875098.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4351999759674072, "sampling/importance_sampling_ratio/mean": 1.000496745109558, "sampling/importance_sampling_ratio/min": 0.6453400254249573, "sampling/sampling_logp_difference/max": 0.43797802925109863, "sampling/sampling_logp_difference/mean": 0.014278100803494453, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 228.953125, "completions/mean_terminated_length": 228.953125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.43442392349243164, "epoch": 0.7575221238938054, "frac_reward_zero_std": 0.75, "grad_norm": 0.6040956124876707, "kl": 0.04339404031634331, "learning_rate": 7.826618941057597e-07, "loss": -0.0081, "num_tokens": 9901031.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.344829797744751, "sampling/importance_sampling_ratio/mean": 0.9997045993804932, "sampling/importance_sampling_ratio/min": 0.7581149339675903, "sampling/sampling_logp_difference/max": 0.29626739025115967, "sampling/sampling_logp_difference/mean": 0.014257797971367836, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 144.875, "completions/mean_terminated_length": 144.875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.4813069701194763, "epoch": 0.7592920353982301, "frac_reward_zero_std": 0.75, "grad_norm": 1.5604049958307447, "kl": 0.0463547520339489, "learning_rate": 7.813865048847818e-07, "loss": 0.0146, "num_tokens": 9921391.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6355764865875244, "sampling/importance_sampling_ratio/mean": 1.0004425048828125, "sampling/importance_sampling_ratio/min": 0.6171419620513916, "sampling/sampling_logp_difference/max": 0.49199533462524414, "sampling/sampling_logp_difference/mean": 0.01544689480215311, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 186.9375, "completions/mean_terminated_length": 186.9375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.5194053649902344, "epoch": 0.7610619469026548, "frac_reward_zero_std": 0.75, "grad_norm": 0.7701415244045536, "kl": 0.062382422387599945, "learning_rate": 7.801084305619818e-07, "loss": 0.014, "num_tokens": 9946155.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.811177134513855, "sampling/importance_sampling_ratio/mean": 1.0002188682556152, "sampling/importance_sampling_ratio/min": 0.6984061598777771, "sampling/sampling_logp_difference/max": 0.5939769744873047, "sampling/sampling_logp_difference/mean": 0.017058037221431732, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 133.421875, "completions/mean_terminated_length": 133.421875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.32108262181282043, "epoch": 0.7628318584070797, "frac_reward_zero_std": 1.0, "grad_norm": 0.02278105974564533, "kl": 0.023188531398773193, "learning_rate": 7.788276833332525e-07, "loss": 0.0002, "num_tokens": 9964166.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3770564794540405, "sampling/importance_sampling_ratio/mean": 1.000881314277649, "sampling/importance_sampling_ratio/min": 0.6873236894607544, "sampling/sampling_logp_difference/max": 0.37494993209838867, "sampling/sampling_logp_difference/mean": 0.012426692992448807, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 198.578125, "completions/mean_terminated_length": 198.578125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.5111083984375, "epoch": 0.7646017699115044, "frac_reward_zero_std": 0.5, "grad_norm": 1.2206219434768952, "kl": 0.05128113925457001, "learning_rate": 7.775442754199928e-07, "loss": 0.0149, "num_tokens": 9989691.0, "reward": 0.40625, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.65772545337677, "sampling/importance_sampling_ratio/mean": 0.9998399615287781, "sampling/importance_sampling_ratio/min": 0.6675968170166016, "sampling/sampling_logp_difference/max": 0.5054464340209961, "sampling/sampling_logp_difference/mean": 0.01742260530591011, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 176.34375, "completions/mean_terminated_length": 176.34375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.5092491507530212, "epoch": 0.7663716814159292, "frac_reward_zero_std": 0.5, "grad_norm": 1.230637386431485, "kl": 0.04463362321257591, "learning_rate": 7.76258219068991e-07, "loss": 0.038, "num_tokens": 10012513.0, "reward": 0.5625, "reward_std": 0.3265564441680908, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.5303585529327393, "sampling/importance_sampling_ratio/mean": 0.9997532367706299, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.45119690895080566, "sampling/sampling_logp_difference/mean": 0.016243431717157364, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 141.046875, "completions/mean_terminated_length": 141.046875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.44375866651535034, "epoch": 0.768141592920354, "frac_reward_zero_std": 0.5, "grad_norm": 1.401467803600503, "kl": 0.04483048617839813, "learning_rate": 7.749695265523075e-07, "loss": -0.0296, "num_tokens": 10032740.0, "reward": 0.25, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4549647569656372, "sampling/importance_sampling_ratio/mean": 0.9999812841415405, "sampling/importance_sampling_ratio/min": 0.6262632608413696, "sampling/sampling_logp_difference/max": 0.4679844379425049, "sampling/sampling_logp_difference/mean": 0.015169515274465084, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 117.875, "completions/mean_terminated_length": 117.875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.28713029623031616, "epoch": 0.7699115044247787, "frac_reward_zero_std": 1.0, "grad_norm": 0.041039635985315244, "kl": 0.030406465753912926, "learning_rate": 7.736782101671586e-07, "loss": 0.0004, "num_tokens": 10050092.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4981794357299805, "sampling/importance_sampling_ratio/mean": 1.0000543594360352, "sampling/importance_sampling_ratio/min": 0.6956551671028137, "sampling/sampling_logp_difference/max": 0.4042506217956543, "sampling/sampling_logp_difference/mean": 0.013050006702542305, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 161.71875, "completions/mean_terminated_length": 161.71875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.45859166979789734, "epoch": 0.7716814159292036, "frac_reward_zero_std": 0.75, "grad_norm": 0.896432444156014, "kl": 0.05509470775723457, "learning_rate": 7.723842822357979e-07, "loss": -0.0129, "num_tokens": 10072474.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.2872775793075562, "sampling/importance_sampling_ratio/mean": 0.9995081424713135, "sampling/importance_sampling_ratio/min": 0.7153425216674805, "sampling/sampling_logp_difference/max": 0.334993839263916, "sampling/sampling_logp_difference/mean": 0.01598282903432846, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 93.078125, "completions/mean_terminated_length": 93.078125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.3131830394268036, "epoch": 0.7734513274336283, "frac_reward_zero_std": 1.0, "grad_norm": 0.03250132240398681, "kl": 0.023867376148700714, "learning_rate": 7.710877551054003e-07, "loss": 0.0002, "num_tokens": 10089263.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.301871657371521, "sampling/importance_sampling_ratio/mean": 1.0001564025878906, "sampling/importance_sampling_ratio/min": 0.6190708875656128, "sampling/sampling_logp_difference/max": 0.4795355796813965, "sampling/sampling_logp_difference/mean": 0.014258707873523235, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 117.578125, "completions/mean_terminated_length": 117.578125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.37190645933151245, "epoch": 0.7752212389380531, "frac_reward_zero_std": 1.0, "grad_norm": 0.04040951774703129, "kl": 0.04006647691130638, "learning_rate": 7.697886411479421e-07, "loss": 0.0005, "num_tokens": 10107716.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3976877927780151, "sampling/importance_sampling_ratio/mean": 0.9995604753494263, "sampling/importance_sampling_ratio/min": 0.6151885390281677, "sampling/sampling_logp_difference/max": 0.4858264923095703, "sampling/sampling_logp_difference/mean": 0.01616573892533779, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 105.859375, "completions/mean_terminated_length": 105.859375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.3533839285373688, "epoch": 0.7769911504424779, "frac_reward_zero_std": 0.75, "grad_norm": 1.2171059916419273, "kl": 0.031037643551826477, "learning_rate": 7.684869527600856e-07, "loss": 0.0118, "num_tokens": 10124987.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.2914077043533325, "sampling/importance_sampling_ratio/mean": 1.0000272989273071, "sampling/importance_sampling_ratio/min": 0.7802943587303162, "sampling/sampling_logp_difference/max": 0.25573277473449707, "sampling/sampling_logp_difference/mean": 0.014694828540086746, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 123.15625, "completions/mean_terminated_length": 123.15625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.35793831944465637, "epoch": 0.7787610619469026, "frac_reward_zero_std": 0.75, "grad_norm": 1.1659824024798076, "kl": 0.03780563920736313, "learning_rate": 7.671827023630579e-07, "loss": 0.0051, "num_tokens": 10141941.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.2982903718948364, "sampling/importance_sampling_ratio/mean": 0.9997375011444092, "sampling/importance_sampling_ratio/min": 0.7020929455757141, "sampling/sampling_logp_difference/max": 0.35368943214416504, "sampling/sampling_logp_difference/mean": 0.015126901678740978, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 164.71875, "completions/mean_terminated_length": 164.71875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.4047059714794159, "epoch": 0.7805309734513274, "frac_reward_zero_std": 0.75, "grad_norm": 0.8609898222707638, "kl": 0.03574547544121742, "learning_rate": 7.658759024025347e-07, "loss": 0.0028, "num_tokens": 10163923.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.6026140451431274, "sampling/importance_sampling_ratio/mean": 0.9995793104171753, "sampling/importance_sampling_ratio/min": 0.6657065153121948, "sampling/sampling_logp_difference/max": 0.4716360569000244, "sampling/sampling_logp_difference/mean": 0.014743344858288765, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 151.5625, "completions/mean_terminated_length": 151.5625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.33742740750312805, "epoch": 0.7823008849557522, "frac_reward_zero_std": 1.0, "grad_norm": 0.026046778829816937, "kl": 0.019752195104956627, "learning_rate": 7.645665653485205e-07, "loss": 0.0002, "num_tokens": 10183463.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4193668365478516, "sampling/importance_sampling_ratio/mean": 0.9995388388633728, "sampling/importance_sampling_ratio/min": 0.7518067359924316, "sampling/sampling_logp_difference/max": 0.35021090507507324, "sampling/sampling_logp_difference/mean": 0.01452592946588993, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 135.75, "completions/mean_terminated_length": 135.75, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.42516061663627625, "epoch": 0.784070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 1.1784864186775827, "kl": 0.03952057659626007, "learning_rate": 7.632547036952295e-07, "loss": 0.0116, "num_tokens": 10202647.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.3767832517623901, "sampling/importance_sampling_ratio/mean": 1.0001153945922852, "sampling/importance_sampling_ratio/min": 0.6407551169395447, "sampling/sampling_logp_difference/max": 0.44510793685913086, "sampling/sampling_logp_difference/mean": 0.016151659190654755, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 126.921875, "completions/mean_terminated_length": 126.921875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.3769015669822693, "epoch": 0.7858407079646018, "frac_reward_zero_std": 0.75, "grad_norm": 1.2305521421237382, "kl": 0.0339299738407135, "learning_rate": 7.619403299609667e-07, "loss": 0.0263, "num_tokens": 10221298.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.2958794832229614, "sampling/importance_sampling_ratio/mean": 0.9998077154159546, "sampling/importance_sampling_ratio/min": 0.628684401512146, "sampling/sampling_logp_difference/max": 0.4641258716583252, "sampling/sampling_logp_difference/mean": 0.015832651406526566, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 152.078125, "completions/mean_terminated_length": 152.078125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.5478400588035583, "epoch": 0.7876106194690266, "frac_reward_zero_std": 0.5, "grad_norm": 1.4222336373618596, "kl": 0.06323939561843872, "learning_rate": 7.606234566880088e-07, "loss": 0.0088, "num_tokens": 10241543.0, "reward": 0.5625, "reward_std": 0.49553054571151733, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.2920238971710205, "sampling/importance_sampling_ratio/mean": 0.9995840191841125, "sampling/importance_sampling_ratio/min": 0.7800106406211853, "sampling/sampling_logp_difference/max": 0.2562098503112793, "sampling/sampling_logp_difference/mean": 0.019120126962661743, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 112.328125, "completions/mean_terminated_length": 112.328125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.3097269535064697, "epoch": 0.7893805309734513, "frac_reward_zero_std": 1.0, "grad_norm": 0.04644537650187056, "kl": 0.031152140349149704, "learning_rate": 7.593040964424835e-07, "loss": 0.0003, "num_tokens": 10258044.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3562997579574585, "sampling/importance_sampling_ratio/mean": 0.99954754114151, "sampling/importance_sampling_ratio/min": 0.6547458171844482, "sampling/sampling_logp_difference/max": 0.4235081672668457, "sampling/sampling_logp_difference/mean": 0.014366969466209412, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 155.09375, "completions/mean_terminated_length": 155.09375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.416738897562027, "epoch": 0.7911504424778761, "frac_reward_zero_std": 0.75, "grad_norm": 1.0408398377802692, "kl": 0.06408806890249252, "learning_rate": 7.579822618142503e-07, "loss": 0.0155, "num_tokens": 10278786.0, "reward": -0.15625, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": -0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.4058080911636353, "sampling/importance_sampling_ratio/mean": 1.0004191398620605, "sampling/importance_sampling_ratio/min": 0.7061727643013, "sampling/sampling_logp_difference/max": 0.34789538383483887, "sampling/sampling_logp_difference/mean": 0.015817837789654732, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 147.859375, "completions/mean_terminated_length": 147.859375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.4140564203262329, "epoch": 0.7929203539823009, "frac_reward_zero_std": 0.75, "grad_norm": 1.0493445376584571, "kl": 0.04653691127896309, "learning_rate": 7.56657965416781e-07, "loss": -0.0014, "num_tokens": 10299145.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5432850122451782, "sampling/importance_sampling_ratio/mean": 1.0006117820739746, "sampling/importance_sampling_ratio/min": 0.6773489117622375, "sampling/sampling_logp_difference/max": 0.4339132308959961, "sampling/sampling_logp_difference/mean": 0.015949150547385216, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 122.0625, "completions/mean_terminated_length": 122.0625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.3602309823036194, "epoch": 0.7946902654867256, "frac_reward_zero_std": 0.75, "grad_norm": 1.3145517594184841, "kl": 0.04791180044412613, "learning_rate": 7.553312198870372e-07, "loss": -0.0096, "num_tokens": 10319565.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.3871322870254517, "sampling/importance_sampling_ratio/mean": 0.999633252620697, "sampling/importance_sampling_ratio/min": 0.7378374338150024, "sampling/sampling_logp_difference/max": 0.3272385597229004, "sampling/sampling_logp_difference/mean": 0.015328210778534412, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 141.4375, "completions/mean_terminated_length": 141.4375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.346060186624527, "epoch": 0.7964601769911505, "frac_reward_zero_std": 1.0, "grad_norm": 0.05853215178325867, "kl": 0.035528093576431274, "learning_rate": 7.540020378853522e-07, "loss": 0.0004, "num_tokens": 10340185.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.326745867729187, "sampling/importance_sampling_ratio/mean": 1.000696063041687, "sampling/importance_sampling_ratio/min": 0.668019711971283, "sampling/sampling_logp_difference/max": 0.40343761444091797, "sampling/sampling_logp_difference/mean": 0.014905279502272606, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 146.625, "completions/mean_terminated_length": 146.625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.4376601576805115, "epoch": 0.7982300884955752, "frac_reward_zero_std": 0.5, "grad_norm": 1.6127170475491959, "kl": 0.061810243874788284, "learning_rate": 7.52670432095309e-07, "loss": -0.021, "num_tokens": 10360865.0, "reward": 0.59375, "reward_std": 0.497555673122406, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.3121674060821533, "sampling/importance_sampling_ratio/mean": 1.0003948211669922, "sampling/importance_sampling_ratio/min": 0.6485395431518555, "sampling/sampling_logp_difference/max": 0.4330322742462158, "sampling/sampling_logp_difference/mean": 0.016728384420275688, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 142.09375, "completions/mean_terminated_length": 142.09375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.4573453962802887, "epoch": 0.8, "frac_reward_zero_std": 0.75, "grad_norm": 1.135755391878933, "kl": 0.06898960471153259, "learning_rate": 7.513364152236185e-07, "loss": 0.0024, "num_tokens": 10382919.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.532122015953064, "sampling/importance_sampling_ratio/mean": 1.0002293586730957, "sampling/importance_sampling_ratio/min": 0.6331928968429565, "sampling/sampling_logp_difference/max": 0.45698022842407227, "sampling/sampling_logp_difference/mean": 0.017683709040284157, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 125.828125, "completions/mean_terminated_length": 125.828125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.36240512132644653, "epoch": 0.8017699115044248, "frac_reward_zero_std": 1.0, "grad_norm": 0.09174085224713212, "kl": 0.06328853964805603, "learning_rate": 7.5e-07, "loss": 0.0007, "num_tokens": 10401644.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3621101379394531, "sampling/importance_sampling_ratio/mean": 0.9999123811721802, "sampling/importance_sampling_ratio/min": 0.6479644775390625, "sampling/sampling_logp_difference/max": 0.43391942977905273, "sampling/sampling_logp_difference/mean": 0.014787204563617706, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 150.046875, "completions/mean_terminated_length": 150.046875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "entropy": 0.3293919563293457, "epoch": 0.8035398230088495, "frac_reward_zero_std": 1.0, "grad_norm": 0.04347173613789339, "kl": 0.03755633533000946, "learning_rate": 7.486611991770585e-07, "loss": 0.0004, "num_tokens": 10422335.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3207083940505981, "sampling/importance_sampling_ratio/mean": 0.9996446371078491, "sampling/importance_sampling_ratio/min": 0.6304682493209839, "sampling/sampling_logp_difference/max": 0.4612925052642822, "sampling/sampling_logp_difference/mean": 0.013409500941634178, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 124.96875, "completions/mean_terminated_length": 124.96875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.30635595321655273, "epoch": 0.8053097345132744, "frac_reward_zero_std": 1.0, "grad_norm": 0.047622189612138245, "kl": 0.033256709575653076, "learning_rate": 7.473200255301634e-07, "loss": 0.0004, "num_tokens": 10441773.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5281130075454712, "sampling/importance_sampling_ratio/mean": 1.0002222061157227, "sampling/importance_sampling_ratio/min": 0.6043351292610168, "sampling/sampling_logp_difference/max": 0.5036263465881348, "sampling/sampling_logp_difference/mean": 0.015033609233796597, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 173.5625, "completions/mean_terminated_length": 173.5625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.43391820788383484, "epoch": 0.8070796460176991, "frac_reward_zero_std": 0.75, "grad_norm": 1.1702017725660032, "kl": 0.06150873750448227, "learning_rate": 7.459764918573264e-07, "loss": -0.0113, "num_tokens": 10464657.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4234800338745117, "sampling/importance_sampling_ratio/mean": 0.9993876218795776, "sampling/importance_sampling_ratio/min": 0.6902082562446594, "sampling/sampling_logp_difference/max": 0.3707618713378906, "sampling/sampling_logp_difference/mean": 0.016423087567090988, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 121.109375, "completions/mean_terminated_length": 121.109375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.270940899848938, "epoch": 0.8088495575221238, "frac_reward_zero_std": 1.0, "grad_norm": 0.0566857769828913, "kl": 0.040656525641679764, "learning_rate": 7.446306109790797e-07, "loss": 0.0004, "num_tokens": 10482408.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3243651390075684, "sampling/importance_sampling_ratio/mean": 0.999417781829834, "sampling/importance_sampling_ratio/min": 0.6547061800956726, "sampling/sampling_logp_difference/max": 0.4235687255859375, "sampling/sampling_logp_difference/mean": 0.012482231482863426, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 122.40625, "completions/mean_terminated_length": 122.40625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.27112001180648804, "epoch": 0.8106194690265487, "frac_reward_zero_std": 1.0, "grad_norm": 0.04500104098821302, "kl": 0.025868939235806465, "learning_rate": 7.432823957383531e-07, "loss": 0.0003, "num_tokens": 10500258.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3218246698379517, "sampling/importance_sampling_ratio/mean": 0.9997153282165527, "sampling/importance_sampling_ratio/min": 0.695496678352356, "sampling/sampling_logp_difference/max": 0.36312901973724365, "sampling/sampling_logp_difference/mean": 0.011474039405584335, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 121.546875, "completions/mean_terminated_length": 121.546875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.42170801758766174, "epoch": 0.8123893805309734, "frac_reward_zero_std": 0.75, "grad_norm": 1.4227323614675174, "kl": 0.05972127616405487, "learning_rate": 7.419318590003523e-07, "loss": -0.0236, "num_tokens": 10520549.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.2994980812072754, "sampling/importance_sampling_ratio/mean": 1.0001108646392822, "sampling/importance_sampling_ratio/min": 0.6151829361915588, "sampling/sampling_logp_difference/max": 0.48583555221557617, "sampling/sampling_logp_difference/mean": 0.017824754118919373, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 149.5, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.3732178807258606, "epoch": 0.8141592920353983, "frac_reward_zero_std": 1.0, "grad_norm": 0.05982100635587919, "kl": 0.04914889484643936, "learning_rate": 7.405790136524352e-07, "loss": 0.0005, "num_tokens": 10539333.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.672418475151062, "sampling/importance_sampling_ratio/mean": 0.9997685551643372, "sampling/importance_sampling_ratio/min": 0.6069730520248413, "sampling/sampling_logp_difference/max": 0.5142707824707031, "sampling/sampling_logp_difference/mean": 0.014909964986145496, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 140.828125, "completions/mean_terminated_length": 140.828125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.34480398893356323, "epoch": 0.815929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 1.1062441954754016, "kl": 0.04970972612500191, "learning_rate": 7.392238726039897e-07, "loss": 0.0043, "num_tokens": 10557898.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.2949442863464355, "sampling/importance_sampling_ratio/mean": 0.9999631643295288, "sampling/importance_sampling_ratio/min": 0.47875940799713135, "sampling/sampling_logp_difference/max": 0.736557126045227, "sampling/sampling_logp_difference/mean": 0.014596865512430668, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 187.9375, "completions/mean_terminated_length": 187.9375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.49509763717651367, "epoch": 0.8176991150442477, "frac_reward_zero_std": 0.25, "grad_norm": 1.3417823296242846, "kl": 0.06408271193504333, "learning_rate": 7.378664487863102e-07, "loss": 0.0008, "num_tokens": 10579830.0, "reward": 0.28125, "reward_std": 0.5061737298965454, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.6127060651779175, "sampling/importance_sampling_ratio/mean": 0.9998746514320374, "sampling/importance_sampling_ratio/min": 0.7186934947967529, "sampling/sampling_logp_difference/max": 0.47791361808776855, "sampling/sampling_logp_difference/mean": 0.016279179602861404, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 124.421875, "completions/mean_terminated_length": 124.421875, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.38590937852859497, "epoch": 0.8194690265486726, "frac_reward_zero_std": 1.0, "grad_norm": 0.04903077285315654, "kl": 0.05098006874322891, "learning_rate": 7.365067551524739e-07, "loss": 0.0007, "num_tokens": 10599633.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3535634279251099, "sampling/importance_sampling_ratio/mean": 0.9995144605636597, "sampling/importance_sampling_ratio/min": 0.6368655562400818, "sampling/sampling_logp_difference/max": 0.45119667053222656, "sampling/sampling_logp_difference/mean": 0.01614277996122837, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 86.09375, "completions/mean_terminated_length": 86.09375, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.3199424147605896, "epoch": 0.8212389380530973, "frac_reward_zero_std": 1.0, "grad_norm": 0.0330539341227297, "kl": 0.027219675481319427, "learning_rate": 7.351448046772177e-07, "loss": 0.0003, "num_tokens": 10614487.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3655897378921509, "sampling/importance_sampling_ratio/mean": 0.9999443292617798, "sampling/importance_sampling_ratio/min": 0.6294812560081482, "sampling/sampling_logp_difference/max": 0.4628591537475586, "sampling/sampling_logp_difference/mean": 0.01586727797985077, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 156.625, "completions/mean_terminated_length": 156.625, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.4999556243419647, "epoch": 0.8230088495575221, "frac_reward_zero_std": 0.5, "grad_norm": 1.4178807890164686, "kl": 0.05209490656852722, "learning_rate": 7.33780610356814e-07, "loss": 0.0135, "num_tokens": 10637375.0, "reward": 0.78125, "reward_std": 0.4101392924785614, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5122649669647217, "sampling/importance_sampling_ratio/mean": 1.0001907348632812, "sampling/importance_sampling_ratio/min": 0.7380893230438232, "sampling/sampling_logp_difference/max": 0.4136085510253906, "sampling/sampling_logp_difference/mean": 0.016630396246910095, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 147.515625, "completions/mean_terminated_length": 147.515625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.4395650029182434, "epoch": 0.8247787610619469, "frac_reward_zero_std": 0.75, "grad_norm": 1.1663733853942424, "kl": 0.04514811187982559, "learning_rate": 7.324141852089471e-07, "loss": -0.065, "num_tokens": 10658352.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.3001909255981445, "sampling/importance_sampling_ratio/mean": 0.9995812177658081, "sampling/importance_sampling_ratio/min": 0.6797474026679993, "sampling/sampling_logp_difference/max": 0.3860340118408203, "sampling/sampling_logp_difference/mean": 0.016010360792279243, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 149.375, "completions/mean_terminated_length": 149.375, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.5551590919494629, "epoch": 0.8265486725663717, "frac_reward_zero_std": 0.75, "grad_norm": 1.0624761608590831, "kl": 0.061054326593875885, "learning_rate": 7.310455422725889e-07, "loss": 0.0303, "num_tokens": 10680408.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.2980353832244873, "sampling/importance_sampling_ratio/mean": 1.0006985664367676, "sampling/importance_sampling_ratio/min": 0.7282189726829529, "sampling/sampling_logp_difference/max": 0.3171534538269043, "sampling/sampling_logp_difference/mean": 0.017895396798849106, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 114.75, "completions/mean_terminated_length": 114.75, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.4307693839073181, "epoch": 0.8283185840707965, "frac_reward_zero_std": 0.75, "grad_norm": 1.3482466982523051, "kl": 0.051996998488903046, "learning_rate": 7.296746946078736e-07, "loss": 0.0043, "num_tokens": 10698520.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.3564072847366333, "sampling/importance_sampling_ratio/mean": 0.9999678730964661, "sampling/importance_sampling_ratio/min": 0.7147090435028076, "sampling/sampling_logp_difference/max": 0.3358798027038574, "sampling/sampling_logp_difference/mean": 0.015811704099178314, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 177.46875, "completions/mean_terminated_length": 177.46875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.40639728307724, "epoch": 0.8300884955752212, "frac_reward_zero_std": 0.75, "grad_norm": 0.9309105446327044, "kl": 0.03369994834065437, "learning_rate": 7.283016552959744e-07, "loss": 0.0127, "num_tokens": 10719158.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.2958474159240723, "sampling/importance_sampling_ratio/mean": 0.9998207092285156, "sampling/importance_sampling_ratio/min": 0.7702411413192749, "sampling/sampling_logp_difference/max": 0.26105165481567383, "sampling/sampling_logp_difference/mean": 0.013439834117889404, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 153.640625, "completions/mean_terminated_length": 153.640625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.4660520553588867, "epoch": 0.831858407079646, "frac_reward_zero_std": 0.75, "grad_norm": 1.2011060342315654, "kl": 0.04497406259179115, "learning_rate": 7.26926437438978e-07, "loss": -0.0239, "num_tokens": 10739599.0, "reward": 0.8125, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.8125, "rewards/decision_reward_func/std": 0.5875696539878845, "sampling/importance_sampling_ratio/max": 1.4329568147659302, "sampling/importance_sampling_ratio/mean": 1.0004122257232666, "sampling/importance_sampling_ratio/min": 0.7717304229736328, "sampling/sampling_logp_difference/max": 0.3597400188446045, "sampling/sampling_logp_difference/mean": 0.015442630276083946, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 137.578125, "completions/mean_terminated_length": 137.578125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.42849624156951904, "epoch": 0.8336283185840708, "frac_reward_zero_std": 0.75, "grad_norm": 1.453756598093294, "kl": 0.04561780393123627, "learning_rate": 7.255490541597594e-07, "loss": 0.0486, "num_tokens": 10765268.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.6207348108291626, "sampling/importance_sampling_ratio/mean": 1.000959873199463, "sampling/importance_sampling_ratio/min": 0.6237696409225464, "sampling/sampling_logp_difference/max": 0.482879638671875, "sampling/sampling_logp_difference/mean": 0.016448060050606728, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 126.0, "completions/mean_terminated_length": 126.0, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.30105122923851013, "epoch": 0.8353982300884956, "frac_reward_zero_std": 1.0, "grad_norm": 0.03465108632451289, "kl": 0.02864006534218788, "learning_rate": 7.241695186018573e-07, "loss": 0.0003, "num_tokens": 10781780.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5226410627365112, "sampling/importance_sampling_ratio/mean": 1.0001323223114014, "sampling/importance_sampling_ratio/min": 0.672307014465332, "sampling/sampling_logp_difference/max": 0.42044639587402344, "sampling/sampling_logp_difference/mean": 0.013548655435442924, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 204.28125, "completions/mean_terminated_length": 204.28125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.3732837438583374, "epoch": 0.8371681415929203, "frac_reward_zero_std": 1.0, "grad_norm": 0.020149862214229524, "kl": 0.03531515970826149, "learning_rate": 7.227878439293476e-07, "loss": 0.0003, "num_tokens": 10804550.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3706494569778442, "sampling/importance_sampling_ratio/mean": 1.0007022619247437, "sampling/importance_sampling_ratio/min": 0.621472179889679, "sampling/sampling_logp_difference/max": 0.4756641387939453, "sampling/sampling_logp_difference/mean": 0.012865693308413029, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 155.875, "completions/mean_terminated_length": 155.875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.585537314414978, "epoch": 0.8389380530973451, "frac_reward_zero_std": 0.75, "grad_norm": 0.9360960877970692, "kl": 0.06327377259731293, "learning_rate": 7.214040433267198e-07, "loss": 0.0017, "num_tokens": 10827886.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.4348258972167969, "sampling/importance_sampling_ratio/mean": 0.9995330572128296, "sampling/importance_sampling_ratio/min": 0.6743716597557068, "sampling/sampling_logp_difference/max": 0.3939739465713501, "sampling/sampling_logp_difference/mean": 0.01880214363336563, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 150.359375, "completions/mean_terminated_length": 150.359375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.5375866889953613, "epoch": 0.8407079646017699, "frac_reward_zero_std": 0.75, "grad_norm": 1.147876242707666, "kl": 0.053672075271606445, "learning_rate": 7.200181299987482e-07, "loss": 0.0424, "num_tokens": 10849205.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.456792950630188, "sampling/importance_sampling_ratio/mean": 0.9994627833366394, "sampling/importance_sampling_ratio/min": 0.6453744769096375, "sampling/sampling_logp_difference/max": 0.4379246234893799, "sampling/sampling_logp_difference/mean": 0.018585899844765663, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 128.609375, "completions/mean_terminated_length": 128.609375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.3609449863433838, "epoch": 0.8424778761061947, "frac_reward_zero_std": 1.0, "grad_norm": 0.028518032051111687, "kl": 0.03358837962150574, "learning_rate": 7.186301171703688e-07, "loss": 0.0004, "num_tokens": 10867388.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3706374168395996, "sampling/importance_sampling_ratio/mean": 1.0000519752502441, "sampling/importance_sampling_ratio/min": 0.6793352365493774, "sampling/sampling_logp_difference/max": 0.3866405487060547, "sampling/sampling_logp_difference/mean": 0.014430605806410313, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 144.96875, "completions/mean_terminated_length": 144.96875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.5752789378166199, "epoch": 0.8442477876106195, "frac_reward_zero_std": 0.75, "grad_norm": 1.062236280844594, "kl": 0.06892964243888855, "learning_rate": 7.172400180865513e-07, "loss": -0.0185, "num_tokens": 10888986.0, "reward": 0.84375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.84375, "rewards/decision_reward_func/std": 0.5409794449806213, "sampling/importance_sampling_ratio/max": 1.2689205408096313, "sampling/importance_sampling_ratio/mean": 0.9994760751724243, "sampling/importance_sampling_ratio/min": 0.7767874002456665, "sampling/sampling_logp_difference/max": 0.2525886297225952, "sampling/sampling_logp_difference/mean": 0.01805744506418705, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 173.859375, "completions/mean_terminated_length": 173.859375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.5288122892379761, "epoch": 0.8460176991150442, "frac_reward_zero_std": 0.5, "grad_norm": 1.3337386411903083, "kl": 0.05211670696735382, "learning_rate": 7.158478460121734e-07, "loss": 0.0176, "num_tokens": 10911441.0, "reward": 0.75, "reward_std": 0.42078250646591187, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.4720792770385742, "sampling/importance_sampling_ratio/mean": 0.9999220371246338, "sampling/importance_sampling_ratio/min": 0.6960464119911194, "sampling/sampling_logp_difference/max": 0.3866758346557617, "sampling/sampling_logp_difference/mean": 0.016892949119210243, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 252.328125, "completions/mean_terminated_length": 252.328125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.6607073545455933, "epoch": 0.8477876106194691, "frac_reward_zero_std": 0.0, "grad_norm": 1.3976858016934577, "kl": 0.07387363910675049, "learning_rate": 7.144536142318944e-07, "loss": 0.0108, "num_tokens": 10939126.0, "reward": 0.28125, "reward_std": 0.6601393222808838, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.2892944812774658, "sampling/importance_sampling_ratio/mean": 0.9996263980865479, "sampling/importance_sampling_ratio/min": 0.6066848635673523, "sampling/sampling_logp_difference/max": 0.49974584579467773, "sampling/sampling_logp_difference/mean": 0.018056483939290047, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 144.609375, "completions/mean_terminated_length": 144.609375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.5071415901184082, "epoch": 0.8495575221238938, "frac_reward_zero_std": 0.75, "grad_norm": 1.1595070620281003, "kl": 0.06269523501396179, "learning_rate": 7.130573360500276e-07, "loss": 0.0044, "num_tokens": 10960237.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.4925907850265503, "sampling/importance_sampling_ratio/mean": 1.000237226486206, "sampling/importance_sampling_ratio/min": 0.694139301776886, "sampling/sampling_logp_difference/max": 0.4005134105682373, "sampling/sampling_logp_difference/mean": 0.01662774570286274, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 151.71875, "completions/mean_terminated_length": 151.71875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.49337881803512573, "epoch": 0.8513274336283185, "frac_reward_zero_std": 1.0, "grad_norm": 0.036517167279457426, "kl": 0.05040563642978668, "learning_rate": 7.116590247904143e-07, "loss": 0.0005, "num_tokens": 10979451.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3399325609207153, "sampling/importance_sampling_ratio/mean": 0.9999549984931946, "sampling/importance_sampling_ratio/min": 0.7470673322677612, "sampling/sampling_logp_difference/max": 0.2926192283630371, "sampling/sampling_logp_difference/mean": 0.016386952251195908, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 160.0, "completions/mean_terminated_length": 160.0, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.385990709066391, "epoch": 0.8530973451327434, "frac_reward_zero_std": 1.0, "grad_norm": 0.02866313871220871, "kl": 0.043752796947956085, "learning_rate": 7.10258693796296e-07, "loss": 0.0005, "num_tokens": 10999867.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4061437845230103, "sampling/importance_sampling_ratio/mean": 1.0009467601776123, "sampling/importance_sampling_ratio/min": 0.7561548352241516, "sampling/sampling_logp_difference/max": 0.3408510684967041, "sampling/sampling_logp_difference/mean": 0.013079589232802391, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 223.21875, "completions/mean_terminated_length": 223.21875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.4007635712623596, "epoch": 0.8548672566371681, "frac_reward_zero_std": 0.75, "grad_norm": 0.7446374945017039, "kl": 0.03801294416189194, "learning_rate": 7.088563564301873e-07, "loss": -0.0126, "num_tokens": 11024857.0, "reward": 0.5625, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.5625, "rewards/decision_reward_func/std": 0.8333333730697632, "sampling/importance_sampling_ratio/max": 1.2873942852020264, "sampling/importance_sampling_ratio/mean": 0.999816358089447, "sampling/importance_sampling_ratio/min": 0.7708359956741333, "sampling/sampling_logp_difference/max": 0.26027965545654297, "sampling/sampling_logp_difference/mean": 0.013511259108781815, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 141.953125, "completions/mean_terminated_length": 141.953125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.5220211148262024, "epoch": 0.856637168141593, "frac_reward_zero_std": 0.5, "grad_norm": 1.5450247512624973, "kl": 0.07348264753818512, "learning_rate": 7.074520260737487e-07, "loss": -0.0014, "num_tokens": 11045814.0, "reward": 0.375, "reward_std": 0.36435678601264954, "rewards/decision_reward_func/mean": 0.375, "rewards/decision_reward_func/std": 0.934353232383728, "sampling/importance_sampling_ratio/max": 1.5953445434570312, "sampling/importance_sampling_ratio/mean": 1.0001152753829956, "sampling/importance_sampling_ratio/min": 0.6447339057922363, "sampling/sampling_logp_difference/max": 0.4670896530151367, "sampling/sampling_logp_difference/mean": 0.01728680171072483, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 96.9375, "completions/mean_terminated_length": 96.9375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.32135164737701416, "epoch": 0.8584070796460177, "frac_reward_zero_std": 1.0, "grad_norm": 0.04095930272407262, "kl": 0.036933060735464096, "learning_rate": 7.06045716127658e-07, "loss": 0.0004, "num_tokens": 11062098.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.60867440700531, "sampling/importance_sampling_ratio/mean": 1.0000686645507812, "sampling/importance_sampling_ratio/min": 0.6944010853767395, "sampling/sampling_logp_difference/max": 0.47541046142578125, "sampling/sampling_logp_difference/mean": 0.01514124684035778, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 151.421875, "completions/mean_terminated_length": 151.421875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.42495667934417725, "epoch": 0.8601769911504424, "frac_reward_zero_std": 1.0, "grad_norm": 0.03181384667927138, "kl": 0.04488690197467804, "learning_rate": 7.04637440011484e-07, "loss": 0.0005, "num_tokens": 11082749.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3687118291854858, "sampling/importance_sampling_ratio/mean": 1.0004634857177734, "sampling/importance_sampling_ratio/min": 0.6973159909248352, "sampling/sampling_logp_difference/max": 0.3605165481567383, "sampling/sampling_logp_difference/mean": 0.015262722969055176, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 916.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 200.703125, "completions/mean_terminated_length": 200.703125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.4573553204536438, "epoch": 0.8619469026548673, "frac_reward_zero_std": 1.0, "grad_norm": 0.02711366077742337, "kl": 0.04593441262841225, "learning_rate": 7.032272111635565e-07, "loss": 0.0004, "num_tokens": 11105466.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.2976807355880737, "sampling/importance_sampling_ratio/mean": 1.000078797340393, "sampling/importance_sampling_ratio/min": 0.6960489749908447, "sampling/sampling_logp_difference/max": 0.362335205078125, "sampling/sampling_logp_difference/mean": 0.016789868474006653, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 161.4375, "completions/mean_terminated_length": 161.4375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.5254327058792114, "epoch": 0.863716814159292, "frac_reward_zero_std": 1.0, "grad_norm": 0.0332785383253741, "kl": 0.05357012152671814, "learning_rate": 7.018150430408394e-07, "loss": 0.0006, "num_tokens": 11130358.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.4161489009857178, "sampling/importance_sampling_ratio/mean": 0.9994027018547058, "sampling/importance_sampling_ratio/min": 0.7706308960914612, "sampling/sampling_logp_difference/max": 0.34794116020202637, "sampling/sampling_logp_difference/mean": 0.01691107265651226, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.5288914442062378, "epoch": 0.8654867256637168, "frac_reward_zero_std": 0.75, "grad_norm": 0.9572866540516841, "kl": 0.05500596761703491, "learning_rate": 7.004009491188022e-07, "loss": -0.0156, "num_tokens": 11154518.0, "reward": 0.28125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.28125, "rewards/decision_reward_func/std": 0.9672207236289978, "sampling/importance_sampling_ratio/max": 1.4253398180007935, "sampling/importance_sampling_ratio/mean": 1.0004236698150635, "sampling/importance_sampling_ratio/min": 0.7022836804389954, "sampling/sampling_logp_difference/max": 0.35441017150878906, "sampling/sampling_logp_difference/mean": 0.016183752566576004, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 123.421875, "completions/mean_terminated_length": 123.421875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.3059542775154114, "epoch": 0.8672566371681416, "frac_reward_zero_std": 1.0, "grad_norm": 0.02822826327338971, "kl": 0.031212633475661278, "learning_rate": 6.989849428912907e-07, "loss": 0.0003, "num_tokens": 11171825.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2959455251693726, "sampling/importance_sampling_ratio/mean": 0.9996449947357178, "sampling/importance_sampling_ratio/min": 0.6889712810516357, "sampling/sampling_logp_difference/max": 0.3725557327270508, "sampling/sampling_logp_difference/mean": 0.013596318662166595, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 131.09375, "completions/mean_terminated_length": 131.09375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.3289196193218231, "epoch": 0.8690265486725663, "frac_reward_zero_std": 1.0, "grad_norm": 0.025211836053442026, "kl": 0.03459160774946213, "learning_rate": 6.975670378703992e-07, "loss": 0.0004, "num_tokens": 11190055.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.405799150466919, "sampling/importance_sampling_ratio/mean": 0.9996116161346436, "sampling/importance_sampling_ratio/min": 0.6268575191497803, "sampling/sampling_logp_difference/max": 0.46703600883483887, "sampling/sampling_logp_difference/mean": 0.013733677566051483, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 211.171875, "completions/mean_terminated_length": 211.171875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.5561668872833252, "epoch": 0.8707964601769912, "frac_reward_zero_std": 0.5, "grad_norm": 1.1722902782288536, "kl": 0.06487689912319183, "learning_rate": 6.961472475863405e-07, "loss": 0.0084, "num_tokens": 11217634.0, "reward": 0.09375, "reward_std": 0.34860679507255554, "rewards/decision_reward_func/mean": 0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.549800992012024, "sampling/importance_sampling_ratio/mean": 0.9999545812606812, "sampling/importance_sampling_ratio/min": 0.7396615743637085, "sampling/sampling_logp_difference/max": 0.4381265640258789, "sampling/sampling_logp_difference/mean": 0.016957048326730728, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 338.65625, "completions/mean_terminated_length": 338.65625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.4920404851436615, "epoch": 0.8725663716814159, "frac_reward_zero_std": 0.5, "grad_norm": 1.023947735032229, "kl": 0.03984961286187172, "learning_rate": 6.947255855873176e-07, "loss": 0.0137, "num_tokens": 11248828.0, "reward": 0.46875, "reward_std": 0.29578250646591187, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.2872081995010376, "sampling/importance_sampling_ratio/mean": 1.0005526542663574, "sampling/importance_sampling_ratio/min": 0.7777249813079834, "sampling/sampling_logp_difference/max": 0.2524757385253906, "sampling/sampling_logp_difference/mean": 0.014068529941141605, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 203.03125, "completions/mean_terminated_length": 203.03125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.54365473985672, "epoch": 0.8743362831858407, "frac_reward_zero_std": 0.75, "grad_norm": 0.8781848845775025, "kl": 0.04530525952577591, "learning_rate": 6.93302065439394e-07, "loss": 0.0259, "num_tokens": 11275774.0, "reward": 0.25, "reward_std": 0.25819888710975647, "rewards/decision_reward_func/mean": 0.25, "rewards/decision_reward_func/std": 0.9759001135826111, "sampling/importance_sampling_ratio/max": 1.4039392471313477, "sampling/importance_sampling_ratio/mean": 1.000096321105957, "sampling/importance_sampling_ratio/min": 0.6772672533988953, "sampling/sampling_logp_difference/max": 0.3896893262863159, "sampling/sampling_logp_difference/mean": 0.017321724444627762, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 182.0, "completions/max_terminated_length": 182.0, "completions/mean_length": 107.1875, "completions/mean_terminated_length": 107.1875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.28042668104171753, "epoch": 0.8761061946902655, "frac_reward_zero_std": 1.0, "grad_norm": 0.026140178793081347, "kl": 0.025253865867853165, "learning_rate": 6.918767007263645e-07, "loss": 0.0003, "num_tokens": 11291530.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.564685344696045, "sampling/importance_sampling_ratio/mean": 1.0004124641418457, "sampling/importance_sampling_ratio/min": 0.7787970900535583, "sampling/sampling_logp_difference/max": 0.44768476486206055, "sampling/sampling_logp_difference/mean": 0.012597857043147087, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 109.5625, "completions/mean_terminated_length": 109.5625, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.3203164339065552, "epoch": 0.8778761061946903, "frac_reward_zero_std": 1.0, "grad_norm": 0.029914991404061227, "kl": 0.029022786766290665, "learning_rate": 6.904495050496258e-07, "loss": 0.0003, "num_tokens": 11309710.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4687899351119995, "sampling/importance_sampling_ratio/mean": 1.0006183385849, "sampling/importance_sampling_ratio/min": 0.7097292542457581, "sampling/sampling_logp_difference/max": 0.3844388723373413, "sampling/sampling_logp_difference/mean": 0.014193766750395298, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 168.09375, "completions/mean_terminated_length": 168.09375, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.48099666833877563, "epoch": 0.879646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 1.0155494990382203, "kl": 0.046392716467380524, "learning_rate": 6.890204920280457e-07, "loss": 0.0237, "num_tokens": 11332868.0, "reward": 0.125, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.125, "rewards/decision_reward_func/std": 1.0, "sampling/importance_sampling_ratio/max": 1.3051731586456299, "sampling/importance_sampling_ratio/mean": 0.9998812675476074, "sampling/importance_sampling_ratio/min": 0.6971226334571838, "sampling/sampling_logp_difference/max": 0.36079394817352295, "sampling/sampling_logp_difference/mean": 0.015075111761689186, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 154.6875, "completions/mean_terminated_length": 154.6875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.4950066804885864, "epoch": 0.8814159292035398, "frac_reward_zero_std": 0.75, "grad_norm": 1.0197985747632474, "kl": 0.06584899127483368, "learning_rate": 6.875896752978344e-07, "loss": -0.0003, "num_tokens": 11355344.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.282453179359436, "sampling/importance_sampling_ratio/mean": 0.999314546585083, "sampling/importance_sampling_ratio/min": 0.7079713940620422, "sampling/sampling_logp_difference/max": 0.34535157680511475, "sampling/sampling_logp_difference/mean": 0.016382791101932526, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 172.84375, "completions/mean_terminated_length": 172.84375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "entropy": 0.4558072090148926, "epoch": 0.8831858407079646, "frac_reward_zero_std": 0.75, "grad_norm": 0.897514938846476, "kl": 0.06502187997102737, "learning_rate": 6.861570685124134e-07, "loss": 0.0245, "num_tokens": 11376726.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.3139907121658325, "sampling/importance_sampling_ratio/mean": 0.9999817609786987, "sampling/importance_sampling_ratio/min": 0.631839394569397, "sampling/sampling_logp_difference/max": 0.4591200351715088, "sampling/sampling_logp_difference/mean": 0.015304536558687687, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 180.578125, "completions/mean_terminated_length": 180.578125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.5417348146438599, "epoch": 0.8849557522123894, "frac_reward_zero_std": 1.0, "grad_norm": 0.027568689738492257, "kl": 0.05047761648893356, "learning_rate": 6.847226853422861e-07, "loss": 0.0005, "num_tokens": 11403035.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3918542861938477, "sampling/importance_sampling_ratio/mean": 1.0008066892623901, "sampling/importance_sampling_ratio/min": 0.6774313449859619, "sampling/sampling_logp_difference/max": 0.38944709300994873, "sampling/sampling_logp_difference/mean": 0.01726376637816429, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 114.390625, "completions/mean_terminated_length": 114.390625, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "entropy": 0.4357292354106903, "epoch": 0.8867256637168142, "frac_reward_zero_std": 0.75, "grad_norm": 1.3679384762315754, "kl": 0.056346792727708817, "learning_rate": 6.832865394749065e-07, "loss": 0.0006, "num_tokens": 11422356.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.2793991565704346, "sampling/importance_sampling_ratio/mean": 0.9996085166931152, "sampling/importance_sampling_ratio/min": 0.6171379685401917, "sampling/sampling_logp_difference/max": 0.4826626777648926, "sampling/sampling_logp_difference/mean": 0.016202447935938835, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 128.390625, "completions/mean_terminated_length": 128.390625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.4350653886795044, "epoch": 0.8884955752212389, "frac_reward_zero_std": 1.0, "grad_norm": 0.030346917844838946, "kl": 0.0385807603597641, "learning_rate": 6.818486446145486e-07, "loss": 0.0004, "num_tokens": 11441917.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.406125783920288, "sampling/importance_sampling_ratio/mean": 0.9999099969863892, "sampling/importance_sampling_ratio/min": 0.6496437191963196, "sampling/sampling_logp_difference/max": 0.43133115768432617, "sampling/sampling_logp_difference/mean": 0.017174214124679565, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 116.765625, "completions/mean_terminated_length": 116.765625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.45700663328170776, "epoch": 0.8902654867256637, "frac_reward_zero_std": 1.0, "grad_norm": 0.030081798969186637, "kl": 0.04402727261185646, "learning_rate": 6.804090144821772e-07, "loss": 0.0005, "num_tokens": 11459886.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4959650039672852, "sampling/importance_sampling_ratio/mean": 1.0003783702850342, "sampling/importance_sampling_ratio/min": 0.73115074634552, "sampling/sampling_logp_difference/max": 0.4027714729309082, "sampling/sampling_logp_difference/mean": 0.017588015645742416, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 143.5, "completions/mean_terminated_length": 143.5, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.43855568766593933, "epoch": 0.8920353982300885, "frac_reward_zero_std": 1.0, "grad_norm": 0.03683240239684041, "kl": 0.04783044010400772, "learning_rate": 6.789676628153143e-07, "loss": 0.0005, "num_tokens": 11480830.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.3063552379608154, "sampling/importance_sampling_ratio/mean": 1.0003074407577515, "sampling/importance_sampling_ratio/min": 0.7497009038925171, "sampling/sampling_logp_difference/max": 0.28808093070983887, "sampling/sampling_logp_difference/mean": 0.014854389242827892, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 145.234375, "completions/mean_terminated_length": 145.234375, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.38934558629989624, "epoch": 0.8938053097345132, "frac_reward_zero_std": 1.0, "grad_norm": 0.024098191218664168, "kl": 0.031024843454360962, "learning_rate": 6.775246033679104e-07, "loss": 0.0003, "num_tokens": 11500669.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5753343105316162, "sampling/importance_sampling_ratio/mean": 1.0002654790878296, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.4544675350189209, "sampling/sampling_logp_difference/mean": 0.014444435946643353, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 155.1875, "completions/mean_terminated_length": 155.1875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.5077278017997742, "epoch": 0.8955752212389381, "frac_reward_zero_std": 1.0, "grad_norm": 0.033627166083113544, "kl": 0.06282509118318558, "learning_rate": 6.76079849910212e-07, "loss": 0.0007, "num_tokens": 11520953.0, "reward": 0.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.0, "rewards/decision_reward_func/std": 1.0079052448272705, "sampling/importance_sampling_ratio/max": 1.469017744064331, "sampling/importance_sampling_ratio/mean": 0.9998058080673218, "sampling/importance_sampling_ratio/min": 0.6368654370307922, "sampling/sampling_logp_difference/max": 0.45119690895080566, "sampling/sampling_logp_difference/mean": 0.016574004665017128, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 97.96875, "completions/mean_terminated_length": 97.96875, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.24034933745861053, "epoch": 0.8973451327433628, "frac_reward_zero_std": 1.0, "grad_norm": 0.030981095291774924, "kl": 0.020224817097187042, "learning_rate": 6.746334162286307e-07, "loss": 0.0002, "num_tokens": 11536951.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6272845268249512, "sampling/importance_sampling_ratio/mean": 1.00011146068573, "sampling/importance_sampling_ratio/min": 0.43750959634780884, "sampling/sampling_logp_difference/max": 0.826656699180603, "sampling/sampling_logp_difference/mean": 0.013065225444734097, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 130.84375, "completions/mean_terminated_length": 130.84375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.3243350386619568, "epoch": 0.8991150442477877, "frac_reward_zero_std": 1.0, "grad_norm": 0.022043362891743003, "kl": 0.029396357014775276, "learning_rate": 6.731853161256113e-07, "loss": 0.0003, "num_tokens": 11554589.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5598468780517578, "sampling/importance_sampling_ratio/mean": 1.000610589981079, "sampling/importance_sampling_ratio/min": 0.6803026795387268, "sampling/sampling_logp_difference/max": 0.44458770751953125, "sampling/sampling_logp_difference/mean": 0.013580295257270336, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 135.4375, "completions/mean_terminated_length": 135.4375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.4605197608470917, "epoch": 0.9008849557522124, "frac_reward_zero_std": 0.75, "grad_norm": 1.1726230337445993, "kl": 0.052446477115154266, "learning_rate": 6.717355634195004e-07, "loss": -0.0156, "num_tokens": 11573881.0, "reward": 0.34375, "reward_std": 0.23935678601264954, "rewards/decision_reward_func/mean": 0.34375, "rewards/decision_reward_func/std": 0.9464847445487976, "sampling/importance_sampling_ratio/max": 1.595988154411316, "sampling/importance_sampling_ratio/mean": 0.9996259212493896, "sampling/importance_sampling_ratio/min": 0.7329869270324707, "sampling/sampling_logp_difference/max": 0.46749305725097656, "sampling/sampling_logp_difference/mean": 0.016616176813840866, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 171.1875, "completions/mean_terminated_length": 171.1875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.5736621022224426, "epoch": 0.9026548672566371, "frac_reward_zero_std": 0.75, "grad_norm": 0.929166370597461, "kl": 0.08164776861667633, "learning_rate": 6.70284171944414e-07, "loss": -0.0152, "num_tokens": 11595221.0, "reward": 0.40625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.40625, "rewards/decision_reward_func/std": 0.9209855198860168, "sampling/importance_sampling_ratio/max": 1.3658151626586914, "sampling/importance_sampling_ratio/mean": 0.9998589754104614, "sampling/importance_sampling_ratio/min": 0.6254509687423706, "sampling/sampling_logp_difference/max": 0.4692823886871338, "sampling/sampling_logp_difference/mean": 0.01716122031211853, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 120.90625, "completions/mean_terminated_length": 120.90625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "entropy": 0.4975433349609375, "epoch": 0.904424778761062, "frac_reward_zero_std": 0.75, "grad_norm": 1.4552774074894985, "kl": 0.06420423090457916, "learning_rate": 6.688311555501063e-07, "loss": 0.0125, "num_tokens": 11614815.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.4139158725738525, "sampling/importance_sampling_ratio/mean": 0.9999160766601562, "sampling/importance_sampling_ratio/min": 0.6870979070663452, "sampling/sampling_logp_difference/max": 0.3752784729003906, "sampling/sampling_logp_difference/mean": 0.018069611862301826, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 158.21875, "completions/mean_terminated_length": 158.21875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.48136845231056213, "epoch": 0.9061946902654867, "frac_reward_zero_std": 0.75, "grad_norm": 1.0306633495867539, "kl": 0.04570081830024719, "learning_rate": 6.673765281018372e-07, "loss": 0.0012, "num_tokens": 11634749.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.2743909358978271, "sampling/importance_sampling_ratio/mean": 0.9994518756866455, "sampling/importance_sampling_ratio/min": 0.7109566926956177, "sampling/sampling_logp_difference/max": 0.3411438465118408, "sampling/sampling_logp_difference/mean": 0.017170319333672523, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 159.078125, "completions/mean_terminated_length": 159.078125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "entropy": 0.40064916014671326, "epoch": 0.9079646017699115, "frac_reward_zero_std": 0.75, "grad_norm": 1.1565868940811759, "kl": 0.03733426332473755, "learning_rate": 6.659203034802396e-07, "loss": 0.0168, "num_tokens": 11656114.0, "reward": 0.875, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.875, "rewards/decision_reward_func/std": 0.48795005679130554, "sampling/importance_sampling_ratio/max": 1.2868289947509766, "sampling/importance_sampling_ratio/mean": 1.0002918243408203, "sampling/importance_sampling_ratio/min": 0.7122735977172852, "sampling/sampling_logp_difference/max": 0.3392932415008545, "sampling/sampling_logp_difference/mean": 0.014757532626390457, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 123.796875, "completions/mean_terminated_length": 123.796875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.5170900821685791, "epoch": 0.9097345132743363, "frac_reward_zero_std": 0.75, "grad_norm": 1.2877298459068507, "kl": 0.07492925971746445, "learning_rate": 6.644624955811873e-07, "loss": 0.0121, "num_tokens": 11682117.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.297922134399414, "sampling/importance_sampling_ratio/mean": 0.9996405839920044, "sampling/importance_sampling_ratio/min": 0.6141785383224487, "sampling/sampling_logp_difference/max": 0.4874696731567383, "sampling/sampling_logp_difference/mean": 0.01811598241329193, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 142.3125, "completions/mean_terminated_length": 142.3125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.3735528290271759, "epoch": 0.911504424778761, "frac_reward_zero_std": 1.0, "grad_norm": 0.02537209831694192, "kl": 0.025548133999109268, "learning_rate": 6.630031183156627e-07, "loss": 0.0002, "num_tokens": 11700953.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5218216180801392, "sampling/importance_sampling_ratio/mean": 0.9998520612716675, "sampling/importance_sampling_ratio/min": 0.6755814552307129, "sampling/sampling_logp_difference/max": 0.4199080467224121, "sampling/sampling_logp_difference/mean": 0.015386695973575115, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 145.875, "completions/mean_terminated_length": 145.875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.4176895022392273, "epoch": 0.9132743362831859, "frac_reward_zero_std": 1.0, "grad_norm": 0.028163093425068782, "kl": 0.03728806972503662, "learning_rate": 6.61542185609623e-07, "loss": 0.0004, "num_tokens": 11722417.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6107532978057861, "sampling/importance_sampling_ratio/mean": 0.9997249841690063, "sampling/importance_sampling_ratio/min": 0.7600250244140625, "sampling/sampling_logp_difference/max": 0.4767019748687744, "sampling/sampling_logp_difference/mean": 0.015322001650929451, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 166.34375, "completions/mean_terminated_length": 166.34375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.3786749839782715, "epoch": 0.9150442477876106, "frac_reward_zero_std": 1.0, "grad_norm": 0.03326183617417285, "kl": 0.03697134554386139, "learning_rate": 6.60079711403869e-07, "loss": 0.0004, "num_tokens": 11744871.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3370469808578491, "sampling/importance_sampling_ratio/mean": 1.000791072845459, "sampling/importance_sampling_ratio/min": 0.7023168206214905, "sampling/sampling_logp_difference/max": 0.35337066650390625, "sampling/sampling_logp_difference/mean": 0.013923844322562218, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 175.65625, "completions/mean_terminated_length": 175.65625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.26630160212516785, "epoch": 0.9168141592920354, "frac_reward_zero_std": 1.0, "grad_norm": 0.022789951115606263, "kl": 0.02661610022187233, "learning_rate": 6.586157096539104e-07, "loss": 0.0003, "num_tokens": 11765681.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.468764305114746, "sampling/importance_sampling_ratio/mean": 0.9996398091316223, "sampling/importance_sampling_ratio/min": 0.6202806830406189, "sampling/sampling_logp_difference/max": 0.4775831699371338, "sampling/sampling_logp_difference/mean": 0.01199787575751543, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 133.609375, "completions/mean_terminated_length": 133.609375, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.3493505120277405, "epoch": 0.9185840707964602, "frac_reward_zero_std": 1.0, "grad_norm": 0.03952156425404006, "kl": 0.0363231897354126, "learning_rate": 6.571501943298335e-07, "loss": 0.0004, "num_tokens": 11783624.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.530820608139038, "sampling/importance_sampling_ratio/mean": 1.000678539276123, "sampling/importance_sampling_ratio/min": 0.6168789267539978, "sampling/sampling_logp_difference/max": 0.48308253288269043, "sampling/sampling_logp_difference/mean": 0.015622604638338089, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 171.109375, "completions/mean_terminated_length": 171.109375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.3881581425666809, "epoch": 0.9203539823008849, "frac_reward_zero_std": 1.0, "grad_norm": 0.020049575859725548, "kl": 0.03495844081044197, "learning_rate": 6.556831794161677e-07, "loss": 0.0003, "num_tokens": 11804463.0, "reward": -0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": -0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5633995532989502, "sampling/importance_sampling_ratio/mean": 1.000249981880188, "sampling/importance_sampling_ratio/min": 0.679577648639679, "sampling/sampling_logp_difference/max": 0.44686269760131836, "sampling/sampling_logp_difference/mean": 0.01593586802482605, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 203.609375, "completions/mean_terminated_length": 203.609375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.3289709985256195, "epoch": 0.9221238938053097, "frac_reward_zero_std": 1.0, "grad_norm": 0.030300394275281445, "kl": 0.03109907917678356, "learning_rate": 6.542146789117523e-07, "loss": 0.0003, "num_tokens": 11826758.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5853697061538696, "sampling/importance_sampling_ratio/mean": 0.9997753500938416, "sampling/importance_sampling_ratio/min": 0.7064435482025146, "sampling/sampling_logp_difference/max": 0.4608175754547119, "sampling/sampling_logp_difference/mean": 0.013363179750740528, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 132.546875, "completions/mean_terminated_length": 132.546875, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.3163865804672241, "epoch": 0.9238938053097345, "frac_reward_zero_std": 1.0, "grad_norm": 0.027720750192685717, "kl": 0.03124506026506424, "learning_rate": 6.527447068296025e-07, "loss": 0.0003, "num_tokens": 11844713.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4982117414474487, "sampling/importance_sampling_ratio/mean": 0.9996817111968994, "sampling/importance_sampling_ratio/min": 0.7301310896873474, "sampling/sampling_logp_difference/max": 0.404272198677063, "sampling/sampling_logp_difference/mean": 0.01464778184890747, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 144.140625, "completions/mean_terminated_length": 144.140625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "entropy": 0.4128933846950531, "epoch": 0.9256637168141593, "frac_reward_zero_std": 0.75, "grad_norm": 1.1419393301748153, "kl": 0.042740676552057266, "learning_rate": 6.512732771967758e-07, "loss": -0.0221, "num_tokens": 11865010.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.326668381690979, "sampling/importance_sampling_ratio/mean": 1.0002739429473877, "sampling/importance_sampling_ratio/min": 0.6690137982368469, "sampling/sampling_logp_difference/max": 0.4019505977630615, "sampling/sampling_logp_difference/mean": 0.016076423227787018, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 227.734375, "completions/mean_terminated_length": 227.734375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "entropy": 0.4210180640220642, "epoch": 0.9274336283185841, "frac_reward_zero_std": 1.0, "grad_norm": 0.02338603654633819, "kl": 0.03688063472509384, "learning_rate": 6.498004040542384e-07, "loss": 0.0004, "num_tokens": 11895681.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5071555376052856, "sampling/importance_sampling_ratio/mean": 0.9997272491455078, "sampling/importance_sampling_ratio/min": 0.6994218230247498, "sampling/sampling_logp_difference/max": 0.4102240800857544, "sampling/sampling_logp_difference/mean": 0.01420167088508606, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 153.625, "completions/mean_terminated_length": 153.625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.3878311514854431, "epoch": 0.9292035398230089, "frac_reward_zero_std": 1.0, "grad_norm": 0.02325109143409543, "kl": 0.031965360045433044, "learning_rate": 6.483261014567311e-07, "loss": 0.0003, "num_tokens": 11916889.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2996877431869507, "sampling/importance_sampling_ratio/mean": 0.9991119503974915, "sampling/importance_sampling_ratio/min": 0.6263453960418701, "sampling/sampling_logp_difference/max": 0.467853307723999, "sampling/sampling_logp_difference/mean": 0.015429418534040451, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 167.90625, "completions/mean_terminated_length": 167.90625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "entropy": 0.47103917598724365, "epoch": 0.9309734513274336, "frac_reward_zero_std": 1.0, "grad_norm": 0.026374450972294128, "kl": 0.03843798488378525, "learning_rate": 6.468503834726349e-07, "loss": 0.0004, "num_tokens": 11939987.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2908649444580078, "sampling/importance_sampling_ratio/mean": 1.0000659227371216, "sampling/importance_sampling_ratio/min": 0.6879383325576782, "sampling/sampling_logp_difference/max": 0.3740561008453369, "sampling/sampling_logp_difference/mean": 0.017067167907953262, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 173.671875, "completions/mean_terminated_length": 173.671875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.44619613885879517, "epoch": 0.9327433628318584, "frac_reward_zero_std": 1.0, "grad_norm": 0.03301712563254451, "kl": 0.053856320679187775, "learning_rate": 6.453732641838371e-07, "loss": 0.0006, "num_tokens": 11962830.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.463612675666809, "sampling/importance_sampling_ratio/mean": 0.9998467564582825, "sampling/importance_sampling_ratio/min": 0.6220102906227112, "sampling/sampling_logp_difference/max": 0.47479867935180664, "sampling/sampling_logp_difference/mean": 0.016407469287514687, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 241.078125, "completions/mean_terminated_length": 241.078125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "entropy": 0.5302072763442993, "epoch": 0.9345132743362832, "frac_reward_zero_std": 0.5, "grad_norm": 1.0162961773695491, "kl": 0.05674497038125992, "learning_rate": 6.438947576855966e-07, "loss": 0.065, "num_tokens": 11995027.0, "reward": 0.3125, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.3125, "rewards/decision_reward_func/std": 0.9574271440505981, "sampling/importance_sampling_ratio/max": 1.4823534488677979, "sampling/importance_sampling_ratio/mean": 1.0003397464752197, "sampling/importance_sampling_ratio/min": 0.6627068519592285, "sampling/sampling_logp_difference/max": 0.4114224910736084, "sampling/sampling_logp_difference/mean": 0.016194399446249008, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 216.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "entropy": 0.5207070708274841, "epoch": 0.9362831858407079, "frac_reward_zero_std": 0.75, "grad_norm": 0.8818870799114665, "kl": 0.05992306023836136, "learning_rate": 6.424148780864103e-07, "loss": -0.0149, "num_tokens": 12021675.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.5495630502700806, "sampling/importance_sampling_ratio/mean": 1.0000014305114746, "sampling/importance_sampling_ratio/min": 0.7507371306419373, "sampling/sampling_logp_difference/max": 0.4379730224609375, "sampling/sampling_logp_difference/mean": 0.016500603407621384, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 143.53125, "completions/mean_terminated_length": 143.53125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.45067259669303894, "epoch": 0.9380530973451328, "frac_reward_zero_std": 1.0, "grad_norm": 0.11701618369725716, "kl": 0.059737443923950195, "learning_rate": 6.409336395078771e-07, "loss": 0.0007, "num_tokens": 12042589.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3233445882797241, "sampling/importance_sampling_ratio/mean": 0.9995430707931519, "sampling/importance_sampling_ratio/min": 0.6298382878303528, "sampling/sampling_logp_difference/max": 0.4622921943664551, "sampling/sampling_logp_difference/mean": 0.017670437693595886, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 158.046875, "completions/mean_terminated_length": 158.046875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "entropy": 0.4408401846885681, "epoch": 0.9398230088495575, "frac_reward_zero_std": 0.75, "grad_norm": 1.181717859164751, "kl": 0.058012235909700394, "learning_rate": 6.394510560845636e-07, "loss": 0.0071, "num_tokens": 12069264.0, "reward": 0.625, "reward_std": 0.22360679507255554, "rewards/decision_reward_func/mean": 0.625, "rewards/decision_reward_func/std": 0.7867957949638367, "sampling/importance_sampling_ratio/max": 1.8133851289749146, "sampling/importance_sampling_ratio/mean": 1.000122308731079, "sampling/importance_sampling_ratio/min": 0.6903498768806458, "sampling/sampling_logp_difference/max": 0.5951954126358032, "sampling/sampling_logp_difference/mean": 0.016324251890182495, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 186.71875, "completions/mean_terminated_length": 186.71875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.4281838536262512, "epoch": 0.9415929203539823, "frac_reward_zero_std": 0.75, "grad_norm": 0.96229328750554, "kl": 0.04793298989534378, "learning_rate": 6.379671419638702e-07, "loss": 0.0329, "num_tokens": 12093038.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.2909739017486572, "sampling/importance_sampling_ratio/mean": 1.000369906425476, "sampling/importance_sampling_ratio/min": 0.6395174860954285, "sampling/sampling_logp_difference/max": 0.447041392326355, "sampling/sampling_logp_difference/mean": 0.01448612567037344, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 141.984375, "completions/mean_terminated_length": 141.984375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.39643681049346924, "epoch": 0.9433628318584071, "frac_reward_zero_std": 0.75, "grad_norm": 1.0649369196683396, "kl": 0.04488767683506012, "learning_rate": 6.364819113058951e-07, "loss": -0.0014, "num_tokens": 12122205.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.6062982082366943, "sampling/importance_sampling_ratio/mean": 0.9994792938232422, "sampling/importance_sampling_ratio/min": 0.6171591281890869, "sampling/sampling_logp_difference/max": 0.48262834548950195, "sampling/sampling_logp_difference/mean": 0.015572316013276577, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 160.390625, "completions/mean_terminated_length": 160.390625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.37770381569862366, "epoch": 0.9451327433628318, "frac_reward_zero_std": 0.75, "grad_norm": 1.086701832076824, "kl": 0.03709157556295395, "learning_rate": 6.349953782832991e-07, "loss": 0.0184, "num_tokens": 12143590.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.2991387844085693, "sampling/importance_sampling_ratio/mean": 0.999364972114563, "sampling/importance_sampling_ratio/min": 0.6300749778747559, "sampling/sampling_logp_difference/max": 0.461916446685791, "sampling/sampling_logp_difference/mean": 0.015483014285564423, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 194.390625, "completions/mean_terminated_length": 194.390625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "entropy": 0.4973522126674652, "epoch": 0.9469026548672567, "frac_reward_zero_std": 0.5, "grad_norm": 1.2320857968486798, "kl": 0.08781930059194565, "learning_rate": 6.335075570811708e-07, "loss": 0.0507, "num_tokens": 12168047.0, "reward": 0.75, "reward_std": 0.44091323018074036, "rewards/decision_reward_func/mean": 0.75, "rewards/decision_reward_func/std": 0.6666666865348816, "sampling/importance_sampling_ratio/max": 1.3621286153793335, "sampling/importance_sampling_ratio/mean": 0.9995167851448059, "sampling/importance_sampling_ratio/min": 0.6857494115829468, "sampling/sampling_logp_difference/max": 0.3772430419921875, "sampling/sampling_logp_difference/mean": 0.015815015882253647, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 102.984375, "completions/mean_terminated_length": 102.984375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.2278555929660797, "epoch": 0.9486725663716814, "frac_reward_zero_std": 1.0, "grad_norm": 0.07993346210059651, "kl": 0.033862821757793427, "learning_rate": 6.320184618968914e-07, "loss": 0.0003, "num_tokens": 12184318.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6206591129302979, "sampling/importance_sampling_ratio/mean": 1.0003883838653564, "sampling/importance_sampling_ratio/min": 0.6372146010398865, "sampling/sampling_logp_difference/max": 0.4828329086303711, "sampling/sampling_logp_difference/mean": 0.012828746810555458, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 157.625, "completions/mean_terminated_length": 157.625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.43120914697647095, "epoch": 0.9504424778761061, "frac_reward_zero_std": 0.75, "grad_norm": 1.0956675860374605, "kl": 0.06771903485059738, "learning_rate": 6.305281069399988e-07, "loss": -0.017, "num_tokens": 12205334.0, "reward": 0.1875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.1875, "rewards/decision_reward_func/std": 0.9900296926498413, "sampling/importance_sampling_ratio/max": 1.5264860391616821, "sampling/importance_sampling_ratio/mean": 1.000443458557129, "sampling/importance_sampling_ratio/min": 0.7132288813591003, "sampling/sampling_logp_difference/max": 0.42296838760375977, "sampling/sampling_logp_difference/mean": 0.01592453010380268, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 170.265625, "completions/mean_terminated_length": 170.265625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "entropy": 0.39343422651290894, "epoch": 0.952212389380531, "frac_reward_zero_std": 1.0, "grad_norm": 0.05083102662365012, "kl": 0.05068902671337128, "learning_rate": 6.290365064320519e-07, "loss": 0.0006, "num_tokens": 12227047.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.4352738857269287, "sampling/importance_sampling_ratio/mean": 1.0004734992980957, "sampling/importance_sampling_ratio/min": 0.6985859274864197, "sampling/sampling_logp_difference/max": 0.36135566234588623, "sampling/sampling_logp_difference/mean": 0.014874329790472984, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 160.9375, "completions/mean_terminated_length": 160.9375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.48297345638275146, "epoch": 0.9539823008849557, "frac_reward_zero_std": 0.75, "grad_norm": 1.179285871703919, "kl": 0.06974999606609344, "learning_rate": 6.275436746064956e-07, "loss": -0.0058, "num_tokens": 12248483.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.5349164009094238, "sampling/importance_sampling_ratio/mean": 0.9997453689575195, "sampling/importance_sampling_ratio/min": 0.662337064743042, "sampling/sampling_logp_difference/max": 0.42847585678100586, "sampling/sampling_logp_difference/mean": 0.01870621182024479, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 150.15625, "completions/mean_terminated_length": 150.15625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.37128764390945435, "epoch": 0.9557522123893806, "frac_reward_zero_std": 0.75, "grad_norm": 1.1728417499746477, "kl": 0.061821479350328445, "learning_rate": 6.260496257085239e-07, "loss": 0.0028, "num_tokens": 12268237.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.2984933853149414, "sampling/importance_sampling_ratio/mean": 0.9998453259468079, "sampling/importance_sampling_ratio/min": 0.6117988228797913, "sampling/sampling_logp_difference/max": 0.491351842880249, "sampling/sampling_logp_difference/mean": 0.015235353261232376, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 173.34375, "completions/mean_terminated_length": 173.34375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.394723117351532, "epoch": 0.9575221238938053, "frac_reward_zero_std": 0.75, "grad_norm": 0.9425096043335918, "kl": 0.06985059380531311, "learning_rate": 6.245543739949453e-07, "loss": -0.0066, "num_tokens": 12290307.0, "reward": 0.6875, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.6875, "rewards/decision_reward_func/std": 0.7319250702857971, "sampling/importance_sampling_ratio/max": 1.50187349319458, "sampling/importance_sampling_ratio/mean": 1.0000241994857788, "sampling/importance_sampling_ratio/min": 0.6360552906990051, "sampling/sampling_logp_difference/max": 0.4524698257446289, "sampling/sampling_logp_difference/mean": 0.016086284071207047, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 154.28125, "completions/mean_terminated_length": 154.28125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "entropy": 0.3369419574737549, "epoch": 0.95929203539823, "frac_reward_zero_std": 1.0, "grad_norm": 0.037734265261044175, "kl": 0.029881250113248825, "learning_rate": 6.230579337340456e-07, "loss": 0.0004, "num_tokens": 12310821.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6232856512069702, "sampling/importance_sampling_ratio/mean": 1.00018310546875, "sampling/importance_sampling_ratio/min": 0.6511728763580322, "sampling/sampling_logp_difference/max": 0.4844522476196289, "sampling/sampling_logp_difference/mean": 0.015162945725023746, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 125.578125, "completions/mean_terminated_length": 125.578125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "entropy": 0.2706543505191803, "epoch": 0.9610619469026549, "frac_reward_zero_std": 1.0, "grad_norm": 0.06230287677510357, "kl": 0.030617382377386093, "learning_rate": 6.215603192054521e-07, "loss": 0.0003, "num_tokens": 12328346.0, "reward": 0.5, "reward_std": 0.0, "rewards/decision_reward_func/mean": 0.5, "rewards/decision_reward_func/std": 0.8728715777397156, "sampling/importance_sampling_ratio/max": 1.5816904306411743, "sampling/importance_sampling_ratio/mean": 0.9994117021560669, "sampling/importance_sampling_ratio/min": 0.43092525005340576, "sampling/sampling_logp_difference/max": 0.8418207168579102, "sampling/sampling_logp_difference/mean": 0.014314708299934864, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 116.78125, "completions/mean_terminated_length": 116.78125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.21982413530349731, "epoch": 0.9628318584070796, "frac_reward_zero_std": 1.0, "grad_norm": 0.03639281547593844, "kl": 0.022563684731721878, "learning_rate": 6.200615446999981e-07, "loss": 0.0002, "num_tokens": 12345772.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6089556217193604, "sampling/importance_sampling_ratio/mean": 1.0011626482009888, "sampling/importance_sampling_ratio/min": 0.644199788570404, "sampling/sampling_logp_difference/max": 0.4755852222442627, "sampling/sampling_logp_difference/mean": 0.01181740127503872, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 186.609375, "completions/mean_terminated_length": 186.609375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.4190259277820587, "epoch": 0.9646017699115044, "frac_reward_zero_std": 0.75, "grad_norm": 0.9419568150118021, "kl": 0.07190361618995667, "learning_rate": 6.185616245195848e-07, "loss": 0.0066, "num_tokens": 12370819.0, "reward": 0.59375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.59375, "rewards/decision_reward_func/std": 0.8110105991363525, "sampling/importance_sampling_ratio/max": 1.3118776082992554, "sampling/importance_sampling_ratio/mean": 1.00010347366333, "sampling/importance_sampling_ratio/min": 0.6482194662094116, "sampling/sampling_logp_difference/max": 0.4335259199142456, "sampling/sampling_logp_difference/mean": 0.01625240594148636, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 217.6875, "completions/mean_terminated_length": 217.6875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.4670265316963196, "epoch": 0.9663716814159292, "frac_reward_zero_std": 0.75, "grad_norm": 0.8005211649308785, "kl": 0.050225675106048584, "learning_rate": 6.170605729770469e-07, "loss": 0.0043, "num_tokens": 12398863.0, "reward": -0.09375, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": -0.09375, "rewards/decision_reward_func/std": 1.003466248512268, "sampling/importance_sampling_ratio/max": 1.405803918838501, "sampling/importance_sampling_ratio/mean": 0.9996092319488525, "sampling/importance_sampling_ratio/min": 0.6845299601554871, "sampling/sampling_logp_difference/max": 0.37902283668518066, "sampling/sampling_logp_difference/mean": 0.01664130762219429, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 162.125, "completions/mean_terminated_length": 162.125, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.32101893424987793, "epoch": 0.968141592920354, "frac_reward_zero_std": 0.75, "grad_norm": 1.1268576328489324, "kl": 0.04037083685398102, "learning_rate": 6.155584043960143e-07, "loss": -0.0284, "num_tokens": 12418791.0, "reward": 0.53125, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.53125, "rewards/decision_reward_func/std": 0.8539125919342041, "sampling/importance_sampling_ratio/max": 1.5744267702102661, "sampling/importance_sampling_ratio/mean": 0.9996744394302368, "sampling/importance_sampling_ratio/min": 0.6205015778541565, "sampling/sampling_logp_difference/max": 0.47722721099853516, "sampling/sampling_logp_difference/mean": 0.013379833661019802, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 149.8125, "completions/mean_terminated_length": 149.8125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.38839036226272583, "epoch": 0.9699115044247788, "frac_reward_zero_std": 0.75, "grad_norm": 1.1794775467338567, "kl": 0.05261142924427986, "learning_rate": 6.140551331107766e-07, "loss": -0.0047, "num_tokens": 12439211.0, "reward": 0.46875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.46875, "rewards/decision_reward_func/std": 0.8903138637542725, "sampling/importance_sampling_ratio/max": 1.2884020805358887, "sampling/importance_sampling_ratio/mean": 1.000056505203247, "sampling/importance_sampling_ratio/min": 0.7180588245391846, "sampling/sampling_logp_difference/max": 0.331203818321228, "sampling/sampling_logp_difference/mean": 0.015212539583444595, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 180.890625, "completions/mean_terminated_length": 180.890625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.34552568197250366, "epoch": 0.9716814159292035, "frac_reward_zero_std": 1.0, "grad_norm": 0.02976989825328661, "kl": 0.0316726416349411, "learning_rate": 6.125507734661458e-07, "loss": 0.0003, "num_tokens": 12463476.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5811829566955566, "sampling/importance_sampling_ratio/mean": 0.9996100664138794, "sampling/importance_sampling_ratio/min": 0.6776725649833679, "sampling/sampling_logp_difference/max": 0.4581732749938965, "sampling/sampling_logp_difference/mean": 0.014589247293770313, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 118.328125, "completions/mean_terminated_length": 118.328125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "entropy": 0.3022470474243164, "epoch": 0.9734513274336283, "frac_reward_zero_std": 1.0, "grad_norm": 0.041772460103968415, "kl": 0.028836343437433243, "learning_rate": 6.110453398173187e-07, "loss": 0.0003, "num_tokens": 12480249.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3200703859329224, "sampling/importance_sampling_ratio/mean": 0.9993069767951965, "sampling/importance_sampling_ratio/min": 0.6562559604644775, "sampling/sampling_logp_difference/max": 0.4212043285369873, "sampling/sampling_logp_difference/mean": 0.01448759064078331, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 140.515625, "completions/mean_terminated_length": 140.515625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "entropy": 0.36306560039520264, "epoch": 0.9752212389380531, "frac_reward_zero_std": 0.5, "grad_norm": 1.6733941874735485, "kl": 0.059806544333696365, "learning_rate": 6.095388465297418e-07, "loss": 0.0431, "num_tokens": 12499754.0, "reward": 0.21875, "reward_std": 0.4629635810852051, "rewards/decision_reward_func/mean": 0.21875, "rewards/decision_reward_func/std": 0.983494758605957, "sampling/importance_sampling_ratio/max": 1.4759948253631592, "sampling/importance_sampling_ratio/mean": 1.0003119707107544, "sampling/importance_sampling_ratio/min": 0.6957031488418579, "sampling/sampling_logp_difference/max": 0.3893321752548218, "sampling/sampling_logp_difference/mean": 0.014819087460637093, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 170.703125, "completions/mean_terminated_length": 170.703125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "entropy": 0.5047581791877747, "epoch": 0.9769911504424779, "frac_reward_zero_std": 0.5, "grad_norm": 1.1656785513547003, "kl": 0.0593656450510025, "learning_rate": 6.080313079789723e-07, "loss": -0.0437, "num_tokens": 12523159.0, "reward": 0.4375, "reward_std": 0.25, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.444342017173767, "sampling/importance_sampling_ratio/mean": 0.9997916221618652, "sampling/importance_sampling_ratio/min": 0.6242204308509827, "sampling/sampling_logp_difference/max": 0.4712517261505127, "sampling/sampling_logp_difference/mean": 0.017720595002174377, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 159.359375, "completions/mean_terminated_length": 159.359375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.39272868633270264, "epoch": 0.9787610619469026, "frac_reward_zero_std": 0.75, "grad_norm": 1.0153691210471831, "kl": 0.051775217056274414, "learning_rate": 6.065227385505421e-07, "loss": 0.0171, "num_tokens": 12543166.0, "reward": 0.71875, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.71875, "rewards/decision_reward_func/std": 0.7007648944854736, "sampling/importance_sampling_ratio/max": 1.546797752380371, "sampling/importance_sampling_ratio/mean": 0.9998806715011597, "sampling/importance_sampling_ratio/min": 0.7299336194992065, "sampling/sampling_logp_difference/max": 0.4361867904663086, "sampling/sampling_logp_difference/mean": 0.015707682818174362, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 151.640625, "completions/mean_terminated_length": 151.640625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.33292603492736816, "epoch": 0.9805309734513274, "frac_reward_zero_std": 1.0, "grad_norm": 0.0281427467866011, "kl": 0.025199301540851593, "learning_rate": 6.050131526398201e-07, "loss": 0.0002, "num_tokens": 12562823.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5091066360473633, "sampling/importance_sampling_ratio/mean": 1.000128984451294, "sampling/importance_sampling_ratio/min": 0.6200224161148071, "sampling/sampling_logp_difference/max": 0.4779996871948242, "sampling/sampling_logp_difference/mean": 0.015259217470884323, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 160.125, "completions/mean_terminated_length": 160.125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "entropy": 0.48752403259277344, "epoch": 0.9823008849557522, "frac_reward_zero_std": 0.5, "grad_norm": 1.559147835291522, "kl": 0.05512962117791176, "learning_rate": 6.035025646518746e-07, "loss": 0.0112, "num_tokens": 12585983.0, "reward": 0.15625, "reward_std": 0.47978055477142334, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.3810150623321533, "sampling/importance_sampling_ratio/mean": 0.9998184442520142, "sampling/importance_sampling_ratio/min": 0.6988722681999207, "sampling/sampling_logp_difference/max": 0.35828733444213867, "sampling/sampling_logp_difference/mean": 0.017440207302570343, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 170.625, "completions/mean_terminated_length": 170.625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "entropy": 0.39747118949890137, "epoch": 0.984070796460177, "frac_reward_zero_std": 0.75, "grad_norm": 0.829652216596529, "kl": 0.05210302025079727, "learning_rate": 6.019909890013366e-07, "loss": -0.0149, "num_tokens": 12606711.0, "reward": 0.96875, "reward_std": 0.125, "rewards/decision_reward_func/mean": 0.96875, "rewards/decision_reward_func/std": 0.25, "sampling/importance_sampling_ratio/max": 1.6040233373641968, "sampling/importance_sampling_ratio/mean": 0.9997381567955017, "sampling/importance_sampling_ratio/min": 0.6278918981552124, "sampling/sampling_logp_difference/max": 0.4725151062011719, "sampling/sampling_logp_difference/mean": 0.015872985124588013, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 195.75, "completions/mean_terminated_length": 195.75, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "entropy": 0.5959153175354004, "epoch": 0.9858407079646018, "frac_reward_zero_std": 0.25, "grad_norm": 1.5838772016409295, "kl": 0.06155340000987053, "learning_rate": 6.004784401122612e-07, "loss": 0.0408, "num_tokens": 12634791.0, "reward": 0.90625, "reward_std": 0.375, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.32803213596344, "sampling/importance_sampling_ratio/mean": 0.9998672008514404, "sampling/importance_sampling_ratio/min": 0.608726441860199, "sampling/sampling_logp_difference/max": 0.4963862895965576, "sampling/sampling_logp_difference/mean": 0.019085844978690147, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 179.75, "completions/mean_terminated_length": 179.75, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.3235132098197937, "epoch": 0.9876106194690265, "frac_reward_zero_std": 0.75, "grad_norm": 0.9844865975021971, "kl": 0.02561378851532936, "learning_rate": 5.98964932417991e-07, "loss": -0.0045, "num_tokens": 12656567.0, "reward": 0.78125, "reward_std": 0.2561737596988678, "rewards/decision_reward_func/mean": 0.78125, "rewards/decision_reward_func/std": 0.6291528940200806, "sampling/importance_sampling_ratio/max": 1.4618462324142456, "sampling/importance_sampling_ratio/mean": 0.999975323677063, "sampling/importance_sampling_ratio/min": 0.7354151606559753, "sampling/sampling_logp_difference/max": 0.3797001838684082, "sampling/sampling_logp_difference/mean": 0.013623690232634544, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 184.921875, "completions/mean_terminated_length": 184.921875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.5242243409156799, "epoch": 0.9893805309734514, "frac_reward_zero_std": 0.5, "grad_norm": 1.3583336237942298, "kl": 0.04835689067840576, "learning_rate": 5.974504803610178e-07, "loss": -0.0223, "num_tokens": 12680434.0, "reward": 0.4375, "reward_std": 0.47360679507255554, "rewards/decision_reward_func/mean": 0.4375, "rewards/decision_reward_func/std": 0.9063270092010498, "sampling/importance_sampling_ratio/max": 1.4175808429718018, "sampling/importance_sampling_ratio/mean": 1.000478982925415, "sampling/importance_sampling_ratio/min": 0.7157229781150818, "sampling/sampling_logp_difference/max": 0.3489518165588379, "sampling/sampling_logp_difference/mean": 0.017108961939811707, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 198.859375, "completions/mean_terminated_length": 198.859375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "entropy": 0.46707969903945923, "epoch": 0.9911504424778761, "frac_reward_zero_std": 0.75, "grad_norm": 0.8811747500237802, "kl": 0.04385651648044586, "learning_rate": 5.959350983928445e-07, "loss": 0.0206, "num_tokens": 12705833.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.2786628007888794, "sampling/importance_sampling_ratio/mean": 0.9998224377632141, "sampling/importance_sampling_ratio/min": 0.6214916110038757, "sampling/sampling_logp_difference/max": 0.475632905960083, "sampling/sampling_logp_difference/mean": 0.016529597342014313, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 157.15625, "completions/mean_terminated_length": 157.15625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "entropy": 0.48076122999191284, "epoch": 0.9929203539823008, "frac_reward_zero_std": 1.0, "grad_norm": 0.031460896412224985, "kl": 0.052566226571798325, "learning_rate": 5.944188009738483e-07, "loss": 0.0005, "num_tokens": 12728851.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6598697900772095, "sampling/importance_sampling_ratio/mean": 1.0003383159637451, "sampling/importance_sampling_ratio/min": 0.7128937244415283, "sampling/sampling_logp_difference/max": 0.5067391395568848, "sampling/sampling_logp_difference/mean": 0.016861015930771828, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 138.59375, "completions/mean_terminated_length": 138.59375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.3797007203102112, "epoch": 0.9946902654867257, "frac_reward_zero_std": 0.75, "grad_norm": 1.2475786187580753, "kl": 0.037923719733953476, "learning_rate": 5.929016025731413e-07, "loss": 0.0133, "num_tokens": 12747833.0, "reward": 0.9375, "reward_std": 0.17078250646591187, "rewards/decision_reward_func/mean": 0.9375, "rewards/decision_reward_func/std": 0.35073620080947876, "sampling/importance_sampling_ratio/max": 1.56224524974823, "sampling/importance_sampling_ratio/mean": 0.9994237422943115, "sampling/importance_sampling_ratio/min": 0.657938539981842, "sampling/sampling_logp_difference/max": 0.4461240768432617, "sampling/sampling_logp_difference/mean": 0.014938208274543285, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 150.84375, "completions/mean_terminated_length": 150.84375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.27015700936317444, "epoch": 0.9964601769911504, "frac_reward_zero_std": 1.0, "grad_norm": 0.024702193806412247, "kl": 0.02522423304617405, "learning_rate": 5.913835176684334e-07, "loss": 0.0002, "num_tokens": 12767247.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5145760774612427, "sampling/importance_sampling_ratio/mean": 1.000138521194458, "sampling/importance_sampling_ratio/min": 0.7146693468093872, "sampling/sampling_logp_difference/max": 0.41513562202453613, "sampling/sampling_logp_difference/mean": 0.012315331026911736, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 258.828125, "completions/mean_terminated_length": 258.828125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.6288559436798096, "epoch": 0.9982300884955753, "frac_reward_zero_std": 0.0, "grad_norm": 1.2643571409135848, "kl": 0.06781111657619476, "learning_rate": 5.89864560745894e-07, "loss": 0.0007, "num_tokens": 12794596.0, "reward": 0.15625, "reward_std": 0.6601393222808838, "rewards/decision_reward_func/mean": 0.15625, "rewards/decision_reward_func/std": 0.9955257177352905, "sampling/importance_sampling_ratio/max": 1.282141923904419, "sampling/importance_sampling_ratio/mean": 1.0002317428588867, "sampling/importance_sampling_ratio/min": 0.6257727146148682, "sampling/sampling_logp_difference/max": 0.4687681198120117, "sampling/sampling_logp_difference/mean": 0.018289338797330856, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 201.375, "completions/mean_terminated_length": 201.375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "entropy": 0.41560235619544983, "epoch": 1.0, "frac_reward_zero_std": 0.75, "grad_norm": 0.8499870699291785, "kl": 0.036372072994709015, "learning_rate": 5.883447463000135e-07, "loss": -0.0161, "num_tokens": 12817276.0, "reward": 0.90625, "reward_std": 0.20155644416809082, "rewards/decision_reward_func/mean": 0.90625, "rewards/decision_reward_func/std": 0.42608407139778137, "sampling/importance_sampling_ratio/max": 1.5545761585235596, "sampling/importance_sampling_ratio/mean": 1.0002684593200684, "sampling/importance_sampling_ratio/min": 0.6176016330718994, "sampling/sampling_logp_difference/max": 0.48191165924072266, "sampling/sampling_logp_difference/mean": 0.014586273580789566, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 112.84375, "completions/mean_terminated_length": 112.84375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "entropy": 0.2689701318740845, "epoch": 1.0017699115044247, "frac_reward_zero_std": 1.0, "grad_norm": 0.03643559393027775, "kl": 0.027217669412493706, "learning_rate": 5.868240888334652e-07, "loss": 0.0003, "num_tokens": 12833986.0, "reward": 1.0, "reward_std": 0.0, "rewards/decision_reward_func/mean": 1.0, "rewards/decision_reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.513666033744812, "sampling/importance_sampling_ratio/mean": 1.000664472579956, "sampling/importance_sampling_ratio/min": 0.6641154885292053, "sampling/sampling_logp_difference/max": 0.4145345687866211, "sampling/sampling_logp_difference/mean": 0.012834830209612846, "step": 566 } ], "logging_steps": 1, "max_steps": 1130, "num_input_tokens_seen": 12833986, "num_train_epochs": 2, "save_steps": 283, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }