diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18146 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0017699115044247, + "eval_steps": 500, + "global_step": 566, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 144.484375, + "completions/mean_terminated_length": 144.484375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.4271422028541565, + "epoch": 0.0017699115044247787, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1006226002289405, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.0047, + "num_tokens": 18911.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.3433542251586914, + "sampling/importance_sampling_ratio/mean": 1.000240683555603, + "sampling/importance_sampling_ratio/min": 0.653702437877655, + "sampling/sampling_logp_difference/max": 0.42510294914245605, + "sampling/sampling_logp_difference/mean": 0.015977157279849052, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 626.0, + "completions/max_terminated_length": 626.0, + "completions/mean_length": 209.75, + "completions/mean_terminated_length": 209.75, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.4180360436439514, + "epoch": 0.0035398230088495575, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7553122903465309, + "kl": 0.0, + "learning_rate": 8.849557522123893e-09, + "loss": -0.015, + "num_tokens": 42495.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.371575951576233, + "sampling/importance_sampling_ratio/mean": 0.9999609589576721, + "sampling/importance_sampling_ratio/min": 0.6392531991004944, + "sampling/sampling_logp_difference/max": 0.44745469093322754, + "sampling/sampling_logp_difference/mean": 0.015478897839784622, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 261.46875, + "completions/mean_terminated_length": 261.46875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.6020745635032654, + "epoch": 0.005309734513274336, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8658353718902592, + "kl": 0.00044076767517253757, + "learning_rate": 1.7699115044247786e-08, + "loss": 0.0159, + "num_tokens": 72909.0, + "reward": 0.34375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5477291345596313, + "sampling/importance_sampling_ratio/mean": 0.9996609687805176, + "sampling/importance_sampling_ratio/min": 0.6928660869598389, + "sampling/sampling_logp_difference/max": 0.43678879737854004, + "sampling/sampling_logp_difference/mean": 0.017934903502464294, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 213.875, + "completions/mean_terminated_length": 213.875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.5642544031143188, + "epoch": 0.007079646017699115, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2078506186870932, + "kl": 0.0004340629675425589, + "learning_rate": 2.654867256637168e-08, + "loss": -0.0096, + "num_tokens": 98837.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.492296576499939, + "sampling/importance_sampling_ratio/mean": 1.0004456043243408, + "sampling/importance_sampling_ratio/min": 0.6158694624900818, + "sampling/sampling_logp_difference/max": 0.48472023010253906, + "sampling/sampling_logp_difference/mean": 0.01798402890563011, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 148.0625, + "completions/mean_terminated_length": 148.0625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.41338294744491577, + "epoch": 0.008849557522123894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.00233020676983444, + "kl": 0.0006641986547037959, + "learning_rate": 3.539823008849557e-08, + "loss": 0.0, + "num_tokens": 119753.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5282143354415894, + "sampling/importance_sampling_ratio/mean": 1.0009156465530396, + "sampling/importance_sampling_ratio/min": 0.6264251470565796, + "sampling/sampling_logp_difference/max": 0.4677259922027588, + "sampling/sampling_logp_difference/mean": 0.016537081450223923, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 152.03125, + "completions/mean_terminated_length": 152.03125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.30602705478668213, + "epoch": 0.010619469026548672, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1710728518813673, + "kl": 0.0006627484108321369, + "learning_rate": 4.424778761061947e-08, + "loss": 0.0178, + "num_tokens": 140379.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4749361276626587, + "sampling/importance_sampling_ratio/mean": 0.9997684955596924, + "sampling/importance_sampling_ratio/min": 0.4771636128425598, + "sampling/sampling_logp_difference/max": 0.7398958206176758, + "sampling/sampling_logp_difference/mean": 0.014786459505558014, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 152.28125, + "completions/mean_terminated_length": 152.28125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.48392254114151, + "epoch": 0.012389380530973451, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4522039803604556, + "kl": 0.0007441140478476882, + "learning_rate": 5.309734513274336e-08, + "loss": -0.0063, + "num_tokens": 164637.0, + "reward": 0.125, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.4044241905212402, + "sampling/importance_sampling_ratio/mean": 0.9991136789321899, + "sampling/importance_sampling_ratio/min": 0.623917818069458, + "sampling/sampling_logp_difference/max": 0.4717366695404053, + "sampling/sampling_logp_difference/mean": 0.01828671619296074, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 196.265625, + "completions/mean_terminated_length": 196.265625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.5522291660308838, + "epoch": 0.01415929203539823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0022623975075869215, + "kl": 0.0005923404823988676, + "learning_rate": 6.194690265486725e-08, + "loss": 0.0, + "num_tokens": 188174.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4031603336334229, + "sampling/importance_sampling_ratio/mean": 0.9999065399169922, + "sampling/importance_sampling_ratio/min": 0.6051700115203857, + "sampling/sampling_logp_difference/max": 0.5022459030151367, + "sampling/sampling_logp_difference/mean": 0.019137494266033173, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 121.09375, + "completions/mean_terminated_length": 121.09375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.25603991746902466, + "epoch": 0.01592920353982301, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0029169860087568922, + "kl": 0.0005756879108957946, + "learning_rate": 7.079646017699114e-08, + "loss": 0.0, + "num_tokens": 205732.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5759655237197876, + "sampling/importance_sampling_ratio/mean": 1.0007699728012085, + "sampling/importance_sampling_ratio/min": 0.7221340537071228, + "sampling/sampling_logp_difference/max": 0.4548680782318115, + "sampling/sampling_logp_difference/mean": 0.012160948477685452, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 170.234375, + "completions/mean_terminated_length": 170.234375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.4174914062023163, + "epoch": 0.017699115044247787, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.456361804165413, + "kl": 0.0005390376900322735, + "learning_rate": 7.964601769911503e-08, + "loss": 0.0397, + "num_tokens": 228307.0, + "reward": 0.65625, + "reward_std": 0.4597553312778473, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.3539308309555054, + "sampling/importance_sampling_ratio/mean": 1.0001657009124756, + "sampling/importance_sampling_ratio/min": 0.6430124640464783, + "sampling/sampling_logp_difference/max": 0.4415912628173828, + "sampling/sampling_logp_difference/mean": 0.01574782468378544, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 176.6875, + "completions/mean_terminated_length": 176.6875, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.426904559135437, + "epoch": 0.019469026548672566, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7685811919938644, + "kl": 0.0005052468040958047, + "learning_rate": 8.849557522123894e-08, + "loss": 0.0164, + "num_tokens": 249599.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.69903564453125, + "sampling/importance_sampling_ratio/mean": 1.0001137256622314, + "sampling/importance_sampling_ratio/min": 0.6657065153121948, + "sampling/sampling_logp_difference/max": 0.5300607681274414, + "sampling/sampling_logp_difference/mean": 0.015404904261231422, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 137.78125, + "completions/mean_terminated_length": 137.78125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 0.3701123595237732, + "epoch": 0.021238938053097345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0031209918838396716, + "kl": 0.0006685380358248949, + "learning_rate": 9.734513274336283e-08, + "loss": 0.0, + "num_tokens": 268721.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000590205192566, + "sampling/importance_sampling_ratio/min": 0.6504923701286316, + "sampling/sampling_logp_difference/max": 0.8355003595352173, + "sampling/sampling_logp_difference/mean": 0.01659131795167923, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 181.109375, + "completions/mean_terminated_length": 181.109375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.42291828989982605, + "epoch": 0.023008849557522124, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1080865101289201, + "kl": 0.0004783940967172384, + "learning_rate": 1.0619469026548672e-07, + "loss": 0.0108, + "num_tokens": 291672.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.578126311302185, + "sampling/importance_sampling_ratio/mean": 1.0000258684158325, + "sampling/importance_sampling_ratio/min": 0.6148504018783569, + "sampling/sampling_logp_difference/max": 0.4863762855529785, + "sampling/sampling_logp_difference/mean": 0.015482140704989433, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 215.453125, + "completions/mean_terminated_length": 215.453125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.5591689348220825, + "epoch": 0.024778761061946902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0020184872105857158, + "kl": 0.0005135840037837625, + "learning_rate": 1.1504424778761061e-07, + "loss": 0.0, + "num_tokens": 318917.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.460014820098877, + "sampling/importance_sampling_ratio/mean": 0.9999944567680359, + "sampling/importance_sampling_ratio/min": 0.6955029964447021, + "sampling/sampling_logp_difference/max": 0.3784465789794922, + "sampling/sampling_logp_difference/mean": 0.01834474503993988, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 211.78125, + "completions/mean_terminated_length": 211.78125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.46811556816101074, + "epoch": 0.02654867256637168, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9886943375248237, + "kl": 0.0005040219402872026, + "learning_rate": 1.238938053097345e-07, + "loss": 0.0123, + "num_tokens": 345671.0, + "reward": 0.5, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5770877599716187, + "sampling/importance_sampling_ratio/mean": 1.0002729892730713, + "sampling/importance_sampling_ratio/min": 0.5805851221084595, + "sampling/sampling_logp_difference/max": 0.5437189340591431, + "sampling/sampling_logp_difference/mean": 0.01677098125219345, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 195.515625, + "completions/mean_terminated_length": 195.515625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.2952774167060852, + "epoch": 0.02831858407079646, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8590553181921636, + "kl": 0.0004932324518449605, + "learning_rate": 1.327433628318584e-07, + "loss": -0.0, + "num_tokens": 369400.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.510075569152832, + "sampling/importance_sampling_ratio/mean": 0.9999840259552002, + "sampling/importance_sampling_ratio/min": 0.5866603255271912, + "sampling/sampling_logp_difference/max": 0.5333092212677002, + "sampling/sampling_logp_difference/mean": 0.01272563450038433, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 227.71875, + "completions/mean_terminated_length": 227.71875, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.44876500964164734, + "epoch": 0.03008849557522124, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.917896025193256, + "kl": 0.000513352919369936, + "learning_rate": 1.4159292035398229e-07, + "loss": -0.0413, + "num_tokens": 394518.0, + "reward": 0.53125, + "reward_std": 0.42516323924064636, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5742335319519043, + "sampling/importance_sampling_ratio/mean": 0.9999843835830688, + "sampling/importance_sampling_ratio/min": 0.6992347240447998, + "sampling/sampling_logp_difference/max": 0.4537684917449951, + "sampling/sampling_logp_difference/mean": 0.01575635001063347, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 218.25, + "completions/mean_terminated_length": 218.25, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.34195274114608765, + "epoch": 0.03185840707964602, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.3005797089893403, + "kl": 0.0004412387206684798, + "learning_rate": 1.504424778761062e-07, + "loss": 0.0777, + "num_tokens": 418870.0, + "reward": 0.8125, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.6629990339279175, + "sampling/importance_sampling_ratio/mean": 0.9997455477714539, + "sampling/importance_sampling_ratio/min": 0.7294662594795227, + "sampling/sampling_logp_difference/max": 0.5086226463317871, + "sampling/sampling_logp_difference/mean": 0.012046229094266891, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 175.28125, + "completions/mean_terminated_length": 175.28125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.4359934329986572, + "epoch": 0.033628318584070796, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0600244106746555, + "kl": 0.00039431609911844134, + "learning_rate": 1.5929203539823007e-07, + "loss": 0.0539, + "num_tokens": 441736.0, + "reward": 0.0625, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.3352776765823364, + "sampling/importance_sampling_ratio/mean": 1.0002167224884033, + "sampling/importance_sampling_ratio/min": 0.6985317468643188, + "sampling/sampling_logp_difference/max": 0.35877466201782227, + "sampling/sampling_logp_difference/mean": 0.015411154367029667, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 300.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 130.890625, + "completions/mean_terminated_length": 130.890625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.3688535988330841, + "epoch": 0.035398230088495575, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0783491352584433, + "kl": 0.0005861029494553804, + "learning_rate": 1.68141592920354e-07, + "loss": -0.0158, + "num_tokens": 460049.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4999357461929321, + "sampling/importance_sampling_ratio/mean": 1.000003695487976, + "sampling/importance_sampling_ratio/min": 0.6543043851852417, + "sampling/sampling_logp_difference/max": 0.424182653427124, + "sampling/sampling_logp_difference/mean": 0.01564677618443966, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 192.875, + "completions/mean_terminated_length": 192.875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.36204296350479126, + "epoch": 0.03716814159292035, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8896725406897013, + "kl": 0.00045308793778531253, + "learning_rate": 1.7699115044247788e-07, + "loss": 0.0343, + "num_tokens": 482745.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.435202717781067, + "sampling/importance_sampling_ratio/mean": 0.9992131590843201, + "sampling/importance_sampling_ratio/min": 0.6173391342163086, + "sampling/sampling_logp_difference/max": 0.48233675956726074, + "sampling/sampling_logp_difference/mean": 0.014494507573544979, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 220.671875, + "completions/mean_terminated_length": 220.671875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.45146217942237854, + "epoch": 0.03893805309734513, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8407237808788562, + "kl": 0.0005051965126767755, + "learning_rate": 1.8584070796460178e-07, + "loss": -0.0017, + "num_tokens": 509556.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.4402657747268677, + "sampling/importance_sampling_ratio/mean": 1.0001249313354492, + "sampling/importance_sampling_ratio/min": 0.6482202410697937, + "sampling/sampling_logp_difference/max": 0.4335247278213501, + "sampling/sampling_logp_difference/mean": 0.015440289862453938, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 199.96875, + "completions/mean_terminated_length": 199.96875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.46031802892684937, + "epoch": 0.04070796460176991, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.4091022344664308, + "kl": 0.0005111101781949401, + "learning_rate": 1.9469026548672566e-07, + "loss": 0.0234, + "num_tokens": 532722.0, + "reward": 0.4375, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.442039966583252, + "sampling/importance_sampling_ratio/mean": 1.0001800060272217, + "sampling/importance_sampling_ratio/min": 0.6624577641487122, + "sampling/sampling_logp_difference/max": 0.41179847717285156, + "sampling/sampling_logp_difference/mean": 0.016388865187764168, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 192.15625, + "completions/mean_terminated_length": 192.15625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.3538476824760437, + "epoch": 0.04247787610619469, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8647717655608781, + "kl": 0.0006137521704658866, + "learning_rate": 2.0353982300884956e-07, + "loss": -0.024, + "num_tokens": 555900.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.5973563194274902, + "sampling/importance_sampling_ratio/mean": 0.9996161460876465, + "sampling/importance_sampling_ratio/min": 0.5685754418373108, + "sampling/sampling_logp_difference/max": 0.5646212697029114, + "sampling/sampling_logp_difference/mean": 0.015036176890134811, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 193.9375, + "completions/mean_terminated_length": 193.9375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.45706120133399963, + "epoch": 0.04424778761061947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0021691898248266707, + "kl": 0.0005267454544082284, + "learning_rate": 2.1238938053097344e-07, + "loss": 0.0, + "num_tokens": 580744.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5684353113174438, + "sampling/importance_sampling_ratio/mean": 1.0003420114517212, + "sampling/importance_sampling_ratio/min": 0.6095874309539795, + "sampling/sampling_logp_difference/max": 0.49497294425964355, + "sampling/sampling_logp_difference/mean": 0.015996476635336876, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 258.984375, + "completions/mean_terminated_length": 258.984375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.4292888045310974, + "epoch": 0.04601769911504425, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9510533046556038, + "kl": 0.0004537611675914377, + "learning_rate": 2.2123893805309735e-07, + "loss": -0.0008, + "num_tokens": 608535.0, + "reward": 0.34375, + "reward_std": 0.4597553312778473, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5802814960479736, + "sampling/importance_sampling_ratio/mean": 1.0002368688583374, + "sampling/importance_sampling_ratio/min": 0.6091845035552979, + "sampling/sampling_logp_difference/max": 0.4956340789794922, + "sampling/sampling_logp_difference/mean": 0.014350948855280876, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 153.484375, + "completions/mean_terminated_length": 153.484375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.37147828936576843, + "epoch": 0.047787610619469026, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0026069699616681308, + "kl": 0.0006182813085615635, + "learning_rate": 2.3008849557522122e-07, + "loss": 0.0, + "num_tokens": 628438.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4643288850784302, + "sampling/importance_sampling_ratio/mean": 0.999936580657959, + "sampling/importance_sampling_ratio/min": 0.6089222431182861, + "sampling/sampling_logp_difference/max": 0.4960646629333496, + "sampling/sampling_logp_difference/mean": 0.014883618801832199, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 220.875, + "completions/mean_terminated_length": 220.875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.4160059094429016, + "epoch": 0.049557522123893805, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0446907347518073, + "kl": 0.0004597888619173318, + "learning_rate": 2.3893805309734513e-07, + "loss": -0.0312, + "num_tokens": 653774.0, + "reward": 0.15625, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.5278409719467163, + "sampling/importance_sampling_ratio/mean": 0.9999281167984009, + "sampling/importance_sampling_ratio/min": 0.6271337866783142, + "sampling/sampling_logp_difference/max": 0.4665954113006592, + "sampling/sampling_logp_difference/mean": 0.014859693124890327, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 130.453125, + "completions/mean_terminated_length": 130.453125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.3917607367038727, + "epoch": 0.05132743362831858, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.334381952357411, + "kl": 0.0006424708990380168, + "learning_rate": 2.47787610619469e-07, + "loss": -0.0282, + "num_tokens": 672555.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.620834231376648, + "sampling/importance_sampling_ratio/mean": 0.9999610185623169, + "sampling/importance_sampling_ratio/min": 0.6735600233078003, + "sampling/sampling_logp_difference/max": 0.4829409122467041, + "sampling/sampling_logp_difference/mean": 0.01759732887148857, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 227.421875, + "completions/mean_terminated_length": 227.421875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.49635761976242065, + "epoch": 0.05309734513274336, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5970116235546846, + "kl": 0.00038929813308641315, + "learning_rate": 2.5663716814159294e-07, + "loss": 0.0058, + "num_tokens": 699190.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5870181322097778, + "sampling/importance_sampling_ratio/mean": 1.0004205703735352, + "sampling/importance_sampling_ratio/min": 0.6560810208320618, + "sampling/sampling_logp_difference/max": 0.4618568420410156, + "sampling/sampling_logp_difference/mean": 0.01632857695221901, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1233.0, + "completions/max_terminated_length": 1233.0, + "completions/mean_length": 328.40625, + "completions/mean_terminated_length": 328.40625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.3582524061203003, + "epoch": 0.05486725663716814, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8346609834753668, + "kl": 0.0003085409989580512, + "learning_rate": 2.654867256637168e-07, + "loss": 0.0094, + "num_tokens": 732208.0, + "reward": 0.65625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.6506547927856445, + "sampling/importance_sampling_ratio/mean": 1.0000078678131104, + "sampling/importance_sampling_ratio/min": 0.5198546648025513, + "sampling/sampling_logp_difference/max": 0.6542060375213623, + "sampling/sampling_logp_difference/mean": 0.012050572782754898, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 230.21875, + "completions/mean_terminated_length": 230.21875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.4924032390117645, + "epoch": 0.05663716814159292, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9186703210868864, + "kl": 0.00048273595166392624, + "learning_rate": 2.743362831858407e-07, + "loss": 0.0103, + "num_tokens": 758846.0, + "reward": 0.21875, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.5340979099273682, + "sampling/importance_sampling_ratio/mean": 0.9997835159301758, + "sampling/importance_sampling_ratio/min": 0.7198682427406311, + "sampling/sampling_logp_difference/max": 0.42794251441955566, + "sampling/sampling_logp_difference/mean": 0.016245532780885696, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 172.765625, + "completions/mean_terminated_length": 172.765625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.46681106090545654, + "epoch": 0.0584070796460177, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.92359418250283, + "kl": 0.0004580853274092078, + "learning_rate": 2.8318584070796457e-07, + "loss": 0.0201, + "num_tokens": 781055.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.3864423036575317, + "sampling/importance_sampling_ratio/mean": 1.00001859664917, + "sampling/importance_sampling_ratio/min": 0.6857555508613586, + "sampling/sampling_logp_difference/max": 0.37723398208618164, + "sampling/sampling_logp_difference/mean": 0.016717787832021713, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 191.203125, + "completions/mean_terminated_length": 191.203125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.4282730221748352, + "epoch": 0.06017699115044248, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8838626164036724, + "kl": 0.0005225211498327553, + "learning_rate": 2.920353982300885e-07, + "loss": 0.0266, + "num_tokens": 804764.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.4427217245101929, + "sampling/importance_sampling_ratio/mean": 1.0002740621566772, + "sampling/importance_sampling_ratio/min": 0.7011700868606567, + "sampling/sampling_logp_difference/max": 0.3665313720703125, + "sampling/sampling_logp_difference/mean": 0.01491781510412693, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 145.171875, + "completions/mean_terminated_length": 145.171875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.36615437269210815, + "epoch": 0.061946902654867256, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1755523692983083, + "kl": 0.0006177417235448956, + "learning_rate": 3.008849557522124e-07, + "loss": -0.0171, + "num_tokens": 827495.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.485703468322754, + "sampling/importance_sampling_ratio/mean": 1.0003960132598877, + "sampling/importance_sampling_ratio/min": 0.31634464859962463, + "sampling/sampling_logp_difference/max": 1.1509230136871338, + "sampling/sampling_logp_difference/mean": 0.015536945313215256, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 217.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 103.125, + "completions/mean_terminated_length": 103.125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.245346337556839, + "epoch": 0.06371681415929203, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004783989031599875, + "kl": 0.0007909094565548003, + "learning_rate": 3.0973451327433626e-07, + "loss": 0.0, + "num_tokens": 843471.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6225351095199585, + "sampling/importance_sampling_ratio/mean": 1.0003888607025146, + "sampling/importance_sampling_ratio/min": 0.620156466960907, + "sampling/sampling_logp_difference/max": 0.4839897155761719, + "sampling/sampling_logp_difference/mean": 0.014604487456381321, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 193.90625, + "completions/mean_terminated_length": 193.90625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.4967886805534363, + "epoch": 0.06548672566371681, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.105159124561485, + "kl": 0.0006542068440467119, + "learning_rate": 3.1858407079646014e-07, + "loss": -0.0257, + "num_tokens": 870345.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.4062508344650269, + "sampling/importance_sampling_ratio/mean": 0.9996336698532104, + "sampling/importance_sampling_ratio/min": 0.5721895098686218, + "sampling/sampling_logp_difference/max": 0.5582849979400635, + "sampling/sampling_logp_difference/mean": 0.01660425029695034, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 145.34375, + "completions/mean_terminated_length": 145.34375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.3922208547592163, + "epoch": 0.06725663716814159, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0038207811778539903, + "kl": 0.000669948582071811, + "learning_rate": 3.2743362831858407e-07, + "loss": 0.0, + "num_tokens": 889455.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.567585825920105, + "sampling/importance_sampling_ratio/mean": 1.0002198219299316, + "sampling/importance_sampling_ratio/min": 0.7432371973991394, + "sampling/sampling_logp_difference/max": 0.4495368003845215, + "sampling/sampling_logp_difference/mean": 0.015564528293907642, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 134.453125, + "completions/mean_terminated_length": 134.453125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.2876589298248291, + "epoch": 0.06902654867256637, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0445104539737837, + "kl": 0.0006731583271175623, + "learning_rate": 3.36283185840708e-07, + "loss": 0.0331, + "num_tokens": 907452.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.3671373128890991, + "sampling/importance_sampling_ratio/mean": 0.9999994039535522, + "sampling/importance_sampling_ratio/min": 0.6149933934211731, + "sampling/sampling_logp_difference/max": 0.4861437678337097, + "sampling/sampling_logp_difference/mean": 0.013353753834962845, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 171.296875, + "completions/mean_terminated_length": 171.296875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.3427136242389679, + "epoch": 0.07079646017699115, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9039518340255397, + "kl": 0.0006144886719994247, + "learning_rate": 3.451327433628318e-07, + "loss": -0.0115, + "num_tokens": 929663.0, + "reward": -0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.3595855236053467, + "sampling/importance_sampling_ratio/mean": 1.0000739097595215, + "sampling/importance_sampling_ratio/min": 0.6020215153694153, + "sampling/sampling_logp_difference/max": 0.5074621438980103, + "sampling/sampling_logp_difference/mean": 0.015125567093491554, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 208.9375, + "completions/mean_terminated_length": 208.9375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.5422006845474243, + "epoch": 0.07256637168141593, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.214409285618555, + "kl": 0.0005240167956799269, + "learning_rate": 3.5398230088495575e-07, + "loss": -0.0523, + "num_tokens": 956411.0, + "reward": 0.46875, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.2793256044387817, + "sampling/importance_sampling_ratio/mean": 0.9999217391014099, + "sampling/importance_sampling_ratio/min": 0.6142622828483582, + "sampling/sampling_logp_difference/max": 0.4873332977294922, + "sampling/sampling_logp_difference/mean": 0.016808750107884407, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 219.953125, + "completions/mean_terminated_length": 219.953125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.490911602973938, + "epoch": 0.0743362831858407, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.975297783397351, + "kl": 0.000842518697027117, + "learning_rate": 3.6283185840707963e-07, + "loss": -0.0121, + "num_tokens": 982088.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6122313737869263, + "sampling/importance_sampling_ratio/mean": 0.9994571208953857, + "sampling/importance_sampling_ratio/min": 0.6460330486297607, + "sampling/sampling_logp_difference/max": 0.4776191711425781, + "sampling/sampling_logp_difference/mean": 0.017350424081087112, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.0, + "completions/max_terminated_length": 499.0, + "completions/mean_length": 198.140625, + "completions/mean_terminated_length": 198.140625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.496207058429718, + "epoch": 0.07610619469026549, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1555111520132804, + "kl": 0.0008575224783271551, + "learning_rate": 3.7168141592920356e-07, + "loss": 0.027, + "num_tokens": 1005521.0, + "reward": 0.0, + "reward_std": 0.4787135720252991, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3447661399841309, + "sampling/importance_sampling_ratio/mean": 1.000145673751831, + "sampling/importance_sampling_ratio/min": 0.6634519100189209, + "sampling/sampling_logp_difference/max": 0.4102989435195923, + "sampling/sampling_logp_difference/mean": 0.016248438507318497, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 172.140625, + "completions/mean_terminated_length": 172.140625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.39261263608932495, + "epoch": 0.07787610619469026, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.004234071258287348, + "kl": 0.0007658154936507344, + "learning_rate": 3.805309734513274e-07, + "loss": 0.0, + "num_tokens": 1028506.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5592234134674072, + "sampling/importance_sampling_ratio/mean": 1.0003376007080078, + "sampling/importance_sampling_ratio/min": 0.6684509515762329, + "sampling/sampling_logp_difference/max": 0.44418787956237793, + "sampling/sampling_logp_difference/mean": 0.016881383955478668, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 132.15625, + "completions/mean_terminated_length": 132.15625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.28721314668655396, + "epoch": 0.07964601769911504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.005064502940089037, + "kl": 0.0008006412535905838, + "learning_rate": 3.893805309734513e-07, + "loss": 0.0, + "num_tokens": 1046020.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3631056547164917, + "sampling/importance_sampling_ratio/mean": 0.9993762969970703, + "sampling/importance_sampling_ratio/min": 0.6436959505081177, + "sampling/sampling_logp_difference/max": 0.44052886962890625, + "sampling/sampling_logp_difference/mean": 0.014048042707145214, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 808.0, + "completions/max_terminated_length": 808.0, + "completions/mean_length": 255.78125, + "completions/mean_terminated_length": 255.78125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.3829498291015625, + "epoch": 0.08141592920353982, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.09197729542159, + "kl": 0.0007642432465218008, + "learning_rate": 3.982300884955752e-07, + "loss": 0.0471, + "num_tokens": 1073174.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4337151050567627, + "sampling/importance_sampling_ratio/mean": 0.9996473789215088, + "sampling/importance_sampling_ratio/min": 0.7803710103034973, + "sampling/sampling_logp_difference/max": 0.36026906967163086, + "sampling/sampling_logp_difference/mean": 0.013885672204196453, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 664.0, + "completions/max_terminated_length": 664.0, + "completions/mean_length": 208.265625, + "completions/mean_terminated_length": 208.265625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.44653117656707764, + "epoch": 0.0831858407079646, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0658224190385082, + "kl": 0.0009269802249036729, + "learning_rate": 4.0707964601769913e-07, + "loss": 0.0345, + "num_tokens": 1097095.0, + "reward": -0.34375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": -0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.3222029209136963, + "sampling/importance_sampling_ratio/mean": 0.9996322393417358, + "sampling/importance_sampling_ratio/min": 0.689879834651947, + "sampling/sampling_logp_difference/max": 0.3712378740310669, + "sampling/sampling_logp_difference/mean": 0.01496546808630228, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 672.0, + "completions/max_terminated_length": 672.0, + "completions/mean_length": 212.78125, + "completions/mean_terminated_length": 212.78125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.37079882621765137, + "epoch": 0.08495575221238938, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.057700328676415, + "kl": 0.0008619067957624793, + "learning_rate": 4.1592920353982295e-07, + "loss": -0.002, + "num_tokens": 1120665.0, + "reward": 0.875, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.3062375783920288, + "sampling/importance_sampling_ratio/mean": 0.9999436140060425, + "sampling/importance_sampling_ratio/min": 0.654721736907959, + "sampling/sampling_logp_difference/max": 0.4235450029373169, + "sampling/sampling_logp_difference/mean": 0.014391312375664711, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 680.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 159.546875, + "completions/mean_terminated_length": 159.546875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.3836078345775604, + "epoch": 0.08672566371681416, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9627634721725993, + "kl": 0.001014052890241146, + "learning_rate": 4.247787610619469e-07, + "loss": -0.0447, + "num_tokens": 1140876.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.3341999053955078, + "sampling/importance_sampling_ratio/mean": 1.000030517578125, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.45119690895080566, + "sampling/sampling_logp_difference/mean": 0.015728633850812912, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 162.1875, + "completions/mean_terminated_length": 162.1875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.46104708313941956, + "epoch": 0.08849557522123894, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.434499473471128, + "kl": 0.0014803860103711486, + "learning_rate": 4.3362831858407076e-07, + "loss": 0.0053, + "num_tokens": 1161736.0, + "reward": 0.40625, + "reward_std": 0.47978055477142334, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.4056953191757202, + "sampling/importance_sampling_ratio/mean": 0.9999998807907104, + "sampling/importance_sampling_ratio/min": 0.6213480234146118, + "sampling/sampling_logp_difference/max": 0.4758639335632324, + "sampling/sampling_logp_difference/mean": 0.016189994290471077, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 894.0, + "completions/max_terminated_length": 894.0, + "completions/mean_length": 178.53125, + "completions/mean_terminated_length": 178.53125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3214467167854309, + "epoch": 0.09026548672566372, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1180849217318127, + "kl": 0.001077151857316494, + "learning_rate": 4.424778761061947e-07, + "loss": 0.1117, + "num_tokens": 1183514.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.4481030702590942, + "sampling/importance_sampling_ratio/mean": 1.0001425743103027, + "sampling/importance_sampling_ratio/min": 0.6622359156608582, + "sampling/sampling_logp_difference/max": 0.41213345527648926, + "sampling/sampling_logp_difference/mean": 0.013998802751302719, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 168.21875, + "completions/mean_terminated_length": 168.21875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.4382091164588928, + "epoch": 0.0920353982300885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0049643263246611285, + "kl": 0.0008880996610969305, + "learning_rate": 4.5132743362831857e-07, + "loss": 0.0, + "num_tokens": 1208600.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4654942750930786, + "sampling/importance_sampling_ratio/mean": 0.9995968341827393, + "sampling/importance_sampling_ratio/min": 0.4864470064640045, + "sampling/sampling_logp_difference/max": 0.7206273078918457, + "sampling/sampling_logp_difference/mean": 0.015631159767508507, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 224.921875, + "completions/mean_terminated_length": 224.921875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.5588756799697876, + "epoch": 0.09380530973451327, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7168121498298733, + "kl": 0.0012564393691718578, + "learning_rate": 4.6017699115044245e-07, + "loss": 0.0129, + "num_tokens": 1235331.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.325711727142334, + "sampling/importance_sampling_ratio/mean": 1.0006237030029297, + "sampling/importance_sampling_ratio/min": 0.6163890361785889, + "sampling/sampling_logp_difference/max": 0.48387694358825684, + "sampling/sampling_logp_difference/mean": 0.017355667427182198, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 217.265625, + "completions/mean_terminated_length": 217.265625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.44299280643463135, + "epoch": 0.09557522123893805, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1516076883842818, + "kl": 0.0011872118338942528, + "learning_rate": 4.690265486725664e-07, + "loss": -0.0125, + "num_tokens": 1260852.0, + "reward": 0.0625, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.3174514770507812, + "sampling/importance_sampling_ratio/mean": 1.0000274181365967, + "sampling/importance_sampling_ratio/min": 0.6112026572227478, + "sampling/sampling_logp_difference/max": 0.4923267364501953, + "sampling/sampling_logp_difference/mean": 0.015816478058695793, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 151.8125, + "completions/mean_terminated_length": 151.8125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.34421470761299133, + "epoch": 0.09734513274336283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.006991936916443441, + "kl": 0.001308807055465877, + "learning_rate": 4.778761061946903e-07, + "loss": 0.0, + "num_tokens": 1281288.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2738009691238403, + "sampling/importance_sampling_ratio/mean": 0.999666690826416, + "sampling/importance_sampling_ratio/min": 0.6147581934928894, + "sampling/sampling_logp_difference/max": 0.4865262508392334, + "sampling/sampling_logp_difference/mean": 0.014791518449783325, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 190.765625, + "completions/mean_terminated_length": 190.765625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.38345327973365784, + "epoch": 0.09911504424778761, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2417819990578034, + "kl": 0.003148602321743965, + "learning_rate": 4.867256637168141e-07, + "loss": 0.0063, + "num_tokens": 1306313.0, + "reward": 0.4375, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.4671752452850342, + "sampling/importance_sampling_ratio/mean": 0.9994722604751587, + "sampling/importance_sampling_ratio/min": 0.5983425974845886, + "sampling/sampling_logp_difference/max": 0.5135917663574219, + "sampling/sampling_logp_difference/mean": 0.015144657343626022, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 149.453125, + "completions/mean_terminated_length": 149.453125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.3478190302848816, + "epoch": 0.10088495575221239, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1688785025902486, + "kl": 0.0018540058517828584, + "learning_rate": 4.95575221238938e-07, + "loss": 0.0011, + "num_tokens": 1325606.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.333213448524475, + "sampling/importance_sampling_ratio/mean": 1.000150203704834, + "sampling/importance_sampling_ratio/min": 0.6299831867218018, + "sampling/sampling_logp_difference/max": 0.46206212043762207, + "sampling/sampling_logp_difference/mean": 0.014898328110575676, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 189.328125, + "completions/mean_terminated_length": 189.328125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.29512307047843933, + "epoch": 0.10265486725663717, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.969598367643197, + "kl": 0.0023700199089944363, + "learning_rate": 5.044247787610619e-07, + "loss": -0.0002, + "num_tokens": 1348667.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.5592995882034302, + "sampling/importance_sampling_ratio/mean": 1.0008137226104736, + "sampling/importance_sampling_ratio/min": 0.609809935092926, + "sampling/sampling_logp_difference/max": 0.49460792541503906, + "sampling/sampling_logp_difference/mean": 0.013864312320947647, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 245.796875, + "completions/mean_terminated_length": 245.796875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.6066912412643433, + "epoch": 0.10442477876106195, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.889969071338536, + "kl": 0.004439334850758314, + "learning_rate": 5.132743362831859e-07, + "loss": -0.0074, + "num_tokens": 1378366.0, + "reward": 0.75, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.6305538415908813, + "sampling/importance_sampling_ratio/mean": 1.0001251697540283, + "sampling/importance_sampling_ratio/min": 0.6057105660438538, + "sampling/sampling_logp_difference/max": 0.5013530254364014, + "sampling/sampling_logp_difference/mean": 0.018475841730833054, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 903.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 290.875, + "completions/mean_terminated_length": 290.875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.44423145055770874, + "epoch": 0.10619469026548672, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.1032104959320381, + "kl": 0.0033573941327631474, + "learning_rate": 5.221238938053097e-07, + "loss": 0.0037, + "num_tokens": 1407718.0, + "reward": 0.0625, + "reward_std": 0.6707825064659119, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.2653532028198242, + "sampling/importance_sampling_ratio/mean": 0.9998553991317749, + "sampling/importance_sampling_ratio/min": 0.6232948303222656, + "sampling/sampling_logp_difference/max": 0.4727356433868408, + "sampling/sampling_logp_difference/mean": 0.01365312747657299, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 223.703125, + "completions/mean_terminated_length": 223.703125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.4510975182056427, + "epoch": 0.1079646017699115, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.3626532503704003, + "kl": 0.009123655967414379, + "learning_rate": 5.309734513274336e-07, + "loss": 0.0482, + "num_tokens": 1435811.0, + "reward": 0.25, + "reward_std": 0.7191373109817505, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.5612155199050903, + "sampling/importance_sampling_ratio/mean": 0.9997560381889343, + "sampling/importance_sampling_ratio/min": 0.6180218458175659, + "sampling/sampling_logp_difference/max": 0.4812314510345459, + "sampling/sampling_logp_difference/mean": 0.016338439658284187, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 192.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 106.75, + "completions/mean_terminated_length": 106.75, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.3070604205131531, + "epoch": 0.10973451327433628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020025897738762826, + "kl": 0.004569772630929947, + "learning_rate": 5.398230088495575e-07, + "loss": 0.0001, + "num_tokens": 1452611.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4657670259475708, + "sampling/importance_sampling_ratio/mean": 1.000557541847229, + "sampling/importance_sampling_ratio/min": 0.6666322946548462, + "sampling/sampling_logp_difference/max": 0.4055166244506836, + "sampling/sampling_logp_difference/mean": 0.01649780571460724, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 188.5, + "completions/mean_terminated_length": 188.5, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.3099520206451416, + "epoch": 0.11150442477876106, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6726922824460149, + "kl": 0.0026066922582685947, + "learning_rate": 5.486725663716814e-07, + "loss": 0.0253, + "num_tokens": 1475731.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.5191065073013306, + "sampling/importance_sampling_ratio/mean": 1.000030517578125, + "sampling/importance_sampling_ratio/min": 0.6821218729019165, + "sampling/sampling_logp_difference/max": 0.4181222915649414, + "sampling/sampling_logp_difference/mean": 0.012346604838967323, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 208.984375, + "completions/mean_terminated_length": 208.984375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.45609599351882935, + "epoch": 0.11327433628318584, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3096243389823579, + "kl": 0.011606791988015175, + "learning_rate": 5.575221238938052e-07, + "loss": 0.0161, + "num_tokens": 1501154.0, + "reward": 0.53125, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4819767475128174, + "sampling/importance_sampling_ratio/mean": 1.0002541542053223, + "sampling/importance_sampling_ratio/min": 0.6301558017730713, + "sampling/sampling_logp_difference/max": 0.4617881774902344, + "sampling/sampling_logp_difference/mean": 0.015784818679094315, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 161.09375, + "completions/mean_terminated_length": 161.09375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.3594406843185425, + "epoch": 0.11504424778761062, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1443597720195766, + "kl": 0.00754031864926219, + "learning_rate": 5.663716814159291e-07, + "loss": 0.0308, + "num_tokens": 1521544.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4212729930877686, + "sampling/importance_sampling_ratio/mean": 0.9998541474342346, + "sampling/importance_sampling_ratio/min": 0.6153431534767151, + "sampling/sampling_logp_difference/max": 0.48557519912719727, + "sampling/sampling_logp_difference/mean": 0.014111566357314587, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 164.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 101.125, + "completions/mean_terminated_length": 101.125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.24357709288597107, + "epoch": 0.1168141592920354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024629826609836886, + "kl": 0.0035986611619591713, + "learning_rate": 5.752212389380531e-07, + "loss": 0.0, + "num_tokens": 1537696.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.377199649810791, + "sampling/importance_sampling_ratio/mean": 1.0006064176559448, + "sampling/importance_sampling_ratio/min": 0.637252151966095, + "sampling/sampling_logp_difference/max": 0.4505898952484131, + "sampling/sampling_logp_difference/mean": 0.013590476475656033, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 178.5625, + "completions/mean_terminated_length": 178.5625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.3457975387573242, + "epoch": 0.11858407079646018, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8047084844593525, + "kl": 0.018204988911747932, + "learning_rate": 5.84070796460177e-07, + "loss": -0.0068, + "num_tokens": 1558852.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.29731285572052, + "sampling/importance_sampling_ratio/mean": 0.9993517398834229, + "sampling/importance_sampling_ratio/min": 0.695947527885437, + "sampling/sampling_logp_difference/max": 0.3624809980392456, + "sampling/sampling_logp_difference/mean": 0.013106441125273705, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1013.0, + "completions/max_terminated_length": 1013.0, + "completions/mean_length": 194.1875, + "completions/mean_terminated_length": 194.1875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.39951688051223755, + "epoch": 0.12035398230088495, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2475307094295822, + "kl": 0.00963111873716116, + "learning_rate": 5.929203539823009e-07, + "loss": -0.0971, + "num_tokens": 1580976.0, + "reward": 0.375, + "reward_std": 0.4577302038669586, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.4656713008880615, + "sampling/importance_sampling_ratio/mean": 0.9998877644538879, + "sampling/importance_sampling_ratio/min": 0.6100970506668091, + "sampling/sampling_logp_difference/max": 0.4941372871398926, + "sampling/sampling_logp_difference/mean": 0.014960943721234798, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 187.609375, + "completions/mean_terminated_length": 187.609375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.3969162106513977, + "epoch": 0.12212389380530973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026519157980291592, + "kl": 0.011400602757930756, + "learning_rate": 6.017699115044248e-07, + "loss": 0.0001, + "num_tokens": 1603575.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6854522228240967, + "sampling/importance_sampling_ratio/mean": 1.000511884689331, + "sampling/importance_sampling_ratio/min": 0.6474390625953674, + "sampling/sampling_logp_difference/max": 0.5220339298248291, + "sampling/sampling_logp_difference/mean": 0.015393278561532497, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 177.546875, + "completions/mean_terminated_length": 177.546875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.3258362412452698, + "epoch": 0.12389380530973451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9490586376721595, + "kl": 0.0063695767894387245, + "learning_rate": 6.106194690265486e-07, + "loss": 0.0197, + "num_tokens": 1626794.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.853153944015503, + "sampling/importance_sampling_ratio/mean": 1.000276803970337, + "sampling/importance_sampling_ratio/min": 0.5364962816238403, + "sampling/sampling_logp_difference/max": 0.6226956844329834, + "sampling/sampling_logp_difference/mean": 0.014140763320028782, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 157.625, + "completions/mean_terminated_length": 157.625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.3936723470687866, + "epoch": 0.1256637168141593, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.619913369316984, + "kl": 0.013661106117069721, + "learning_rate": 6.194690265486725e-07, + "loss": -0.04, + "num_tokens": 1647106.0, + "reward": 0.40625, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.536644697189331, + "sampling/importance_sampling_ratio/mean": 1.000814437866211, + "sampling/importance_sampling_ratio/min": 0.6920749545097351, + "sampling/sampling_logp_difference/max": 0.42960119247436523, + "sampling/sampling_logp_difference/mean": 0.016903996467590332, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 155.953125, + "completions/mean_terminated_length": 155.953125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.38842615485191345, + "epoch": 0.12743362831858407, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9952886581157503, + "kl": 0.011372420005500317, + "learning_rate": 6.283185840707964e-07, + "loss": -0.0232, + "num_tokens": 1667471.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.597475528717041, + "sampling/importance_sampling_ratio/mean": 1.0003933906555176, + "sampling/importance_sampling_ratio/min": 0.6933631300926208, + "sampling/sampling_logp_difference/max": 0.46842455863952637, + "sampling/sampling_logp_difference/mean": 0.015585072338581085, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 118.015625, + "completions/mean_terminated_length": 118.015625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.254025399684906, + "epoch": 0.12920353982300886, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04284108299035032, + "kl": 0.01039934903383255, + "learning_rate": 6.371681415929203e-07, + "loss": 0.0001, + "num_tokens": 1684752.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.504378080368042, + "sampling/importance_sampling_ratio/mean": 0.9999960064888, + "sampling/importance_sampling_ratio/min": 0.649052083492279, + "sampling/sampling_logp_difference/max": 0.4322422742843628, + "sampling/sampling_logp_difference/mean": 0.013781622983515263, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 201.21875, + "completions/mean_terminated_length": 201.21875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.3895375430583954, + "epoch": 0.13097345132743363, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8645019235320143, + "kl": 0.01647133380174637, + "learning_rate": 6.460176991150442e-07, + "loss": -0.0116, + "num_tokens": 1707614.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6088998317718506, + "sampling/importance_sampling_ratio/mean": 1.0000102519989014, + "sampling/importance_sampling_ratio/min": 0.7015503644943237, + "sampling/sampling_logp_difference/max": 0.47555065155029297, + "sampling/sampling_logp_difference/mean": 0.014648362062871456, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 218.59375, + "completions/mean_terminated_length": 218.59375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.4297330379486084, + "epoch": 0.13274336283185842, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9546869949633048, + "kl": 0.015366164967417717, + "learning_rate": 6.548672566371681e-07, + "loss": 0.0099, + "num_tokens": 1732084.0, + "reward": 0.375, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.492048978805542, + "sampling/importance_sampling_ratio/mean": 1.0000102519989014, + "sampling/importance_sampling_ratio/min": 0.695496678352356, + "sampling/sampling_logp_difference/max": 0.4001502990722656, + "sampling/sampling_logp_difference/mean": 0.015646368265151978, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 133.046875, + "completions/mean_terminated_length": 133.046875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.32887911796569824, + "epoch": 0.13451327433628318, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02094079875903086, + "kl": 0.009953726083040237, + "learning_rate": 6.637168141592921e-07, + "loss": 0.0001, + "num_tokens": 1751687.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5874511003494263, + "sampling/importance_sampling_ratio/mean": 0.9999567270278931, + "sampling/importance_sampling_ratio/min": 0.7372974753379822, + "sampling/sampling_logp_difference/max": 0.4621295928955078, + "sampling/sampling_logp_difference/mean": 0.0152928177267313, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 146.71875, + "completions/mean_terminated_length": 146.71875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.3579290211200714, + "epoch": 0.13628318584070798, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.97467148237027, + "kl": 0.012682763859629631, + "learning_rate": 6.72566371681416e-07, + "loss": -0.0063, + "num_tokens": 1770389.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.6267775297164917, + "sampling/importance_sampling_ratio/mean": 1.0003312826156616, + "sampling/importance_sampling_ratio/min": 0.6440848112106323, + "sampling/sampling_logp_difference/max": 0.4866011142730713, + "sampling/sampling_logp_difference/mean": 0.015002220869064331, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 145.59375, + "completions/mean_terminated_length": 145.59375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.3425063192844391, + "epoch": 0.13805309734513274, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1006363356593576, + "kl": 0.0066282302141189575, + "learning_rate": 6.814159292035397e-07, + "loss": 0.0246, + "num_tokens": 1794123.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.5020347833633423, + "sampling/importance_sampling_ratio/mean": 0.9996710419654846, + "sampling/importance_sampling_ratio/min": 0.5523310303688049, + "sampling/sampling_logp_difference/max": 0.5936076641082764, + "sampling/sampling_logp_difference/mean": 0.014506646431982517, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 155.078125, + "completions/mean_terminated_length": 155.078125, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.2749665379524231, + "epoch": 0.13982300884955753, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017866335754167833, + "kl": 0.006836398039013147, + "learning_rate": 6.902654867256636e-07, + "loss": 0.0001, + "num_tokens": 1814176.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4780030250549316, + "sampling/importance_sampling_ratio/mean": 1.000427484512329, + "sampling/importance_sampling_ratio/min": 0.7656050324440002, + "sampling/sampling_logp_difference/max": 0.390691876411438, + "sampling/sampling_logp_difference/mean": 0.012600925751030445, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 202.546875, + "completions/mean_terminated_length": 202.546875, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.5037556290626526, + "epoch": 0.1415929203539823, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.3805809279315995, + "kl": 0.020283661782741547, + "learning_rate": 6.991150442477876e-07, + "loss": 0.0377, + "num_tokens": 1839651.0, + "reward": 0.59375, + "reward_std": 0.5827301740646362, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.4834307432174683, + "sampling/importance_sampling_ratio/mean": 0.9995615482330322, + "sampling/importance_sampling_ratio/min": 0.7031538486480713, + "sampling/sampling_logp_difference/max": 0.39435744285583496, + "sampling/sampling_logp_difference/mean": 0.017124194651842117, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 186.859375, + "completions/mean_terminated_length": 186.859375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.37872523069381714, + "epoch": 0.1433628318584071, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7996257439925782, + "kl": 0.017041007056832314, + "learning_rate": 7.079646017699115e-07, + "loss": -0.0076, + "num_tokens": 1862890.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.363723635673523, + "sampling/importance_sampling_ratio/mean": 0.9994292259216309, + "sampling/importance_sampling_ratio/min": 0.63686603307724, + "sampling/sampling_logp_difference/max": 0.45119595527648926, + "sampling/sampling_logp_difference/mean": 0.014645487070083618, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 142.71875, + "completions/mean_terminated_length": 142.71875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.4198603928089142, + "epoch": 0.14513274336283186, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0840679594058147, + "kl": 0.007501678541302681, + "learning_rate": 7.168141592920353e-07, + "loss": 0.0294, + "num_tokens": 1884456.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.564056158065796, + "sampling/importance_sampling_ratio/mean": 1.0002379417419434, + "sampling/importance_sampling_ratio/min": 0.6243125796318054, + "sampling/sampling_logp_difference/max": 0.47110414505004883, + "sampling/sampling_logp_difference/mean": 0.016645008698105812, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 161.421875, + "completions/mean_terminated_length": 161.421875, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.39928942918777466, + "epoch": 0.14690265486725665, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5249126492123801, + "kl": 0.008676768280565739, + "learning_rate": 7.256637168141593e-07, + "loss": 0.0494, + "num_tokens": 1906963.0, + "reward": 0.59375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.620729923248291, + "sampling/importance_sampling_ratio/mean": 1.0003002882003784, + "sampling/importance_sampling_ratio/min": 0.7047671675682068, + "sampling/sampling_logp_difference/max": 0.4828765392303467, + "sampling/sampling_logp_difference/mean": 0.016519783064723015, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 163.1875, + "completions/mean_terminated_length": 163.1875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.4313996434211731, + "epoch": 0.1486725663716814, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.028906812237053, + "kl": 0.01829027757048607, + "learning_rate": 7.345132743362832e-07, + "loss": -0.0102, + "num_tokens": 1928543.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.4684945344924927, + "sampling/importance_sampling_ratio/mean": 0.9997948408126831, + "sampling/importance_sampling_ratio/min": 0.685234010219574, + "sampling/sampling_logp_difference/max": 0.38423776626586914, + "sampling/sampling_logp_difference/mean": 0.016351807862520218, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 987.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 211.78125, + "completions/mean_terminated_length": 211.78125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.4603821635246277, + "epoch": 0.1504424778761062, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0361716117603827, + "kl": 0.022909455001354218, + "learning_rate": 7.433628318584071e-07, + "loss": 0.0139, + "num_tokens": 1952721.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.2994089126586914, + "sampling/importance_sampling_ratio/mean": 0.9995492696762085, + "sampling/importance_sampling_ratio/min": 0.6153885126113892, + "sampling/sampling_logp_difference/max": 0.4855015277862549, + "sampling/sampling_logp_difference/mean": 0.01490258239209652, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 125.78125, + "completions/mean_terminated_length": 125.78125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.2793322503566742, + "epoch": 0.15221238938053097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02465176853858683, + "kl": 0.006941329222172499, + "learning_rate": 7.522123893805308e-07, + "loss": 0.0001, + "num_tokens": 1970899.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.367046594619751, + "sampling/importance_sampling_ratio/mean": 0.9998903870582581, + "sampling/importance_sampling_ratio/min": 0.6791725754737854, + "sampling/sampling_logp_difference/max": 0.3868800401687622, + "sampling/sampling_logp_difference/mean": 0.013056832365691662, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 139.671875, + "completions/mean_terminated_length": 139.671875, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.32963648438453674, + "epoch": 0.15398230088495576, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9408148001914002, + "kl": 0.011813381686806679, + "learning_rate": 7.610619469026548e-07, + "loss": -0.0057, + "num_tokens": 1989662.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.3227317333221436, + "sampling/importance_sampling_ratio/mean": 0.9998810291290283, + "sampling/importance_sampling_ratio/min": 0.6637370586395264, + "sampling/sampling_logp_difference/max": 0.4098691940307617, + "sampling/sampling_logp_difference/mean": 0.013913685455918312, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 185.25, + "completions/mean_terminated_length": 185.25, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.40117347240448, + "epoch": 0.15575221238938053, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.03732217114201, + "kl": 0.0204781461507082, + "learning_rate": 7.699115044247787e-07, + "loss": 0.0381, + "num_tokens": 2012414.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.3919247388839722, + "sampling/importance_sampling_ratio/mean": 0.9995261430740356, + "sampling/importance_sampling_ratio/min": 0.6509130597114563, + "sampling/sampling_logp_difference/max": 0.4293792247772217, + "sampling/sampling_logp_difference/mean": 0.016024772077798843, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 128.25, + "completions/mean_terminated_length": 128.25, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.34240853786468506, + "epoch": 0.15752212389380532, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.107699474176166, + "kl": 0.01158861257135868, + "learning_rate": 7.787610619469026e-07, + "loss": -0.0249, + "num_tokens": 2030750.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.3654366731643677, + "sampling/importance_sampling_ratio/mean": 1.000300407409668, + "sampling/importance_sampling_ratio/min": 0.6950644850730896, + "sampling/sampling_logp_difference/max": 0.3637505769729614, + "sampling/sampling_logp_difference/mean": 0.015214118175208569, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 156.1875, + "completions/mean_terminated_length": 156.1875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.3586588203907013, + "epoch": 0.1592920353982301, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.050407924866668, + "kl": 0.012983415275812149, + "learning_rate": 7.876106194690266e-07, + "loss": 0.0361, + "num_tokens": 2054442.0, + "reward": -0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.5333000421524048, + "sampling/importance_sampling_ratio/mean": 0.9996432662010193, + "sampling/importance_sampling_ratio/min": 0.7462313771247864, + "sampling/sampling_logp_difference/max": 0.42742228507995605, + "sampling/sampling_logp_difference/mean": 0.01414406206458807, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 175.96875, + "completions/mean_terminated_length": 175.96875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.5748320817947388, + "epoch": 0.16106194690265488, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.5838592905412583, + "kl": 0.03240203857421875, + "learning_rate": 7.964601769911504e-07, + "loss": 0.039, + "num_tokens": 2082136.0, + "reward": 0.25, + "reward_std": 0.6285127401351929, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.5486781597137451, + "sampling/importance_sampling_ratio/mean": 1.0000864267349243, + "sampling/importance_sampling_ratio/min": 0.6489058136940002, + "sampling/sampling_logp_difference/max": 0.43740177154541016, + "sampling/sampling_logp_difference/mean": 0.01843888685107231, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 165.75, + "completions/mean_terminated_length": 165.75, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.4375103712081909, + "epoch": 0.16283185840707964, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9584750125833097, + "kl": 0.019327480345964432, + "learning_rate": 8.053097345132743e-07, + "loss": -0.0028, + "num_tokens": 2106104.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4734406471252441, + "sampling/importance_sampling_ratio/mean": 0.9997900724411011, + "sampling/importance_sampling_ratio/min": 0.6482194662094116, + "sampling/sampling_logp_difference/max": 0.4335259199142456, + "sampling/sampling_logp_difference/mean": 0.01644751988351345, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 144.203125, + "completions/mean_terminated_length": 144.203125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.3018205165863037, + "epoch": 0.16460176991150444, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03494572984307722, + "kl": 0.010509281419217587, + "learning_rate": 8.141592920353983e-07, + "loss": 0.0001, + "num_tokens": 2126389.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.394669771194458, + "sampling/importance_sampling_ratio/mean": 0.9995672702789307, + "sampling/importance_sampling_ratio/min": 0.611591100692749, + "sampling/sampling_logp_difference/max": 0.49169135093688965, + "sampling/sampling_logp_difference/mean": 0.014270318672060966, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 145.765625, + "completions/mean_terminated_length": 145.765625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.348491370677948, + "epoch": 0.1663716814159292, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2147250909188214, + "kl": 0.013375984504818916, + "learning_rate": 8.230088495575221e-07, + "loss": -0.0192, + "num_tokens": 2145574.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.6011326313018799, + "sampling/importance_sampling_ratio/mean": 1.0004305839538574, + "sampling/importance_sampling_ratio/min": 0.6399555802345276, + "sampling/sampling_logp_difference/max": 0.47071123123168945, + "sampling/sampling_logp_difference/mean": 0.013981970958411694, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 142.953125, + "completions/mean_terminated_length": 142.953125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.5088372230529785, + "epoch": 0.168141592920354, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.067385432800198, + "kl": 0.031087420880794525, + "learning_rate": 8.318584070796459e-07, + "loss": -0.0101, + "num_tokens": 2166899.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.5077775716781616, + "sampling/importance_sampling_ratio/mean": 1.0007333755493164, + "sampling/importance_sampling_ratio/min": 0.6623175740242004, + "sampling/sampling_logp_difference/max": 0.4120100736618042, + "sampling/sampling_logp_difference/mean": 0.018186533823609352, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 133.59375, + "completions/mean_terminated_length": 133.59375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.30815327167510986, + "epoch": 0.16991150442477876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027201587791542785, + "kl": 0.00946621410548687, + "learning_rate": 8.407079646017698e-07, + "loss": 0.0001, + "num_tokens": 2184889.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6088849306106567, + "sampling/importance_sampling_ratio/mean": 1.0007243156433105, + "sampling/importance_sampling_ratio/min": 0.7547110915184021, + "sampling/sampling_logp_difference/max": 0.475541353225708, + "sampling/sampling_logp_difference/mean": 0.013283636420965195, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 129.203125, + "completions/mean_terminated_length": 129.203125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.32512176036834717, + "epoch": 0.17168141592920355, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04247265994389906, + "kl": 0.0198502354323864, + "learning_rate": 8.495575221238938e-07, + "loss": 0.0002, + "num_tokens": 2202150.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4340442419052124, + "sampling/importance_sampling_ratio/mean": 1.0001094341278076, + "sampling/importance_sampling_ratio/min": 0.6144447922706604, + "sampling/sampling_logp_difference/max": 0.48703622817993164, + "sampling/sampling_logp_difference/mean": 0.01574341207742691, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 183.109375, + "completions/mean_terminated_length": 183.109375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.614513635635376, + "epoch": 0.17345132743362832, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3652957419468377, + "kl": 0.042070478200912476, + "learning_rate": 8.584070796460177e-07, + "loss": -0.0243, + "num_tokens": 2226765.0, + "reward": 0.40625, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.3663080930709839, + "sampling/importance_sampling_ratio/mean": 1.0011334419250488, + "sampling/importance_sampling_ratio/min": 0.6851910948753357, + "sampling/sampling_logp_difference/max": 0.37805747985839844, + "sampling/sampling_logp_difference/mean": 0.018913637846708298, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 230.0, + "completions/max_terminated_length": 230.0, + "completions/mean_length": 118.1875, + "completions/mean_terminated_length": 118.1875, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.4186111390590668, + "epoch": 0.1752212389380531, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3455830789305978, + "kl": 0.037338901311159134, + "learning_rate": 8.672566371681415e-07, + "loss": -0.0109, + "num_tokens": 2246089.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4058585166931152, + "sampling/importance_sampling_ratio/mean": 0.999924898147583, + "sampling/importance_sampling_ratio/min": 0.5805544257164001, + "sampling/sampling_logp_difference/max": 0.5437717437744141, + "sampling/sampling_logp_difference/mean": 0.01750396192073822, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 144.515625, + "completions/mean_terminated_length": 144.515625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.3664934039115906, + "epoch": 0.17699115044247787, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04605670211334364, + "kl": 0.02497912012040615, + "learning_rate": 8.761061946902655e-07, + "loss": 0.0003, + "num_tokens": 2270730.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4137628078460693, + "sampling/importance_sampling_ratio/mean": 0.9990532398223877, + "sampling/importance_sampling_ratio/min": 0.6056217551231384, + "sampling/sampling_logp_difference/max": 0.5014996528625488, + "sampling/sampling_logp_difference/mean": 0.016384344547986984, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 209.09375, + "completions/mean_terminated_length": 209.09375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.5220569968223572, + "epoch": 0.17876106194690267, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2159113837907307, + "kl": 0.03031090274453163, + "learning_rate": 8.849557522123894e-07, + "loss": -0.0172, + "num_tokens": 2295472.0, + "reward": 0.53125, + "reward_std": 0.48935678601264954, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.475376844406128, + "sampling/importance_sampling_ratio/mean": 0.9996536374092102, + "sampling/importance_sampling_ratio/min": 0.6090261936187744, + "sampling/sampling_logp_difference/max": 0.4958939552307129, + "sampling/sampling_logp_difference/mean": 0.016138719394803047, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 140.734375, + "completions/mean_terminated_length": 140.734375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.4087064266204834, + "epoch": 0.18053097345132743, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037537008599160135, + "kl": 0.029635798186063766, + "learning_rate": 8.938053097345132e-07, + "loss": 0.0003, + "num_tokens": 2313967.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3540775775909424, + "sampling/importance_sampling_ratio/mean": 0.9989540576934814, + "sampling/importance_sampling_ratio/min": 0.6092417240142822, + "sampling/sampling_logp_difference/max": 0.49554014205932617, + "sampling/sampling_logp_difference/mean": 0.01638820394873619, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 128.625, + "completions/mean_terminated_length": 128.625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.33473706245422363, + "epoch": 0.18230088495575222, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2249577525011914, + "kl": 0.014623328112065792, + "learning_rate": 9.026548672566371e-07, + "loss": -0.0065, + "num_tokens": 2332871.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.5468659400939941, + "sampling/importance_sampling_ratio/mean": 1.0005148649215698, + "sampling/importance_sampling_ratio/min": 0.6942964196205139, + "sampling/sampling_logp_difference/max": 0.4362308979034424, + "sampling/sampling_logp_difference/mean": 0.015089668333530426, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 140.796875, + "completions/mean_terminated_length": 140.796875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.4268108010292053, + "epoch": 0.184070796460177, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2764446322401908, + "kl": 0.015488414093852043, + "learning_rate": 9.11504424778761e-07, + "loss": -0.0196, + "num_tokens": 2351674.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.5834494829177856, + "sampling/importance_sampling_ratio/mean": 1.0005009174346924, + "sampling/importance_sampling_ratio/min": 0.663982093334198, + "sampling/sampling_logp_difference/max": 0.45960569381713867, + "sampling/sampling_logp_difference/mean": 0.01634242758154869, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 174.4375, + "completions/mean_terminated_length": 174.4375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.507774829864502, + "epoch": 0.18584070796460178, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3069282947095495, + "kl": 0.03400026261806488, + "learning_rate": 9.203539823008849e-07, + "loss": -0.0386, + "num_tokens": 2377846.0, + "reward": 0.15625, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.294175148010254, + "sampling/importance_sampling_ratio/mean": 0.99942946434021, + "sampling/importance_sampling_ratio/min": 0.6771621704101562, + "sampling/sampling_logp_difference/max": 0.3898444175720215, + "sampling/sampling_logp_difference/mean": 0.016538560390472412, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 160.34375, + "completions/mean_terminated_length": 160.34375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.5758728384971619, + "epoch": 0.18761061946902655, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.665599068136189, + "kl": 0.026462240144610405, + "learning_rate": 9.292035398230088e-07, + "loss": -0.0373, + "num_tokens": 2401244.0, + "reward": 0.53125, + "reward_std": 0.6331988573074341, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.3962644338607788, + "sampling/importance_sampling_ratio/mean": 0.9994282126426697, + "sampling/importance_sampling_ratio/min": 0.6452757120132446, + "sampling/sampling_logp_difference/max": 0.4380776882171631, + "sampling/sampling_logp_difference/mean": 0.019216172397136688, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 138.71875, + "completions/mean_terminated_length": 138.71875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.40617284178733826, + "epoch": 0.18938053097345134, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5116903926994538, + "kl": 0.015507981181144714, + "learning_rate": 9.380530973451328e-07, + "loss": 0.0271, + "num_tokens": 2421866.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4085325002670288, + "sampling/importance_sampling_ratio/mean": 0.9999234080314636, + "sampling/importance_sampling_ratio/min": 0.7436591982841492, + "sampling/sampling_logp_difference/max": 0.3425483703613281, + "sampling/sampling_logp_difference/mean": 0.0145273357629776, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 155.375, + "completions/mean_terminated_length": 155.375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.4087454676628113, + "epoch": 0.1911504424778761, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.454546734150439, + "kl": 0.05462752655148506, + "learning_rate": 9.469026548672566e-07, + "loss": 0.0008, + "num_tokens": 2442706.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4575304985046387, + "sampling/importance_sampling_ratio/mean": 0.9992321729660034, + "sampling/importance_sampling_ratio/min": 0.6291640996932983, + "sampling/sampling_logp_difference/max": 0.4633631706237793, + "sampling/sampling_logp_difference/mean": 0.01630711928009987, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 153.5625, + "completions/mean_terminated_length": 153.5625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.4511426091194153, + "epoch": 0.1929203539823009, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024483318744972813, + "kl": 0.011083774268627167, + "learning_rate": 9.557522123893805e-07, + "loss": 0.0001, + "num_tokens": 2463062.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3670463562011719, + "sampling/importance_sampling_ratio/mean": 0.9995465874671936, + "sampling/importance_sampling_ratio/min": 0.6154676079750061, + "sampling/sampling_logp_difference/max": 0.48537302017211914, + "sampling/sampling_logp_difference/mean": 0.017112568020820618, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 139.921875, + "completions/mean_terminated_length": 139.921875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.4308427572250366, + "epoch": 0.19469026548672566, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5095584664072077, + "kl": 0.01657198928296566, + "learning_rate": 9.646017699115042e-07, + "loss": -0.0318, + "num_tokens": 2482145.0, + "reward": 0.5, + "reward_std": 0.4472135901451111, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6176989078521729, + "sampling/importance_sampling_ratio/mean": 1.0003668069839478, + "sampling/importance_sampling_ratio/min": 0.459940105676651, + "sampling/sampling_logp_difference/max": 0.7766590118408203, + "sampling/sampling_logp_difference/mean": 0.016544148325920105, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 193.328125, + "completions/mean_terminated_length": 193.328125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.5080606937408447, + "epoch": 0.19646017699115045, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7919745122322969, + "kl": 0.0173199363052845, + "learning_rate": 9.734513274336282e-07, + "loss": -0.0041, + "num_tokens": 2508614.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4097967147827148, + "sampling/importance_sampling_ratio/mean": 0.9998339414596558, + "sampling/importance_sampling_ratio/min": 0.6026744246482849, + "sampling/sampling_logp_difference/max": 0.506378173828125, + "sampling/sampling_logp_difference/mean": 0.017503349110484123, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 164.0, + "completions/mean_terminated_length": 164.0, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.49331408739089966, + "epoch": 0.19823008849557522, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3789700100106055, + "kl": 0.010955605655908585, + "learning_rate": 9.82300884955752e-07, + "loss": 0.018, + "num_tokens": 2537686.0, + "reward": 0.375, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.6250979900360107, + "sampling/importance_sampling_ratio/mean": 1.0000886917114258, + "sampling/importance_sampling_ratio/min": 0.7136193513870239, + "sampling/sampling_logp_difference/max": 0.4855680465698242, + "sampling/sampling_logp_difference/mean": 0.017850980162620544, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 142.25, + "completions/mean_terminated_length": 142.25, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.4632031321525574, + "epoch": 0.2, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1494826839551227, + "kl": 0.011163798160851002, + "learning_rate": 9.91150442477876e-07, + "loss": -0.0053, + "num_tokens": 2561142.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.5633403062820435, + "sampling/importance_sampling_ratio/mean": 1.000186800956726, + "sampling/importance_sampling_ratio/min": 0.6731029748916626, + "sampling/sampling_logp_difference/max": 0.4468247890472412, + "sampling/sampling_logp_difference/mean": 0.017129603773355484, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 242.0, + "completions/max_terminated_length": 242.0, + "completions/mean_length": 132.609375, + "completions/mean_terminated_length": 132.609375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.5000830292701721, + "epoch": 0.20176991150442478, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1237492563637732, + "kl": 0.011804342269897461, + "learning_rate": 1e-06, + "loss": -0.0065, + "num_tokens": 2582461.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.5152255296707153, + "sampling/importance_sampling_ratio/mean": 1.000805139541626, + "sampling/importance_sampling_ratio/min": 0.7269524931907654, + "sampling/sampling_logp_difference/max": 0.41556429862976074, + "sampling/sampling_logp_difference/mean": 0.01942405290901661, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 174.765625, + "completions/mean_terminated_length": 174.765625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.42178767919540405, + "epoch": 0.20353982300884957, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0131540511399926, + "kl": 0.0098259337246418, + "learning_rate": 9.99997614400677e-07, + "loss": 0.012, + "num_tokens": 2604750.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.4035134315490723, + "sampling/importance_sampling_ratio/mean": 1.0000475645065308, + "sampling/importance_sampling_ratio/min": 0.6080347299575806, + "sampling/sampling_logp_difference/max": 0.49752330780029297, + "sampling/sampling_logp_difference/mean": 0.016035035252571106, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 172.96875, + "completions/mean_terminated_length": 172.96875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.5256979465484619, + "epoch": 0.20530973451327433, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8396764659600636, + "kl": 0.01159391738474369, + "learning_rate": 9.999904576254724e-07, + "loss": 0.0023, + "num_tokens": 2630236.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.5071685314178467, + "sampling/importance_sampling_ratio/mean": 1.0004163980484009, + "sampling/importance_sampling_ratio/min": 0.7611066699028015, + "sampling/sampling_logp_difference/max": 0.4102327823638916, + "sampling/sampling_logp_difference/mean": 0.017749082297086716, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 181.6875, + "completions/mean_terminated_length": 181.6875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.5166543126106262, + "epoch": 0.20707964601769913, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1997542527548868, + "kl": 0.013240222819149494, + "learning_rate": 9.999785297426788e-07, + "loss": 0.0383, + "num_tokens": 2653560.0, + "reward": 0.46875, + "reward_std": 0.3723389506340027, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.3042196035385132, + "sampling/importance_sampling_ratio/mean": 1.0007864236831665, + "sampling/importance_sampling_ratio/min": 0.692559540271759, + "sampling/sampling_logp_difference/max": 0.36736106872558594, + "sampling/sampling_logp_difference/mean": 0.017134547233581543, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 158.15625, + "completions/mean_terminated_length": 158.15625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.5524478554725647, + "epoch": 0.2088495575221239, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.385869401627046, + "kl": 0.012404483743011951, + "learning_rate": 9.999618308661168e-07, + "loss": -0.0238, + "num_tokens": 2674610.0, + "reward": 0.71875, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.329261302947998, + "sampling/importance_sampling_ratio/mean": 1.0006515979766846, + "sampling/importance_sampling_ratio/min": 0.7105941772460938, + "sampling/sampling_logp_difference/max": 0.34165382385253906, + "sampling/sampling_logp_difference/mean": 0.01848817989230156, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 128.484375, + "completions/mean_terminated_length": 128.484375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.39519980549812317, + "epoch": 0.21061946902654868, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1890170889233096, + "kl": 0.0123123899102211, + "learning_rate": 9.99940361155134e-07, + "loss": 0.0021, + "num_tokens": 2692049.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.2881121635437012, + "sampling/importance_sampling_ratio/mean": 0.9997977018356323, + "sampling/importance_sampling_ratio/min": 0.743664026260376, + "sampling/sampling_logp_difference/max": 0.29616594314575195, + "sampling/sampling_logp_difference/mean": 0.01698697730898857, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 208.015625, + "completions/mean_terminated_length": 208.015625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.6560627222061157, + "epoch": 0.21238938053097345, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3285645147361667, + "kl": 0.016548465937376022, + "learning_rate": 9.999141208146027e-07, + "loss": 0.0569, + "num_tokens": 2717218.0, + "reward": 0.0, + "reward_std": 0.5123475193977356, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.531611680984497, + "sampling/importance_sampling_ratio/mean": 1.0004523992538452, + "sampling/importance_sampling_ratio/min": 0.7322604656219482, + "sampling/sampling_logp_difference/max": 0.42632055282592773, + "sampling/sampling_logp_difference/mean": 0.01894541271030903, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 158.265625, + "completions/mean_terminated_length": 158.265625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.5611162185668945, + "epoch": 0.21415929203539824, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4886446751012414, + "kl": 0.016358420252799988, + "learning_rate": 9.998831100949186e-07, + "loss": 0.0003, + "num_tokens": 2741459.0, + "reward": 0.875, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.425057053565979, + "sampling/importance_sampling_ratio/mean": 0.9999103546142578, + "sampling/importance_sampling_ratio/min": 0.5411344170570374, + "sampling/sampling_logp_difference/max": 0.6140875816345215, + "sampling/sampling_logp_difference/mean": 0.018663184717297554, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 660.0, + "completions/max_terminated_length": 660.0, + "completions/mean_length": 158.65625, + "completions/mean_terminated_length": 158.65625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.43928951025009155, + "epoch": 0.215929203539823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01645904357050317, + "kl": 0.009950288571417332, + "learning_rate": 9.998473292919985e-07, + "loss": 0.0001, + "num_tokens": 2764541.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3009637594223022, + "sampling/importance_sampling_ratio/mean": 1.0002142190933228, + "sampling/importance_sampling_ratio/min": 0.6546957492828369, + "sampling/sampling_logp_difference/max": 0.4235846996307373, + "sampling/sampling_logp_difference/mean": 0.015278320759534836, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 194.0, + "completions/max_terminated_length": 194.0, + "completions/mean_length": 120.984375, + "completions/mean_terminated_length": 120.984375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.2946837544441223, + "epoch": 0.2176991150442478, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02478017919018473, + "kl": 0.009915519505739212, + "learning_rate": 9.99806778747277e-07, + "loss": 0.0001, + "num_tokens": 2781916.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4346206188201904, + "sampling/importance_sampling_ratio/mean": 0.9996424913406372, + "sampling/importance_sampling_ratio/min": 0.635346531867981, + "sampling/sampling_logp_difference/max": 0.4535846710205078, + "sampling/sampling_logp_difference/mean": 0.014531348831951618, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 212.0, + "completions/max_terminated_length": 212.0, + "completions/mean_length": 107.0, + "completions/mean_terminated_length": 107.0, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.29190003871917725, + "epoch": 0.21946902654867256, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028728227613374267, + "kl": 0.010412806645035744, + "learning_rate": 9.997614588477033e-07, + "loss": 0.0001, + "num_tokens": 2798876.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.421050786972046, + "sampling/importance_sampling_ratio/mean": 1.0006955862045288, + "sampling/importance_sampling_ratio/min": 0.7137504816055298, + "sampling/sampling_logp_difference/max": 0.3513965606689453, + "sampling/sampling_logp_difference/mean": 0.014932672493159771, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 181.015625, + "completions/mean_terminated_length": 181.015625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.34129270911216736, + "epoch": 0.22123893805309736, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0047889755198065, + "kl": 0.011963277123868465, + "learning_rate": 9.99711370025738e-07, + "loss": 0.0003, + "num_tokens": 2820333.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.2962805032730103, + "sampling/importance_sampling_ratio/mean": 1.0000866651535034, + "sampling/importance_sampling_ratio/min": 0.6254509687423706, + "sampling/sampling_logp_difference/max": 0.4692823886871338, + "sampling/sampling_logp_difference/mean": 0.012929601594805717, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 175.125, + "completions/mean_terminated_length": 175.125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.37770137190818787, + "epoch": 0.22300884955752212, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0151443363078754, + "kl": 0.01518353633582592, + "learning_rate": 9.996565127593489e-07, + "loss": -0.0039, + "num_tokens": 2841221.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.5573248863220215, + "sampling/importance_sampling_ratio/mean": 1.000090479850769, + "sampling/importance_sampling_ratio/min": 0.6955075860023499, + "sampling/sampling_logp_difference/max": 0.44296956062316895, + "sampling/sampling_logp_difference/mean": 0.014279832132160664, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 125.375, + "completions/mean_terminated_length": 125.375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.38288789987564087, + "epoch": 0.2247787610619469, + "frac_reward_zero_std": 0.75, + "grad_norm": 9.291574908812418, + "kl": 0.017631027847528458, + "learning_rate": 9.995968875720051e-07, + "loss": -0.0187, + "num_tokens": 2864157.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.442091703414917, + "sampling/importance_sampling_ratio/mean": 0.9988166093826294, + "sampling/importance_sampling_ratio/min": 0.6271333694458008, + "sampling/sampling_logp_difference/max": 0.4665961265563965, + "sampling/sampling_logp_difference/mean": 0.015197532251477242, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 185.109375, + "completions/mean_terminated_length": 185.109375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.3877907395362854, + "epoch": 0.22654867256637168, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02394435841726851, + "kl": 0.01672307401895523, + "learning_rate": 9.995324950326745e-07, + "loss": 0.0002, + "num_tokens": 2887348.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6528104543685913, + "sampling/importance_sampling_ratio/mean": 0.9994519352912903, + "sampling/importance_sampling_ratio/min": 0.7085780501365662, + "sampling/sampling_logp_difference/max": 0.5024771690368652, + "sampling/sampling_logp_difference/mean": 0.01464095525443554, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 388.0, + "completions/max_terminated_length": 388.0, + "completions/mean_length": 170.84375, + "completions/mean_terminated_length": 170.84375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.4916332960128784, + "epoch": 0.22831858407079647, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03916647784036259, + "kl": 0.031004594638943672, + "learning_rate": 9.994633357558158e-07, + "loss": 0.0003, + "num_tokens": 2909434.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2818790674209595, + "sampling/importance_sampling_ratio/mean": 0.9994707107543945, + "sampling/importance_sampling_ratio/min": 0.6706785559654236, + "sampling/sampling_logp_difference/max": 0.39946532249450684, + "sampling/sampling_logp_difference/mean": 0.01647157035768032, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 156.40625, + "completions/mean_terminated_length": 156.40625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.46675926446914673, + "epoch": 0.23008849557522124, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1741257957068456, + "kl": 0.021152405068278313, + "learning_rate": 9.993894104013746e-07, + "loss": 0.0574, + "num_tokens": 2929812.0, + "reward": -0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.2836991548538208, + "sampling/importance_sampling_ratio/mean": 0.9995951652526855, + "sampling/importance_sampling_ratio/min": 0.6450011134147644, + "sampling/sampling_logp_difference/max": 0.4385032653808594, + "sampling/sampling_logp_difference/mean": 0.01770133152604103, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 154.4375, + "completions/mean_terminated_length": 154.4375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.33106446266174316, + "epoch": 0.23185840707964603, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027089062065130332, + "kl": 0.020202994346618652, + "learning_rate": 9.993107196747758e-07, + "loss": 0.0002, + "num_tokens": 2949408.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5536561012268066, + "sampling/importance_sampling_ratio/mean": 1.0000596046447754, + "sampling/importance_sampling_ratio/min": 0.7092294692993164, + "sampling/sampling_logp_difference/max": 0.4406108856201172, + "sampling/sampling_logp_difference/mean": 0.013444620184600353, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 189.640625, + "completions/mean_terminated_length": 189.640625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.49644505977630615, + "epoch": 0.2336283185840708, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8487458344539385, + "kl": 0.02899659052491188, + "learning_rate": 9.99227264326918e-07, + "loss": 0.0148, + "num_tokens": 2973593.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.4009907245635986, + "sampling/importance_sampling_ratio/mean": 1.0004339218139648, + "sampling/importance_sampling_ratio/min": 0.697740912437439, + "sampling/sampling_logp_difference/max": 0.3599073886871338, + "sampling/sampling_logp_difference/mean": 0.015961207449436188, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 171.0625, + "completions/mean_terminated_length": 171.0625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.523189902305603, + "epoch": 0.23539823008849559, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8880329497640999, + "kl": 0.025794755667448044, + "learning_rate": 9.991390451541648e-07, + "loss": -0.0173, + "num_tokens": 2998269.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.4689908027648926, + "sampling/importance_sampling_ratio/mean": 0.9997839331626892, + "sampling/importance_sampling_ratio/min": 0.65727299451828, + "sampling/sampling_logp_difference/max": 0.4196559190750122, + "sampling/sampling_logp_difference/mean": 0.017466718330979347, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 213.234375, + "completions/mean_terminated_length": 213.234375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.5031307935714722, + "epoch": 0.23716814159292035, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1444489279085417, + "kl": 0.02678334340453148, + "learning_rate": 9.990460629983388e-07, + "loss": 0.0087, + "num_tokens": 3023724.0, + "reward": 0.1875, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.4857287406921387, + "sampling/importance_sampling_ratio/mean": 1.0004141330718994, + "sampling/importance_sampling_ratio/min": 0.6298375129699707, + "sampling/sampling_logp_difference/max": 0.4622933864593506, + "sampling/sampling_logp_difference/mean": 0.016972266137599945, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 872.0, + "completions/max_terminated_length": 872.0, + "completions/mean_length": 162.078125, + "completions/mean_terminated_length": 162.078125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.35991179943084717, + "epoch": 0.23893805309734514, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028513651900094868, + "kl": 0.02142981067299843, + "learning_rate": 9.989483187467125e-07, + "loss": 0.0002, + "num_tokens": 3045377.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.535014033317566, + "sampling/importance_sampling_ratio/mean": 1.0001251697540283, + "sampling/importance_sampling_ratio/min": 0.6358522176742554, + "sampling/sampling_logp_difference/max": 0.4527890682220459, + "sampling/sampling_logp_difference/mean": 0.014851902611553669, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 223.0, + "completions/max_terminated_length": 223.0, + "completions/mean_length": 121.625, + "completions/mean_terminated_length": 121.625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.31467437744140625, + "epoch": 0.2407079646017699, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05676225492585991, + "kl": 0.025293543934822083, + "learning_rate": 9.988458133320008e-07, + "loss": 0.0003, + "num_tokens": 3062985.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4338220357894897, + "sampling/importance_sampling_ratio/mean": 1.0003433227539062, + "sampling/importance_sampling_ratio/min": 0.6482194662094116, + "sampling/sampling_logp_difference/max": 0.4335259199142456, + "sampling/sampling_logp_difference/mean": 0.015467900782823563, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 691.0, + "completions/max_terminated_length": 691.0, + "completions/mean_length": 246.1875, + "completions/mean_terminated_length": 246.1875, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.45606034994125366, + "epoch": 0.2424778761061947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027817480301703744, + "kl": 0.028994332998991013, + "learning_rate": 9.987385477323506e-07, + "loss": 0.0003, + "num_tokens": 3088773.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.558082938194275, + "sampling/importance_sampling_ratio/mean": 0.999538242816925, + "sampling/importance_sampling_ratio/min": 0.6333794593811035, + "sampling/sampling_logp_difference/max": 0.45668554306030273, + "sampling/sampling_logp_difference/mean": 0.014899208210408688, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 159.015625, + "completions/mean_terminated_length": 159.015625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.31140589714050293, + "epoch": 0.24424778761061947, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2857330417792834, + "kl": 0.023789769038558006, + "learning_rate": 9.98626522971333e-07, + "loss": -0.012, + "num_tokens": 3110774.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.6011077165603638, + "sampling/importance_sampling_ratio/mean": 0.9999802708625793, + "sampling/importance_sampling_ratio/min": 0.6121524572372437, + "sampling/sampling_logp_difference/max": 0.49077391624450684, + "sampling/sampling_logp_difference/mean": 0.01341700367629528, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 179.328125, + "completions/mean_terminated_length": 179.328125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.3433701694011688, + "epoch": 0.24601769911504426, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036298222737904895, + "kl": 0.02790701389312744, + "learning_rate": 9.985097401179333e-07, + "loss": 0.0003, + "num_tokens": 3132299.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4854481220245361, + "sampling/importance_sampling_ratio/mean": 0.9998148679733276, + "sampling/importance_sampling_ratio/min": 0.6218048930168152, + "sampling/sampling_logp_difference/max": 0.4751288890838623, + "sampling/sampling_logp_difference/mean": 0.014352495782077312, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1260.0, + "completions/max_terminated_length": 1260.0, + "completions/mean_length": 224.28125, + "completions/mean_terminated_length": 224.28125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.3062925934791565, + "epoch": 0.24778761061946902, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03537521382370573, + "kl": 0.02674679271876812, + "learning_rate": 9.98388200286539e-07, + "loss": 0.0003, + "num_tokens": 3156413.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4351998567581177, + "sampling/importance_sampling_ratio/mean": 0.9995108842849731, + "sampling/importance_sampling_ratio/min": 0.6957627534866333, + "sampling/sampling_logp_difference/max": 0.3627464771270752, + "sampling/sampling_logp_difference/mean": 0.012646778486669064, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 815.0, + "completions/max_terminated_length": 815.0, + "completions/mean_length": 258.265625, + "completions/mean_terminated_length": 258.265625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.4568946957588196, + "epoch": 0.24955752212389382, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03188772400383102, + "kl": 0.03366459906101227, + "learning_rate": 9.98261904636932e-07, + "loss": 0.0003, + "num_tokens": 3183054.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.374427080154419, + "sampling/importance_sampling_ratio/mean": 0.9999855756759644, + "sampling/importance_sampling_ratio/min": 0.6346470713615417, + "sampling/sampling_logp_difference/max": 0.45468616485595703, + "sampling/sampling_logp_difference/mean": 0.014311404898762703, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 614.0, + "completions/max_terminated_length": 614.0, + "completions/mean_length": 288.890625, + "completions/mean_terminated_length": 288.890625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.5986075401306152, + "epoch": 0.2513274336283186, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9051967590407283, + "kl": 0.04546056687831879, + "learning_rate": 9.981308543742756e-07, + "loss": -0.0179, + "num_tokens": 3213959.0, + "reward": 0.0, + "reward_std": 0.5, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5933138132095337, + "sampling/importance_sampling_ratio/mean": 1.0001044273376465, + "sampling/importance_sampling_ratio/min": 0.6030287146568298, + "sampling/sampling_logp_difference/max": 0.5057904720306396, + "sampling/sampling_logp_difference/mean": 0.01623942330479622, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 311.0, + "completions/max_terminated_length": 311.0, + "completions/mean_length": 139.03125, + "completions/mean_terminated_length": 139.03125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.3082438111305237, + "epoch": 0.25309734513274335, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0346740695433099, + "kl": 0.026018641889095306, + "learning_rate": 9.979950507491033e-07, + "loss": 0.0003, + "num_tokens": 3232889.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4102617502212524, + "sampling/importance_sampling_ratio/mean": 1.000148892402649, + "sampling/importance_sampling_ratio/min": 0.7250264286994934, + "sampling/sampling_logp_difference/max": 0.34377527236938477, + "sampling/sampling_logp_difference/mean": 0.013314444571733475, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 154.265625, + "completions/mean_terminated_length": 154.265625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.34074559807777405, + "epoch": 0.25486725663716814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03860775273382935, + "kl": 0.03532181680202484, + "learning_rate": 9.978544950573073e-07, + "loss": 0.0004, + "num_tokens": 3251242.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5971453189849854, + "sampling/importance_sampling_ratio/mean": 1.0001463890075684, + "sampling/importance_sampling_ratio/min": 0.6404089331626892, + "sampling/sampling_logp_difference/max": 0.4682178497314453, + "sampling/sampling_logp_difference/mean": 0.013817005790770054, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 147.515625, + "completions/mean_terminated_length": 147.515625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.2873741686344147, + "epoch": 0.25663716814159293, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04067253581570143, + "kl": 0.03148189187049866, + "learning_rate": 9.97709188640126e-07, + "loss": 0.0003, + "num_tokens": 3271099.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2917137145996094, + "sampling/importance_sampling_ratio/mean": 0.9992789626121521, + "sampling/importance_sampling_ratio/min": 0.616651713848114, + "sampling/sampling_logp_difference/max": 0.48345088958740234, + "sampling/sampling_logp_difference/mean": 0.012774511240422726, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 201.03125, + "completions/mean_terminated_length": 201.03125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.4325287938117981, + "epoch": 0.2584070796460177, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8993299382546053, + "kl": 0.03885094076395035, + "learning_rate": 9.975591328841304e-07, + "loss": 0.0169, + "num_tokens": 3296477.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.537389874458313, + "sampling/importance_sampling_ratio/mean": 0.9997760057449341, + "sampling/importance_sampling_ratio/min": 0.729432225227356, + "sampling/sampling_logp_difference/max": 0.4300861358642578, + "sampling/sampling_logp_difference/mean": 0.014719918370246887, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 166.1875, + "completions/mean_terminated_length": 166.1875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.4421290159225464, + "epoch": 0.26017699115044246, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1663039993812925, + "kl": 0.03346528485417366, + "learning_rate": 9.974043292212127e-07, + "loss": -0.0257, + "num_tokens": 3324201.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.5876429080963135, + "sampling/importance_sampling_ratio/mean": 0.9990295171737671, + "sampling/importance_sampling_ratio/min": 0.694132924079895, + "sampling/sampling_logp_difference/max": 0.4622504711151123, + "sampling/sampling_logp_difference/mean": 0.016207978129386902, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 288.78125, + "completions/mean_terminated_length": 288.78125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.4689440131187439, + "epoch": 0.26194690265486725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027072815911706842, + "kl": 0.03730308637022972, + "learning_rate": 9.97244779128571e-07, + "loss": 0.0004, + "num_tokens": 3355563.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3829669952392578, + "sampling/importance_sampling_ratio/mean": 1.0002778768539429, + "sampling/importance_sampling_ratio/min": 0.6509962677955627, + "sampling/sampling_logp_difference/max": 0.42925143241882324, + "sampling/sampling_logp_difference/mean": 0.014328066259622574, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 225.9375, + "completions/mean_terminated_length": 225.9375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.3772192597389221, + "epoch": 0.26371681415929205, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02721464347010123, + "kl": 0.027017677202820778, + "learning_rate": 9.970804841286953e-07, + "loss": 0.0003, + "num_tokens": 3380599.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4368870258331299, + "sampling/importance_sampling_ratio/mean": 1.0000723600387573, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.45119690895080566, + "sampling/sampling_logp_difference/mean": 0.01404891163110733, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 250.78125, + "completions/mean_terminated_length": 250.78125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.408069372177124, + "epoch": 0.26548672566371684, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024763661217230747, + "kl": 0.03320477902889252, + "learning_rate": 9.969114457893539e-07, + "loss": 0.0003, + "num_tokens": 3409225.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5792803764343262, + "sampling/importance_sampling_ratio/mean": 0.9997915029525757, + "sampling/importance_sampling_ratio/min": 0.6956390738487244, + "sampling/sampling_logp_difference/max": 0.4569692611694336, + "sampling/sampling_logp_difference/mean": 0.013673271983861923, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 200.765625, + "completions/mean_terminated_length": 200.765625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.4479539096355438, + "epoch": 0.2672566371681416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03492702919427098, + "kl": 0.040304169058799744, + "learning_rate": 9.967376657235778e-07, + "loss": 0.0005, + "num_tokens": 3432458.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.511489748954773, + "sampling/importance_sampling_ratio/mean": 1.0001468658447266, + "sampling/importance_sampling_ratio/min": 0.5509209036827087, + "sampling/sampling_logp_difference/max": 0.5961639881134033, + "sampling/sampling_logp_difference/mean": 0.016600418835878372, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 233.75, + "completions/mean_terminated_length": 233.75, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.40500524640083313, + "epoch": 0.26902654867256637, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03434521781959229, + "kl": 0.0346105583012104, + "learning_rate": 9.965591455896455e-07, + "loss": 0.0004, + "num_tokens": 3459130.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2980811595916748, + "sampling/importance_sampling_ratio/mean": 1.000051736831665, + "sampling/importance_sampling_ratio/min": 0.6824501752853394, + "sampling/sampling_logp_difference/max": 0.3820657730102539, + "sampling/sampling_logp_difference/mean": 0.014134477823972702, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1221.0, + "completions/max_terminated_length": 1221.0, + "completions/mean_length": 354.90625, + "completions/mean_terminated_length": 354.90625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.3681276738643646, + "epoch": 0.27079646017699116, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01678495946536908, + "kl": 0.02303595468401909, + "learning_rate": 9.96375887091067e-07, + "loss": 0.0002, + "num_tokens": 3494548.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.343514323234558, + "sampling/importance_sampling_ratio/mean": 0.9999706149101257, + "sampling/importance_sampling_ratio/min": 0.6474516987800598, + "sampling/sampling_logp_difference/max": 0.4347110986709595, + "sampling/sampling_logp_difference/mean": 0.011611346155405045, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 301.46875, + "completions/mean_terminated_length": 301.46875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.35046854615211487, + "epoch": 0.27256637168141595, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018301977976502796, + "kl": 0.018390771001577377, + "learning_rate": 9.961878919765677e-07, + "loss": 0.0002, + "num_tokens": 3527010.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6247203350067139, + "sampling/importance_sampling_ratio/mean": 1.0001691579818726, + "sampling/importance_sampling_ratio/min": 0.49258702993392944, + "sampling/sampling_logp_difference/max": 0.7080841064453125, + "sampling/sampling_logp_difference/mean": 0.011635717004537582, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 190.40625, + "completions/mean_terminated_length": 190.40625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.3786317706108093, + "epoch": 0.2743362831858407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03390067600192146, + "kl": 0.03351679816842079, + "learning_rate": 9.959951620400718e-07, + "loss": 0.0004, + "num_tokens": 3548684.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3179739713668823, + "sampling/importance_sampling_ratio/mean": 1.0002996921539307, + "sampling/importance_sampling_ratio/min": 0.6940809488296509, + "sampling/sampling_logp_difference/max": 0.36516666412353516, + "sampling/sampling_logp_difference/mean": 0.014952241443097591, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 289.421875, + "completions/mean_terminated_length": 289.421875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.41793277859687805, + "epoch": 0.2761061946902655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027633166062037313, + "kl": 0.03265716880559921, + "learning_rate": 9.957976991206845e-07, + "loss": 0.0003, + "num_tokens": 3578055.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5194100141525269, + "sampling/importance_sampling_ratio/mean": 0.999330997467041, + "sampling/importance_sampling_ratio/min": 0.6532416343688965, + "sampling/sampling_logp_difference/max": 0.4258081912994385, + "sampling/sampling_logp_difference/mean": 0.01384137012064457, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 219.671875, + "completions/mean_terminated_length": 219.671875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.4456171989440918, + "epoch": 0.2778761061946903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03839060018604204, + "kl": 0.0445060208439827, + "learning_rate": 9.955955051026758e-07, + "loss": 0.0005, + "num_tokens": 3604290.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4353132247924805, + "sampling/importance_sampling_ratio/mean": 0.9999475479125977, + "sampling/importance_sampling_ratio/min": 0.6802029609680176, + "sampling/sampling_logp_difference/max": 0.3853640556335449, + "sampling/sampling_logp_difference/mean": 0.014434573240578175, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 235.515625, + "completions/mean_terminated_length": 235.515625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.3188089430332184, + "epoch": 0.27964601769911507, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020726454533559042, + "kl": 0.020972244441509247, + "learning_rate": 9.953885819154614e-07, + "loss": 0.0002, + "num_tokens": 3630067.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5071592330932617, + "sampling/importance_sampling_ratio/mean": 1.000205159187317, + "sampling/importance_sampling_ratio/min": 0.639909029006958, + "sampling/sampling_logp_difference/max": 0.4464292526245117, + "sampling/sampling_logp_difference/mean": 0.011311270296573639, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 239.65625, + "completions/mean_terminated_length": 239.65625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.588887631893158, + "epoch": 0.2814159292035398, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038805979135392424, + "kl": 0.05581683665513992, + "learning_rate": 9.951769315335843e-07, + "loss": 0.0006, + "num_tokens": 3656461.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.280115008354187, + "sampling/importance_sampling_ratio/mean": 1.0008742809295654, + "sampling/importance_sampling_ratio/min": 0.6223615407943726, + "sampling/sampling_logp_difference/max": 0.47423410415649414, + "sampling/sampling_logp_difference/mean": 0.01711086928844452, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1006.0, + "completions/max_terminated_length": 1006.0, + "completions/mean_length": 227.390625, + "completions/mean_terminated_length": 227.390625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.4441239535808563, + "epoch": 0.2831858407079646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03132778531647549, + "kl": 0.04597689211368561, + "learning_rate": 9.949605559766967e-07, + "loss": 0.0005, + "num_tokens": 3682422.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3805807828903198, + "sampling/importance_sampling_ratio/mean": 1.0000271797180176, + "sampling/importance_sampling_ratio/min": 0.6719688773155212, + "sampling/sampling_logp_difference/max": 0.3975433111190796, + "sampling/sampling_logp_difference/mean": 0.014202705584466457, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 233.9375, + "completions/mean_terminated_length": 233.9375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.4425801634788513, + "epoch": 0.2849557522123894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026878682207162892, + "kl": 0.03890957683324814, + "learning_rate": 9.947394573095402e-07, + "loss": 0.0004, + "num_tokens": 3708082.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2759809494018555, + "sampling/importance_sampling_ratio/mean": 0.9999120235443115, + "sampling/importance_sampling_ratio/min": 0.6183002591133118, + "sampling/sampling_logp_difference/max": 0.48078107833862305, + "sampling/sampling_logp_difference/mean": 0.014461169950664043, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 154.203125, + "completions/mean_terminated_length": 154.203125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.367497980594635, + "epoch": 0.2867256637168142, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030844164307948486, + "kl": 0.031070904806256294, + "learning_rate": 9.945136376419258e-07, + "loss": 0.0004, + "num_tokens": 3728895.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4351998567581177, + "sampling/importance_sampling_ratio/mean": 1.0001007318496704, + "sampling/importance_sampling_ratio/min": 0.6359556317329407, + "sampling/sampling_logp_difference/max": 0.45262646675109863, + "sampling/sampling_logp_difference/mean": 0.015953868627548218, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 205.703125, + "completions/mean_terminated_length": 205.703125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.448856383562088, + "epoch": 0.2884955752212389, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05840902340127153, + "kl": 0.040665335953235626, + "learning_rate": 9.942830991287149e-07, + "loss": 0.0004, + "num_tokens": 3755004.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3960000276565552, + "sampling/importance_sampling_ratio/mean": 1.0006427764892578, + "sampling/importance_sampling_ratio/min": 0.5062552094459534, + "sampling/sampling_logp_difference/max": 0.6807143688201904, + "sampling/sampling_logp_difference/mean": 0.015489346347749233, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 169.0625, + "completions/mean_terminated_length": 169.0625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.36270537972450256, + "epoch": 0.2902654867256637, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030222286248721437, + "kl": 0.028047749772667885, + "learning_rate": 9.940478439697972e-07, + "loss": 0.0003, + "num_tokens": 3775056.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3460074663162231, + "sampling/importance_sampling_ratio/mean": 0.9998701810836792, + "sampling/importance_sampling_ratio/min": 0.6988232135772705, + "sampling/sampling_logp_difference/max": 0.35835742950439453, + "sampling/sampling_logp_difference/mean": 0.014132875949144363, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 216.078125, + "completions/mean_terminated_length": 216.078125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.46945494413375854, + "epoch": 0.2920353982300885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03360911814839701, + "kl": 0.04232267290353775, + "learning_rate": 9.93807874410071e-07, + "loss": 0.0005, + "num_tokens": 3801413.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.5725798606872559, + "sampling/importance_sampling_ratio/mean": 0.9995458126068115, + "sampling/importance_sampling_ratio/min": 0.6771509051322937, + "sampling/sampling_logp_difference/max": 0.45271754264831543, + "sampling/sampling_logp_difference/mean": 0.014892792329192162, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 176.703125, + "completions/mean_terminated_length": 176.703125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "entropy": 0.4789275527000427, + "epoch": 0.2938053097345133, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03239150066842321, + "kl": 0.04120933637022972, + "learning_rate": 9.935631927394214e-07, + "loss": 0.0005, + "num_tokens": 3824354.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3178095817565918, + "sampling/importance_sampling_ratio/mean": 0.9998352527618408, + "sampling/importance_sampling_ratio/min": 0.6385133862495422, + "sampling/sampling_logp_difference/max": 0.44861268997192383, + "sampling/sampling_logp_difference/mean": 0.01687219925224781, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1385.0, + "completions/max_terminated_length": 1385.0, + "completions/mean_length": 220.671875, + "completions/mean_terminated_length": 220.671875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.3237878978252411, + "epoch": 0.29557522123893804, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020555933492004103, + "kl": 0.019533012062311172, + "learning_rate": 9.93313801292698e-07, + "loss": 0.0002, + "num_tokens": 3848093.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4228062629699707, + "sampling/importance_sampling_ratio/mean": 0.999441385269165, + "sampling/importance_sampling_ratio/min": 0.6771497130393982, + "sampling/sampling_logp_difference/max": 0.38986289501190186, + "sampling/sampling_logp_difference/mean": 0.012553451582789421, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 205.359375, + "completions/mean_terminated_length": 205.359375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.26872894167900085, + "epoch": 0.2973451327433628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022728436780581552, + "kl": 0.017619963735342026, + "learning_rate": 9.93059702449693e-07, + "loss": 0.0002, + "num_tokens": 3870868.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5467859506607056, + "sampling/importance_sampling_ratio/mean": 0.9995690584182739, + "sampling/importance_sampling_ratio/min": 0.6017630100250244, + "sampling/sampling_logp_difference/max": 0.5078915357589722, + "sampling/sampling_logp_difference/mean": 0.012646839022636414, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 141.390625, + "completions/mean_terminated_length": 141.390625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.34829992055892944, + "epoch": 0.2991150442477876, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2453504342625805, + "kl": 0.029450297355651855, + "learning_rate": 9.928008986351186e-07, + "loss": -0.0426, + "num_tokens": 3890445.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.3386385440826416, + "sampling/importance_sampling_ratio/mean": 0.9994571208953857, + "sampling/importance_sampling_ratio/min": 0.6298378109931946, + "sampling/sampling_logp_difference/max": 0.4622929096221924, + "sampling/sampling_logp_difference/mean": 0.014471322298049927, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 212.671875, + "completions/mean_terminated_length": 212.671875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.5321623086929321, + "epoch": 0.3008849557522124, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8178777238789101, + "kl": 0.04331979900598526, + "learning_rate": 9.925373923185834e-07, + "loss": 0.0202, + "num_tokens": 3918168.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.2948603630065918, + "sampling/importance_sampling_ratio/mean": 1.000131368637085, + "sampling/importance_sampling_ratio/min": 0.6646633148193359, + "sampling/sampling_logp_difference/max": 0.4084746837615967, + "sampling/sampling_logp_difference/mean": 0.01630018651485443, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1472.0, + "completions/max_terminated_length": 1472.0, + "completions/mean_length": 246.5625, + "completions/mean_terminated_length": 246.5625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.43632301688194275, + "epoch": 0.30265486725663715, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020989156876186344, + "kl": 0.03128629922866821, + "learning_rate": 9.922691860145696e-07, + "loss": 0.0003, + "num_tokens": 3949628.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6568189859390259, + "sampling/importance_sampling_ratio/mean": 1.0005426406860352, + "sampling/importance_sampling_ratio/min": 0.5483725070953369, + "sampling/sampling_logp_difference/max": 0.6008005142211914, + "sampling/sampling_logp_difference/mean": 0.015076635405421257, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 200.453125, + "completions/mean_terminated_length": 200.453125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.4134460985660553, + "epoch": 0.30442477876106194, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12538438339130348, + "kl": 0.0488395132124424, + "learning_rate": 9.919962822824083e-07, + "loss": 0.0006, + "num_tokens": 3978249.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5698164701461792, + "sampling/importance_sampling_ratio/mean": 0.9996939897537231, + "sampling/importance_sampling_ratio/min": 0.665626049041748, + "sampling/sampling_logp_difference/max": 0.4509587287902832, + "sampling/sampling_logp_difference/mean": 0.015557809732854366, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 106.59375, + "completions/mean_terminated_length": 106.59375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.28162699937820435, + "epoch": 0.30619469026548674, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033040203096103796, + "kl": 0.01680285856127739, + "learning_rate": 9.91718683726255e-07, + "loss": 0.0002, + "num_tokens": 3994175.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6337953805923462, + "sampling/importance_sampling_ratio/mean": 0.9994305968284607, + "sampling/importance_sampling_ratio/min": 0.6063137650489807, + "sampling/sampling_logp_difference/max": 0.5003576278686523, + "sampling/sampling_logp_difference/mean": 0.015085892751812935, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.0, + "completions/max_terminated_length": 618.0, + "completions/mean_length": 226.046875, + "completions/mean_terminated_length": 226.046875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.4104277491569519, + "epoch": 0.30796460176991153, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023710154914067024, + "kl": 0.030415624380111694, + "learning_rate": 9.914363929950657e-07, + "loss": 0.0004, + "num_tokens": 4018834.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3802553415298462, + "sampling/importance_sampling_ratio/mean": 1.0001755952835083, + "sampling/importance_sampling_ratio/min": 0.48663073778152466, + "sampling/sampling_logp_difference/max": 0.7202496528625488, + "sampling/sampling_logp_difference/mean": 0.015630293637514114, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 131.171875, + "completions/mean_terminated_length": 131.171875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.2812592387199402, + "epoch": 0.30973451327433627, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031217325718960937, + "kl": 0.019437432289123535, + "learning_rate": 9.91149412782571e-07, + "loss": 0.0002, + "num_tokens": 4035165.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.293818473815918, + "sampling/importance_sampling_ratio/mean": 1.0000535249710083, + "sampling/importance_sampling_ratio/min": 0.6472192406654358, + "sampling/sampling_logp_difference/max": 0.4350701570510864, + "sampling/sampling_logp_difference/mean": 0.012308983132243156, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 224.953125, + "completions/mean_terminated_length": 224.953125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.4822311997413635, + "epoch": 0.31150442477876106, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030771598024009236, + "kl": 0.0388898141682148, + "learning_rate": 9.908577458272495e-07, + "loss": 0.0004, + "num_tokens": 4061178.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3889068365097046, + "sampling/importance_sampling_ratio/mean": 0.9999488592147827, + "sampling/importance_sampling_ratio/min": 0.6047934889793396, + "sampling/sampling_logp_difference/max": 0.5028681755065918, + "sampling/sampling_logp_difference/mean": 0.015290592797100544, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 146.4375, + "completions/mean_terminated_length": 146.4375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.33842039108276367, + "epoch": 0.31327433628318585, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027339872557214075, + "kl": 0.017229732125997543, + "learning_rate": 9.905613949123034e-07, + "loss": 0.0002, + "num_tokens": 4082726.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999748706817627, + "sampling/importance_sampling_ratio/min": 0.7109527587890625, + "sampling/sampling_logp_difference/max": 0.7022933959960938, + "sampling/sampling_logp_difference/mean": 0.01365036703646183, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 228.984375, + "completions/mean_terminated_length": 228.984375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.4279431700706482, + "epoch": 0.31504424778761064, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028186699292072136, + "kl": 0.03191818296909332, + "learning_rate": 9.902603628656311e-07, + "loss": 0.0004, + "num_tokens": 4107733.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4215264320373535, + "sampling/importance_sampling_ratio/mean": 0.9996159672737122, + "sampling/importance_sampling_ratio/min": 0.6781244874000549, + "sampling/sampling_logp_difference/max": 0.3884243965148926, + "sampling/sampling_logp_difference/mean": 0.013241034932434559, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 175.734375, + "completions/mean_terminated_length": 175.734375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.3895474672317505, + "epoch": 0.3168141592920354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04318810553249402, + "kl": 0.025557177141308784, + "learning_rate": 9.899546525597997e-07, + "loss": 0.0003, + "num_tokens": 4129508.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5860437154769897, + "sampling/importance_sampling_ratio/mean": 1.000396966934204, + "sampling/importance_sampling_ratio/min": 0.6368714570999146, + "sampling/sampling_logp_difference/max": 0.46124267578125, + "sampling/sampling_logp_difference/mean": 0.016088902950286865, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 196.296875, + "completions/mean_terminated_length": 196.296875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.4673101305961609, + "epoch": 0.3185840707964602, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042519648945525956, + "kl": 0.04119706153869629, + "learning_rate": 9.896442669120187e-07, + "loss": 0.0004, + "num_tokens": 4154183.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4919874668121338, + "sampling/importance_sampling_ratio/mean": 1.0002926588058472, + "sampling/importance_sampling_ratio/min": 0.6402009725570679, + "sampling/sampling_logp_difference/max": 0.44597315788269043, + "sampling/sampling_logp_difference/mean": 0.0154157355427742, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 168.359375, + "completions/mean_terminated_length": 168.359375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.3967672884464264, + "epoch": 0.32035398230088497, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027599792756827137, + "kl": 0.0274306982755661, + "learning_rate": 9.893292088841108e-07, + "loss": 0.0003, + "num_tokens": 4174158.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.361751914024353, + "sampling/importance_sampling_ratio/mean": 1.0000026226043701, + "sampling/importance_sampling_ratio/min": 0.6568920612335205, + "sampling/sampling_logp_difference/max": 0.4202355146408081, + "sampling/sampling_logp_difference/mean": 0.015374141745269299, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.0, + "completions/max_terminated_length": 696.0, + "completions/mean_length": 280.921875, + "completions/mean_terminated_length": 280.921875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.46510568261146545, + "epoch": 0.32212389380530976, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11045246901719807, + "kl": 0.039309095591306686, + "learning_rate": 9.890094814824852e-07, + "loss": 0.0004, + "num_tokens": 4204857.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3266682624816895, + "sampling/importance_sampling_ratio/mean": 0.9999687671661377, + "sampling/importance_sampling_ratio/min": 0.6524989604949951, + "sampling/sampling_logp_difference/max": 0.42694568634033203, + "sampling/sampling_logp_difference/mean": 0.015028094872832298, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 119.25, + "completions/mean_terminated_length": 119.25, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.3483262062072754, + "epoch": 0.3238938053097345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02743467795406971, + "kl": 0.024161968380212784, + "learning_rate": 9.886850877581078e-07, + "loss": 0.0003, + "num_tokens": 4223289.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6014540195465088, + "sampling/importance_sampling_ratio/mean": 0.999658465385437, + "sampling/importance_sampling_ratio/min": 0.623564600944519, + "sampling/sampling_logp_difference/max": 0.4723029136657715, + "sampling/sampling_logp_difference/mean": 0.015557544305920601, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 139.265625, + "completions/mean_terminated_length": 139.265625, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.36477214097976685, + "epoch": 0.3256637168141593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.032238574152394166, + "kl": 0.02369370311498642, + "learning_rate": 9.883560308064722e-07, + "loss": 0.0003, + "num_tokens": 4242426.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6076874732971191, + "sampling/importance_sampling_ratio/mean": 1.0006036758422852, + "sampling/importance_sampling_ratio/min": 0.7465273141860962, + "sampling/sampling_logp_difference/max": 0.47479677200317383, + "sampling/sampling_logp_difference/mean": 0.015426398254930973, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 190.4375, + "completions/mean_terminated_length": 190.4375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.37622520327568054, + "epoch": 0.3274336283185841, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02247795274545629, + "kl": 0.019452273845672607, + "learning_rate": 9.880223137675707e-07, + "loss": 0.0003, + "num_tokens": 4264630.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3681427240371704, + "sampling/importance_sampling_ratio/mean": 0.9998056292533875, + "sampling/importance_sampling_ratio/min": 0.6489084362983704, + "sampling/sampling_logp_difference/max": 0.4324636459350586, + "sampling/sampling_logp_difference/mean": 0.015204913914203644, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 138.71875, + "completions/mean_terminated_length": 138.71875, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.28751200437545776, + "epoch": 0.3292035398230089, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023669913508627505, + "kl": 0.013852202333509922, + "learning_rate": 9.876839398258639e-07, + "loss": 0.0001, + "num_tokens": 4284660.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.573148488998413, + "sampling/importance_sampling_ratio/mean": 1.000110387802124, + "sampling/importance_sampling_ratio/min": 0.6071493029594421, + "sampling/sampling_logp_difference/max": 0.4989805221557617, + "sampling/sampling_logp_difference/mean": 0.014854800887405872, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 146.140625, + "completions/mean_terminated_length": 146.140625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.3645159900188446, + "epoch": 0.3309734513274336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11880412522046792, + "kl": 0.027670051902532578, + "learning_rate": 9.873409122102503e-07, + "loss": 0.0004, + "num_tokens": 4304429.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4720406532287598, + "sampling/importance_sampling_ratio/mean": 0.999610185623169, + "sampling/importance_sampling_ratio/min": 0.6360495090484619, + "sampling/sampling_logp_difference/max": 0.45247888565063477, + "sampling/sampling_logp_difference/mean": 0.01561033632606268, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 271.203125, + "completions/mean_terminated_length": 271.203125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.3917270600795746, + "epoch": 0.3327433628318584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02590847844041066, + "kl": 0.02397763356566429, + "learning_rate": 9.869932341940358e-07, + "loss": 0.0003, + "num_tokens": 4332778.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.608748435974121, + "sampling/importance_sampling_ratio/mean": 1.0001976490020752, + "sampling/importance_sampling_ratio/min": 0.6262629628181458, + "sampling/sampling_logp_difference/max": 0.47545647621154785, + "sampling/sampling_logp_difference/mean": 0.013658429495990276, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 145.6875, + "completions/mean_terminated_length": 145.6875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.23473447561264038, + "epoch": 0.3345132743362832, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020008689326622402, + "kl": 0.011209280230104923, + "learning_rate": 9.86640909094902e-07, + "loss": 0.0001, + "num_tokens": 4352374.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3670518398284912, + "sampling/importance_sampling_ratio/mean": 0.9996951818466187, + "sampling/importance_sampling_ratio/min": 0.6023116111755371, + "sampling/sampling_logp_difference/max": 0.5069804191589355, + "sampling/sampling_logp_difference/mean": 0.011350013315677643, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 295.09375, + "completions/mean_terminated_length": 295.09375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.46837174892425537, + "epoch": 0.336283185840708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02787908959420259, + "kl": 0.026032259687781334, + "learning_rate": 9.862839402748753e-07, + "loss": 0.0003, + "num_tokens": 4382476.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.405807375907898, + "sampling/importance_sampling_ratio/mean": 1.0000739097595215, + "sampling/importance_sampling_ratio/min": 0.771156907081604, + "sampling/sampling_logp_difference/max": 0.3406118154525757, + "sampling/sampling_logp_difference/mean": 0.014217447489500046, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 170.859375, + "completions/mean_terminated_length": 170.859375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.2878764271736145, + "epoch": 0.3380530973451327, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01399220170190771, + "kl": 0.010474582202732563, + "learning_rate": 9.859223311402936e-07, + "loss": 0.0001, + "num_tokens": 4403603.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.526659607887268, + "sampling/importance_sampling_ratio/mean": 0.9999752640724182, + "sampling/importance_sampling_ratio/min": 0.6080242991447449, + "sampling/sampling_logp_difference/max": 0.4975404739379883, + "sampling/sampling_logp_difference/mean": 0.012195511721074581, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 164.671875, + "completions/mean_terminated_length": 164.671875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.4259641766548157, + "epoch": 0.3398230088495575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02764650026709833, + "kl": 0.023068472743034363, + "learning_rate": 9.85556085141775e-07, + "loss": 0.0003, + "num_tokens": 4426350.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3869491815567017, + "sampling/importance_sampling_ratio/mean": 1.0000137090682983, + "sampling/importance_sampling_ratio/min": 0.6759111881256104, + "sampling/sampling_logp_difference/max": 0.3916935920715332, + "sampling/sampling_logp_difference/mean": 0.016428587958216667, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 178.3125, + "completions/mean_terminated_length": 178.3125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.3221898674964905, + "epoch": 0.3415929203539823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01953206362168495, + "kl": 0.01120888814330101, + "learning_rate": 9.851852057741844e-07, + "loss": 0.0001, + "num_tokens": 4449362.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.441954255104065, + "sampling/importance_sampling_ratio/mean": 1.0001592636108398, + "sampling/importance_sampling_ratio/min": 0.6792446970939636, + "sampling/sampling_logp_difference/max": 0.38677382469177246, + "sampling/sampling_logp_difference/mean": 0.013950981199741364, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 167.84375, + "completions/mean_terminated_length": 167.84375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.36951327323913574, + "epoch": 0.3433628318584071, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021651804598215342, + "kl": 0.02235507220029831, + "learning_rate": 9.848096965766002e-07, + "loss": 0.0003, + "num_tokens": 4470872.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5216445922851562, + "sampling/importance_sampling_ratio/mean": 0.9995793104171753, + "sampling/importance_sampling_ratio/min": 0.7712951898574829, + "sampling/sampling_logp_difference/max": 0.41979169845581055, + "sampling/sampling_logp_difference/mean": 0.014211702160537243, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 151.515625, + "completions/mean_terminated_length": 151.515625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.3271106481552124, + "epoch": 0.34513274336283184, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021102853279036434, + "kl": 0.015833543613553047, + "learning_rate": 9.844295611322803e-07, + "loss": 0.0002, + "num_tokens": 4490361.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2862383127212524, + "sampling/importance_sampling_ratio/mean": 0.9994319677352905, + "sampling/importance_sampling_ratio/min": 0.6054958701133728, + "sampling/sampling_logp_difference/max": 0.5017075538635254, + "sampling/sampling_logp_difference/mean": 0.013794454745948315, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 155.796875, + "completions/mean_terminated_length": 155.796875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.46761173009872437, + "epoch": 0.34690265486725663, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1197933032331047, + "kl": 0.02969934791326523, + "learning_rate": 9.84044803068628e-07, + "loss": 0.0277, + "num_tokens": 4510972.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.4048992395401, + "sampling/importance_sampling_ratio/mean": 1.0003435611724854, + "sampling/importance_sampling_ratio/min": 0.5983700156211853, + "sampling/sampling_logp_difference/max": 0.5135459899902344, + "sampling/sampling_logp_difference/mean": 0.017445165663957596, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 132.328125, + "completions/mean_terminated_length": 132.328125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.3574693500995636, + "epoch": 0.3486725663716814, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4380926813243147, + "kl": 0.022844795137643814, + "learning_rate": 9.836554260571577e-07, + "loss": -0.0285, + "num_tokens": 4530481.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.3352618217468262, + "sampling/importance_sampling_ratio/mean": 1.001255750656128, + "sampling/importance_sampling_ratio/min": 0.7138298153877258, + "sampling/sampling_logp_difference/max": 0.3371107578277588, + "sampling/sampling_logp_difference/mean": 0.014518964104354382, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 120.125, + "completions/mean_terminated_length": 120.125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.27148181200027466, + "epoch": 0.3504424778761062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025780608407040082, + "kl": 0.011116293258965015, + "learning_rate": 9.832614338134595e-07, + "loss": 0.0001, + "num_tokens": 4548233.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5538471937179565, + "sampling/importance_sampling_ratio/mean": 0.9993295669555664, + "sampling/importance_sampling_ratio/min": 0.6070836186408997, + "sampling/sampling_logp_difference/max": 0.49908876419067383, + "sampling/sampling_logp_difference/mean": 0.013770409859716892, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 908.0, + "completions/max_terminated_length": 908.0, + "completions/mean_length": 187.140625, + "completions/mean_terminated_length": 187.140625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.4195002615451813, + "epoch": 0.35221238938053095, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0162650481550141, + "kl": 0.016213098540902138, + "learning_rate": 9.828628300971638e-07, + "loss": 0.0002, + "num_tokens": 4571138.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3236145973205566, + "sampling/importance_sampling_ratio/mean": 1.0003173351287842, + "sampling/importance_sampling_ratio/min": 0.6771544814109802, + "sampling/sampling_logp_difference/max": 0.38985586166381836, + "sampling/sampling_logp_difference/mean": 0.01656009815633297, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 217.59375, + "completions/mean_terminated_length": 217.59375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.5768882036209106, + "epoch": 0.35398230088495575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03671326191022061, + "kl": 0.0406668446958065, + "learning_rate": 9.82459618711906e-07, + "loss": 0.0004, + "num_tokens": 4602744.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2956786155700684, + "sampling/importance_sampling_ratio/mean": 0.9998922348022461, + "sampling/importance_sampling_ratio/min": 0.7278984189033508, + "sampling/sampling_logp_difference/max": 0.3175938129425049, + "sampling/sampling_logp_difference/mean": 0.01754450425505638, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1157.0, + "completions/max_terminated_length": 1157.0, + "completions/mean_length": 278.609375, + "completions/mean_terminated_length": 278.609375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.5030993819236755, + "epoch": 0.35575221238938054, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034186990638514964, + "kl": 0.044528283178806305, + "learning_rate": 9.820518035052889e-07, + "loss": 0.0004, + "num_tokens": 4630511.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.298831582069397, + "sampling/importance_sampling_ratio/mean": 1.0000646114349365, + "sampling/importance_sampling_ratio/min": 0.6081597208976746, + "sampling/sampling_logp_difference/max": 0.4973177909851074, + "sampling/sampling_logp_difference/mean": 0.014819031581282616, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 253.03125, + "completions/mean_terminated_length": 253.03125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.5252817869186401, + "epoch": 0.35752212389380533, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029911273715772218, + "kl": 0.033255547285079956, + "learning_rate": 9.816393883688475e-07, + "loss": 0.0003, + "num_tokens": 4659553.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.310223937034607, + "sampling/importance_sampling_ratio/mean": 1.000415325164795, + "sampling/importance_sampling_ratio/min": 0.6752868294715881, + "sampling/sampling_logp_difference/max": 0.39261770248413086, + "sampling/sampling_logp_difference/mean": 0.016464397311210632, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 965.0, + "completions/max_terminated_length": 965.0, + "completions/mean_length": 235.78125, + "completions/mean_terminated_length": 235.78125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.43605995178222656, + "epoch": 0.35929203539823007, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0210552935025437, + "kl": 0.02404474839568138, + "learning_rate": 9.812223772380105e-07, + "loss": 0.0003, + "num_tokens": 4684563.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5271201133728027, + "sampling/importance_sampling_ratio/mean": 0.9989410042762756, + "sampling/importance_sampling_ratio/min": 0.6703864932060242, + "sampling/sampling_logp_difference/max": 0.4233837127685547, + "sampling/sampling_logp_difference/mean": 0.015572982840240002, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 610.0, + "completions/max_terminated_length": 610.0, + "completions/mean_length": 222.09375, + "completions/mean_terminated_length": 222.09375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.4209005832672119, + "epoch": 0.36106194690265486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02194375076571999, + "kl": 0.01922685280442238, + "learning_rate": 9.808007740920645e-07, + "loss": 0.0002, + "num_tokens": 4712121.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2821297645568848, + "sampling/importance_sampling_ratio/mean": 0.9998530745506287, + "sampling/importance_sampling_ratio/min": 0.6066128015518188, + "sampling/sampling_logp_difference/max": 0.4998645782470703, + "sampling/sampling_logp_difference/mean": 0.01416803803294897, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 581.0, + "completions/max_terminated_length": 581.0, + "completions/mean_length": 213.390625, + "completions/mean_terminated_length": 213.390625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.4472233057022095, + "epoch": 0.36283185840707965, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01741477807687026, + "kl": 0.01929214969277382, + "learning_rate": 9.803745829541137e-07, + "loss": 0.0002, + "num_tokens": 4738722.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.630118489265442, + "sampling/importance_sampling_ratio/mean": 1.0000805854797363, + "sampling/importance_sampling_ratio/min": 0.6256342530250549, + "sampling/sampling_logp_difference/max": 0.48865270614624023, + "sampling/sampling_logp_difference/mean": 0.01494203507900238, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 175.390625, + "completions/mean_terminated_length": 175.390625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.38506031036376953, + "epoch": 0.36460176991150445, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01890302236673626, + "kl": 0.017401982098817825, + "learning_rate": 9.799438078910432e-07, + "loss": 0.0002, + "num_tokens": 4760347.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.333805799484253, + "sampling/importance_sampling_ratio/mean": 0.9999526739120483, + "sampling/importance_sampling_ratio/min": 0.6824237108230591, + "sampling/sampling_logp_difference/max": 0.3821045160293579, + "sampling/sampling_logp_difference/mean": 0.014700263738632202, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 424.0, + "completions/max_terminated_length": 424.0, + "completions/mean_length": 150.484375, + "completions/mean_terminated_length": 150.484375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.43252402544021606, + "epoch": 0.3663716814159292, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.097902939114096, + "kl": 0.03045620024204254, + "learning_rate": 9.7950845301348e-07, + "loss": -0.004, + "num_tokens": 4780074.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.310570478439331, + "sampling/importance_sampling_ratio/mean": 1.0005415678024292, + "sampling/importance_sampling_ratio/min": 0.6203567385673523, + "sampling/sampling_logp_difference/max": 0.4774606227874756, + "sampling/sampling_logp_difference/mean": 0.01583504118025303, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 169.109375, + "completions/mean_terminated_length": 169.109375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.4405205249786377, + "epoch": 0.368141592920354, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1743411438430786, + "kl": 0.02028767019510269, + "learning_rate": 9.790685224757532e-07, + "loss": -0.0356, + "num_tokens": 4801233.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.442976951599121, + "sampling/importance_sampling_ratio/mean": 1.0004386901855469, + "sampling/importance_sampling_ratio/min": 0.7331960797309875, + "sampling/sampling_logp_difference/max": 0.36670827865600586, + "sampling/sampling_logp_difference/mean": 0.015310833230614662, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 117.25, + "completions/mean_terminated_length": 117.25, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.27622413635253906, + "epoch": 0.36991150442477877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016372609160945743, + "kl": 0.007270202971994877, + "learning_rate": 9.786240204758552e-07, + "loss": 0.0001, + "num_tokens": 4817809.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6354501247406006, + "sampling/importance_sampling_ratio/mean": 1.0009384155273438, + "sampling/importance_sampling_ratio/min": 0.6567115187644958, + "sampling/sampling_logp_difference/max": 0.49191808700561523, + "sampling/sampling_logp_difference/mean": 0.015505598857998848, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 231.4375, + "completions/mean_terminated_length": 231.4375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.543701171875, + "epoch": 0.37168141592920356, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7267322612573032, + "kl": 0.03129158169031143, + "learning_rate": 9.781749512553998e-07, + "loss": -0.0166, + "num_tokens": 4844973.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.5301965475082397, + "sampling/importance_sampling_ratio/mean": 0.9999191761016846, + "sampling/importance_sampling_ratio/min": 0.6165185570716858, + "sampling/sampling_logp_difference/max": 0.48366689682006836, + "sampling/sampling_logp_difference/mean": 0.016235269606113434, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 229.25, + "completions/mean_terminated_length": 229.25, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.5884172916412354, + "epoch": 0.3734513274336283, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.061016962198772, + "kl": 0.029968272894620895, + "learning_rate": 9.777213190995847e-07, + "loss": -0.0025, + "num_tokens": 4871629.0, + "reward": 0.34375, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.3039125204086304, + "sampling/importance_sampling_ratio/mean": 1.0002131462097168, + "sampling/importance_sampling_ratio/min": 0.6482402682304382, + "sampling/sampling_logp_difference/max": 0.43349385261535645, + "sampling/sampling_logp_difference/mean": 0.01739707589149475, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 180.640625, + "completions/mean_terminated_length": 180.640625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.4253998398780823, + "epoch": 0.3752212389380531, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.052056436755184, + "kl": 0.020112060010433197, + "learning_rate": 9.77263128337148e-07, + "loss": -0.0292, + "num_tokens": 4895190.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.4048014879226685, + "sampling/importance_sampling_ratio/mean": 0.9998510479927063, + "sampling/importance_sampling_ratio/min": 0.6613737344741821, + "sampling/sampling_logp_difference/max": 0.4134361743927002, + "sampling/sampling_logp_difference/mean": 0.014138556085526943, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 201.40625, + "completions/mean_terminated_length": 201.40625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.5402541756629944, + "epoch": 0.3769911504424779, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8347692127920789, + "kl": 0.029757630079984665, + "learning_rate": 9.768003833403276e-07, + "loss": -0.0094, + "num_tokens": 4920736.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6207294464111328, + "sampling/importance_sampling_ratio/mean": 0.9997462630271912, + "sampling/importance_sampling_ratio/min": 0.7541331052780151, + "sampling/sampling_logp_difference/max": 0.4828763008117676, + "sampling/sampling_logp_difference/mean": 0.016633976250886917, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 106.765625, + "completions/mean_terminated_length": 106.765625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.2815261483192444, + "epoch": 0.3787610619469027, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024846509969903882, + "kl": 0.010371927171945572, + "learning_rate": 9.763330885248204e-07, + "loss": 0.0001, + "num_tokens": 4937425.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6145673990249634, + "sampling/importance_sampling_ratio/mean": 1.0003533363342285, + "sampling/importance_sampling_ratio/min": 0.6623780131340027, + "sampling/sampling_logp_difference/max": 0.4790670871734619, + "sampling/sampling_logp_difference/mean": 0.0145414462313056, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 120.28125, + "completions/mean_terminated_length": 120.28125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.3886878192424774, + "epoch": 0.3805309734513274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023396212835991338, + "kl": 0.016547974199056625, + "learning_rate": 9.758612483497394e-07, + "loss": 0.0002, + "num_tokens": 4955539.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3992942571640015, + "sampling/importance_sampling_ratio/mean": 0.9997751116752625, + "sampling/importance_sampling_ratio/min": 0.5779934525489807, + "sampling/sampling_logp_difference/max": 0.5481927394866943, + "sampling/sampling_logp_difference/mean": 0.015263278037309647, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 162.65625, + "completions/mean_terminated_length": 162.65625, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.44779172539711, + "epoch": 0.3823008849557522, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0702160813909476, + "kl": 0.020305894315242767, + "learning_rate": 9.753848673175707e-07, + "loss": -0.018, + "num_tokens": 4977373.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.3271540403366089, + "sampling/importance_sampling_ratio/mean": 0.9998618960380554, + "sampling/importance_sampling_ratio/min": 0.7632419466972351, + "sampling/sampling_logp_difference/max": 0.28303682804107666, + "sampling/sampling_logp_difference/mean": 0.015994010493159294, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 153.21875, + "completions/mean_terminated_length": 153.21875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.30035877227783203, + "epoch": 0.384070796460177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.018376748811207013, + "kl": 0.009180797263979912, + "learning_rate": 9.74903949974131e-07, + "loss": 0.0001, + "num_tokens": 4997563.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3879265785217285, + "sampling/importance_sampling_ratio/mean": 0.9997693300247192, + "sampling/importance_sampling_ratio/min": 0.6757646203041077, + "sampling/sampling_logp_difference/max": 0.3919105529785156, + "sampling/sampling_logp_difference/mean": 0.01399917807430029, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 205.421875, + "completions/mean_terminated_length": 205.421875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.5094939470291138, + "epoch": 0.3858407079646018, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.16903239345118, + "kl": 0.031334519386291504, + "learning_rate": 9.744185009085256e-07, + "loss": -0.019, + "num_tokens": 5020982.0, + "reward": 0.40625, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.4352703094482422, + "sampling/importance_sampling_ratio/mean": 0.9993390440940857, + "sampling/importance_sampling_ratio/min": 0.61335289478302, + "sampling/sampling_logp_difference/max": 0.4888148307800293, + "sampling/sampling_logp_difference/mean": 0.016465526074171066, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 170.9375, + "completions/mean_terminated_length": 170.9375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.45526376366615295, + "epoch": 0.38761061946902653, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8736043720727277, + "kl": 0.016055453568696976, + "learning_rate": 9.739285247531017e-07, + "loss": -0.0153, + "num_tokens": 5042866.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4764906167984009, + "sampling/importance_sampling_ratio/mean": 1.0001682043075562, + "sampling/importance_sampling_ratio/min": 0.6263575553894043, + "sampling/sampling_logp_difference/max": 0.4678339958190918, + "sampling/sampling_logp_difference/mean": 0.017147481441497803, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 209.84375, + "completions/mean_terminated_length": 209.84375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.6200296878814697, + "epoch": 0.3893805309734513, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.371812364956784, + "kl": 0.030370794236660004, + "learning_rate": 9.734340261834066e-07, + "loss": 0.0509, + "num_tokens": 5067256.0, + "reward": 0.125, + "reward_std": 0.6494960784912109, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.2872438430786133, + "sampling/importance_sampling_ratio/mean": 0.9999788403511047, + "sampling/importance_sampling_ratio/min": 0.7144313454627991, + "sampling/sampling_logp_difference/max": 0.33626842498779297, + "sampling/sampling_logp_difference/mean": 0.017782989889383316, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 248.296875, + "completions/mean_terminated_length": 248.296875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.5655719041824341, + "epoch": 0.3911504424778761, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0377346363036783, + "kl": 0.021286573261022568, + "learning_rate": 9.729350099181419e-07, + "loss": 0.023, + "num_tokens": 5094875.0, + "reward": 0.8125, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.455339789390564, + "sampling/importance_sampling_ratio/mean": 1.0000135898590088, + "sampling/importance_sampling_ratio/min": 0.6379542946815491, + "sampling/sampling_logp_difference/max": 0.44948863983154297, + "sampling/sampling_logp_difference/mean": 0.017688971012830734, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 189.484375, + "completions/mean_terminated_length": 189.484375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.39400508999824524, + "epoch": 0.3929203539823009, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02233395690026934, + "kl": 0.014203056693077087, + "learning_rate": 9.724314807191196e-07, + "loss": 0.0002, + "num_tokens": 5118106.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7028963565826416, + "sampling/importance_sampling_ratio/mean": 1.0008316040039062, + "sampling/importance_sampling_ratio/min": 0.7066613435745239, + "sampling/sampling_logp_difference/max": 0.5323305130004883, + "sampling/sampling_logp_difference/mean": 0.01546834409236908, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 225.890625, + "completions/mean_terminated_length": 225.890625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.6610978841781616, + "epoch": 0.39469026548672564, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0932215259143516, + "kl": 0.02542734146118164, + "learning_rate": 9.719234433912146e-07, + "loss": 0.0067, + "num_tokens": 5145075.0, + "reward": 0.5625, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.2990854978561401, + "sampling/importance_sampling_ratio/mean": 0.9992256164550781, + "sampling/importance_sampling_ratio/min": 0.7171126008033752, + "sampling/sampling_logp_difference/max": 0.3325223922729492, + "sampling/sampling_logp_difference/mean": 0.0187995582818985, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 176.578125, + "completions/mean_terminated_length": 176.578125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.510978102684021, + "epoch": 0.39646017699115044, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.381758013357901, + "kl": 0.021535923704504967, + "learning_rate": 9.714109027823216e-07, + "loss": 0.011, + "num_tokens": 5167944.0, + "reward": 0.15625, + "reward_std": 0.42695626616477966, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.444840669631958, + "sampling/importance_sampling_ratio/mean": 1.0002617835998535, + "sampling/importance_sampling_ratio/min": 0.6666793823242188, + "sampling/sampling_logp_difference/max": 0.40544605255126953, + "sampling/sampling_logp_difference/mean": 0.016433410346508026, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 153.6875, + "completions/mean_terminated_length": 153.6875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.4001522362232208, + "epoch": 0.39823008849557523, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0646633808271555, + "kl": 0.01506091095507145, + "learning_rate": 9.708938637833064e-07, + "loss": -0.0094, + "num_tokens": 5187924.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5525848865509033, + "sampling/importance_sampling_ratio/mean": 1.0004353523254395, + "sampling/importance_sampling_ratio/min": 0.667425274848938, + "sampling/sampling_logp_difference/max": 0.43992114067077637, + "sampling/sampling_logp_difference/mean": 0.01390500832349062, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 206.1875, + "completions/mean_terminated_length": 206.1875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.49817371368408203, + "epoch": 0.4, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028349881888547018, + "kl": 0.01913132332265377, + "learning_rate": 9.703723313279605e-07, + "loss": 0.0002, + "num_tokens": 5211088.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2966861724853516, + "sampling/importance_sampling_ratio/mean": 1.000023365020752, + "sampling/importance_sampling_ratio/min": 0.6807541847229004, + "sampling/sampling_logp_difference/max": 0.38455402851104736, + "sampling/sampling_logp_difference/mean": 0.01603330299258232, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 259.0, + "completions/mean_terminated_length": 259.0, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.6391530632972717, + "epoch": 0.40176991150442476, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9967957194934567, + "kl": 0.02740185335278511, + "learning_rate": 9.698463103929541e-07, + "loss": 0.0385, + "num_tokens": 5238432.0, + "reward": 0.625, + "reward_std": 0.481805682182312, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.3534291982650757, + "sampling/importance_sampling_ratio/mean": 0.9993962049484253, + "sampling/importance_sampling_ratio/min": 0.6330663561820984, + "sampling/sampling_logp_difference/max": 0.4571800231933594, + "sampling/sampling_logp_difference/mean": 0.01765705645084381, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 282.03125, + "completions/mean_terminated_length": 282.03125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.4961310625076294, + "epoch": 0.40353982300884955, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0505587189342032, + "kl": 0.019192662090063095, + "learning_rate": 9.693158059977877e-07, + "loss": 0.002, + "num_tokens": 5267970.0, + "reward": 0.40625, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.8243242502212524, + "sampling/importance_sampling_ratio/mean": 1.0002517700195312, + "sampling/importance_sampling_ratio/min": 0.7685901522636414, + "sampling/sampling_logp_difference/max": 0.6012096405029297, + "sampling/sampling_logp_difference/mean": 0.014218084514141083, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 167.0, + "completions/max_terminated_length": 167.0, + "completions/mean_length": 108.84375, + "completions/mean_terminated_length": 108.84375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.2912164628505707, + "epoch": 0.40530973451327434, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03102505553320011, + "kl": 0.010266855359077454, + "learning_rate": 9.68780823204745e-07, + "loss": 0.0001, + "num_tokens": 5284424.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4392528533935547, + "sampling/importance_sampling_ratio/mean": 0.9990299940109253, + "sampling/importance_sampling_ratio/min": 0.6255216598510742, + "sampling/sampling_logp_difference/max": 0.46916937828063965, + "sampling/sampling_logp_difference/mean": 0.013452749699354172, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 176.546875, + "completions/mean_terminated_length": 176.546875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.37197035551071167, + "epoch": 0.40707964601769914, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01896041032859841, + "kl": 0.011751978658139706, + "learning_rate": 9.682413671188444e-07, + "loss": 0.0001, + "num_tokens": 5305819.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4121108055114746, + "sampling/importance_sampling_ratio/mean": 1.0002362728118896, + "sampling/importance_sampling_ratio/min": 0.7776780724525452, + "sampling/sampling_logp_difference/max": 0.34508562088012695, + "sampling/sampling_logp_difference/mean": 0.01318573672324419, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 269.203125, + "completions/mean_terminated_length": 269.203125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.5372804403305054, + "epoch": 0.4088495575221239, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.132039308346563, + "kl": 0.016992956399917603, + "learning_rate": 9.6769744288779e-07, + "loss": 0.0479, + "num_tokens": 5334968.0, + "reward": 0.65625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.5277607440948486, + "sampling/importance_sampling_ratio/mean": 0.9999881982803345, + "sampling/importance_sampling_ratio/min": 0.6976944804191589, + "sampling/sampling_logp_difference/max": 0.42380309104919434, + "sampling/sampling_logp_difference/mean": 0.015516486018896103, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 209.28125, + "completions/mean_terminated_length": 209.28125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.5396798253059387, + "epoch": 0.41061946902654867, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.209918698640942, + "kl": 0.026270480826497078, + "learning_rate": 9.671490557019233e-07, + "loss": -0.0009, + "num_tokens": 5360170.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.2983312606811523, + "sampling/importance_sampling_ratio/mean": 1.000602126121521, + "sampling/importance_sampling_ratio/min": 0.7697843909263611, + "sampling/sampling_logp_difference/max": 0.2616448402404785, + "sampling/sampling_logp_difference/mean": 0.016278911381959915, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 182.0625, + "completions/mean_terminated_length": 182.0625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.41270995140075684, + "epoch": 0.41238938053097346, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025328679594359024, + "kl": 0.016738593578338623, + "learning_rate": 9.665962107941724e-07, + "loss": 0.0002, + "num_tokens": 5381534.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6272636651992798, + "sampling/importance_sampling_ratio/mean": 0.9995510578155518, + "sampling/importance_sampling_ratio/min": 0.6961075067520142, + "sampling/sampling_logp_difference/max": 0.48689985275268555, + "sampling/sampling_logp_difference/mean": 0.013693327084183693, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 243.28125, + "completions/mean_terminated_length": 243.28125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.5390169620513916, + "epoch": 0.41415929203539825, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7296044095280566, + "kl": 0.025505583733320236, + "learning_rate": 9.660389134400033e-07, + "loss": 0.0166, + "num_tokens": 5408832.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3929054737091064, + "sampling/importance_sampling_ratio/mean": 1.0000765323638916, + "sampling/importance_sampling_ratio/min": 0.7323424816131592, + "sampling/sampling_logp_difference/max": 0.3313918113708496, + "sampling/sampling_logp_difference/mean": 0.015481807291507721, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 180.078125, + "completions/mean_terminated_length": 180.078125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.4292844533920288, + "epoch": 0.415929203539823, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9506863313827874, + "kl": 0.022041702643036842, + "learning_rate": 9.654771689573684e-07, + "loss": 0.0058, + "num_tokens": 5431029.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.275795340538025, + "sampling/importance_sampling_ratio/mean": 0.9993537664413452, + "sampling/importance_sampling_ratio/min": 0.695496678352356, + "sampling/sampling_logp_difference/max": 0.36312901973724365, + "sampling/sampling_logp_difference/mean": 0.013931536115705967, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 663.0, + "completions/max_terminated_length": 663.0, + "completions/mean_length": 155.3125, + "completions/mean_terminated_length": 155.3125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.3744187355041504, + "epoch": 0.4176991150442478, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0355842923875715, + "kl": 0.016530301421880722, + "learning_rate": 9.64910982706657e-07, + "loss": 0.0521, + "num_tokens": 5452345.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.53571355342865, + "sampling/importance_sampling_ratio/mean": 1.000366449356079, + "sampling/importance_sampling_ratio/min": 0.5138673186302185, + "sampling/sampling_logp_difference/max": 0.6657902002334595, + "sampling/sampling_logp_difference/mean": 0.01374361664056778, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 155.4375, + "completions/mean_terminated_length": 155.4375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.5032491087913513, + "epoch": 0.4194690265486726, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0318982236311647, + "kl": 0.025443322956562042, + "learning_rate": 9.643403600906432e-07, + "loss": -0.0148, + "num_tokens": 5471269.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.303486704826355, + "sampling/importance_sampling_ratio/mean": 1.0002458095550537, + "sampling/importance_sampling_ratio/min": 0.6822929978370667, + "sampling/sampling_logp_difference/max": 0.382296085357666, + "sampling/sampling_logp_difference/mean": 0.01743306592106819, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 126.453125, + "completions/mean_terminated_length": 126.453125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.4431364834308624, + "epoch": 0.42123893805309737, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3168846370828566, + "kl": 0.028479289263486862, + "learning_rate": 9.637653065544349e-07, + "loss": 0.0066, + "num_tokens": 5490434.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.4940837621688843, + "sampling/importance_sampling_ratio/mean": 1.00003981590271, + "sampling/importance_sampling_ratio/min": 0.7341160178184509, + "sampling/sampling_logp_difference/max": 0.40151309967041016, + "sampling/sampling_logp_difference/mean": 0.016477826982736588, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 174.71875, + "completions/mean_terminated_length": 174.71875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.48424744606018066, + "epoch": 0.4230088495575221, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0173106400671965, + "kl": 0.022911589592695236, + "learning_rate": 9.63185827585421e-07, + "loss": 0.005, + "num_tokens": 5511840.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.3995966911315918, + "sampling/importance_sampling_ratio/mean": 1.000849723815918, + "sampling/importance_sampling_ratio/min": 0.7160124182701111, + "sampling/sampling_logp_difference/max": 0.33618414402008057, + "sampling/sampling_logp_difference/mean": 0.015440179035067558, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 206.125, + "completions/mean_terminated_length": 206.125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.6587193608283997, + "epoch": 0.4247787610619469, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.191138425928516, + "kl": 0.03645382449030876, + "learning_rate": 9.6260192871322e-07, + "loss": -0.0358, + "num_tokens": 5537272.0, + "reward": 0.5625, + "reward_std": 0.5081988573074341, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.2708524465560913, + "sampling/importance_sampling_ratio/mean": 0.9996212124824524, + "sampling/importance_sampling_ratio/min": 0.645904004573822, + "sampling/sampling_logp_difference/max": 0.4371044635772705, + "sampling/sampling_logp_difference/mean": 0.018731512129306793, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 229.765625, + "completions/mean_terminated_length": 229.765625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.7126834392547607, + "epoch": 0.4265486725663717, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.2622471061996008, + "kl": 0.043451737612485886, + "learning_rate": 9.620136155096275e-07, + "loss": -0.0345, + "num_tokens": 5562217.0, + "reward": -0.03125, + "reward_std": 0.6683381795883179, + "rewards/decision_reward_func/mean": -0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.3758001327514648, + "sampling/importance_sampling_ratio/mean": 1.0001277923583984, + "sampling/importance_sampling_ratio/min": 0.7748174071311951, + "sampling/sampling_logp_difference/max": 0.31903553009033203, + "sampling/sampling_logp_difference/mean": 0.01953180506825447, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 206.3125, + "completions/mean_terminated_length": 206.3125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.5569666624069214, + "epoch": 0.4283185840707965, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2751258361314115, + "kl": 0.035392675548791885, + "learning_rate": 9.614208935885614e-07, + "loss": -0.0336, + "num_tokens": 5589597.0, + "reward": 0.34375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.5744760036468506, + "sampling/importance_sampling_ratio/mean": 1.0000823736190796, + "sampling/importance_sampling_ratio/min": 0.7300400137901306, + "sampling/sampling_logp_difference/max": 0.4539225101470947, + "sampling/sampling_logp_difference/mean": 0.016382835805416107, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 120.4375, + "completions/mean_terminated_length": 120.4375, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.4012613892555237, + "epoch": 0.4300884955752212, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0384114085462503, + "kl": 0.0251857191324234, + "learning_rate": 9.608237686060097e-07, + "loss": 0.0003, + "num_tokens": 5608681.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3672583103179932, + "sampling/importance_sampling_ratio/mean": 0.9998194575309753, + "sampling/importance_sampling_ratio/min": 0.6546944975852966, + "sampling/sampling_logp_difference/max": 0.4235866069793701, + "sampling/sampling_logp_difference/mean": 0.01646578684449196, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 175.6875, + "completions/mean_terminated_length": 175.6875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.39096441864967346, + "epoch": 0.431858407079646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026902015466041054, + "kl": 0.022231586277484894, + "learning_rate": 9.602222462599766e-07, + "loss": 0.0002, + "num_tokens": 5634789.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4702367782592773, + "sampling/importance_sampling_ratio/mean": 0.9999848008155823, + "sampling/importance_sampling_ratio/min": 0.6614747047424316, + "sampling/sampling_logp_difference/max": 0.4132835865020752, + "sampling/sampling_logp_difference/mean": 0.015026605688035488, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 134.453125, + "completions/mean_terminated_length": 134.453125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.30047836899757385, + "epoch": 0.4336283185840708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03992031362694212, + "kl": 0.017428256571292877, + "learning_rate": 9.596163322904269e-07, + "loss": 0.0002, + "num_tokens": 5652482.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3753678798675537, + "sampling/importance_sampling_ratio/mean": 0.999406099319458, + "sampling/importance_sampling_ratio/min": 0.6152034997940063, + "sampling/sampling_logp_difference/max": 0.48580217361450195, + "sampling/sampling_logp_difference/mean": 0.013884510844945908, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 193.453125, + "completions/mean_terminated_length": 193.453125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.4609413743019104, + "epoch": 0.4353982300884956, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0185164908305393, + "kl": 0.02759983018040657, + "learning_rate": 9.590060324792325e-07, + "loss": 0.0043, + "num_tokens": 5675487.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4057984352111816, + "sampling/importance_sampling_ratio/mean": 1.0006136894226074, + "sampling/importance_sampling_ratio/min": 0.6176036596298218, + "sampling/sampling_logp_difference/max": 0.48190832138061523, + "sampling/sampling_logp_difference/mean": 0.016024740412831306, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 162.203125, + "completions/mean_terminated_length": 162.203125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.4550291895866394, + "epoch": 0.43716814159292033, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.167626541533283, + "kl": 0.028106512501835823, + "learning_rate": 9.58391352650117e-07, + "loss": -0.0043, + "num_tokens": 5695932.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.2885998487472534, + "sampling/importance_sampling_ratio/mean": 1.0006885528564453, + "sampling/importance_sampling_ratio/min": 0.6220961213111877, + "sampling/sampling_logp_difference/max": 0.47466063499450684, + "sampling/sampling_logp_difference/mean": 0.016126839444041252, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 168.71875, + "completions/mean_terminated_length": 168.71875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.3887510299682617, + "epoch": 0.4389380530973451, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03398823727925125, + "kl": 0.026438862085342407, + "learning_rate": 9.57772298668599e-07, + "loss": 0.0003, + "num_tokens": 5717514.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.409028172492981, + "sampling/importance_sampling_ratio/mean": 1.0000361204147339, + "sampling/importance_sampling_ratio/min": 0.6638216972351074, + "sampling/sampling_logp_difference/max": 0.4097416400909424, + "sampling/sampling_logp_difference/mean": 0.01466484647244215, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 258.90625, + "completions/mean_terminated_length": 258.90625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.4102955162525177, + "epoch": 0.4407079646017699, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8652263137420526, + "kl": 0.026828434318304062, + "learning_rate": 9.57148876441938e-07, + "loss": -0.002, + "num_tokens": 5745332.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6272636651992798, + "sampling/importance_sampling_ratio/mean": 0.9999905824661255, + "sampling/importance_sampling_ratio/min": 0.6772826910018921, + "sampling/sampling_logp_difference/max": 0.48689985275268555, + "sampling/sampling_logp_difference/mean": 0.014956308528780937, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 189.125, + "completions/mean_terminated_length": 189.125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.46813422441482544, + "epoch": 0.4424778761061947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07185935544295184, + "kl": 0.04196876287460327, + "learning_rate": 9.565210919190763e-07, + "loss": 0.0004, + "num_tokens": 5773644.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3978229761123657, + "sampling/importance_sampling_ratio/mean": 0.9999117255210876, + "sampling/importance_sampling_ratio/min": 0.550255537033081, + "sampling/sampling_logp_difference/max": 0.5973725318908691, + "sampling/sampling_logp_difference/mean": 0.016827870160341263, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 205.0, + "completions/max_terminated_length": 205.0, + "completions/mean_length": 113.625, + "completions/mean_terminated_length": 113.625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.3235929012298584, + "epoch": 0.44424778761061945, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06097395303517226, + "kl": 0.023939600214362144, + "learning_rate": 9.558889510905835e-07, + "loss": 0.0003, + "num_tokens": 5794180.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6908003091812134, + "sampling/importance_sampling_ratio/mean": 1.0010063648223877, + "sampling/importance_sampling_ratio/min": 0.6060614585876465, + "sampling/sampling_logp_difference/max": 0.5252020359039307, + "sampling/sampling_logp_difference/mean": 0.01488159503787756, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 123.203125, + "completions/mean_terminated_length": 123.203125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.214959979057312, + "epoch": 0.44601769911504424, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04782722132271782, + "kl": 0.01646348088979721, + "learning_rate": 9.55252459988598e-07, + "loss": 0.0002, + "num_tokens": 5813233.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5760674476623535, + "sampling/importance_sampling_ratio/mean": 0.9988939166069031, + "sampling/importance_sampling_ratio/min": 0.5484436750411987, + "sampling/sampling_logp_difference/max": 0.6006706953048706, + "sampling/sampling_logp_difference/mean": 0.012746063992381096, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 164.203125, + "completions/mean_terminated_length": 164.203125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.32349807024002075, + "epoch": 0.44778761061946903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04171575141316936, + "kl": 0.028249233961105347, + "learning_rate": 9.546116246867713e-07, + "loss": 0.0003, + "num_tokens": 5834734.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5744246244430542, + "sampling/importance_sampling_ratio/mean": 0.9997957944869995, + "sampling/importance_sampling_ratio/min": 0.6231405735015869, + "sampling/sampling_logp_difference/max": 0.47298312187194824, + "sampling/sampling_logp_difference/mean": 0.013606461696326733, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 137.8125, + "completions/mean_terminated_length": 137.8125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.32213008403778076, + "epoch": 0.4495575221238938, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.004911966993259, + "kl": 0.03752923011779785, + "learning_rate": 9.539664513002084e-07, + "loss": -0.0278, + "num_tokens": 5854130.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3999723196029663, + "sampling/importance_sampling_ratio/mean": 0.9996242523193359, + "sampling/importance_sampling_ratio/min": 0.6813591122627258, + "sampling/sampling_logp_difference/max": 0.3836658000946045, + "sampling/sampling_logp_difference/mean": 0.014559498056769371, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 165.109375, + "completions/mean_terminated_length": 165.109375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.3359453082084656, + "epoch": 0.45132743362831856, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3108660255354827, + "kl": 0.031047837808728218, + "learning_rate": 9.533169459854098e-07, + "loss": -0.0249, + "num_tokens": 5874457.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6089236736297607, + "sampling/importance_sampling_ratio/mean": 0.9999319314956665, + "sampling/importance_sampling_ratio/min": 0.6133982539176941, + "sampling/sampling_logp_difference/max": 0.4887409210205078, + "sampling/sampling_logp_difference/mean": 0.014974427409470081, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 159.296875, + "completions/mean_terminated_length": 159.296875, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.29909294843673706, + "epoch": 0.45309734513274336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04973117091467245, + "kl": 0.029443148523569107, + "learning_rate": 9.526631149402134e-07, + "loss": 0.0003, + "num_tokens": 5894844.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4723871946334839, + "sampling/importance_sampling_ratio/mean": 1.0001782178878784, + "sampling/importance_sampling_ratio/min": 0.6331398487091064, + "sampling/sampling_logp_difference/max": 0.4570639133453369, + "sampling/sampling_logp_difference/mean": 0.01476360484957695, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 180.1875, + "completions/mean_terminated_length": 180.1875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.33986639976501465, + "epoch": 0.45486725663716815, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05368425239332466, + "kl": 0.032071635127067566, + "learning_rate": 9.520049644037347e-07, + "loss": 0.0003, + "num_tokens": 5916376.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5971448421478271, + "sampling/importance_sampling_ratio/mean": 0.9999678134918213, + "sampling/importance_sampling_ratio/min": 0.641612708568573, + "sampling/sampling_logp_difference/max": 0.4682176113128662, + "sampling/sampling_logp_difference/mean": 0.014039294794201851, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 150.65625, + "completions/mean_terminated_length": 150.65625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.33248865604400635, + "epoch": 0.45663716814159294, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05925151073793785, + "kl": 0.031930696219205856, + "learning_rate": 9.513425006563078e-07, + "loss": 0.0004, + "num_tokens": 5936962.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.7888113260269165, + "sampling/importance_sampling_ratio/mean": 0.9999220371246338, + "sampling/importance_sampling_ratio/min": 0.652288019657135, + "sampling/sampling_logp_difference/max": 0.5815513134002686, + "sampling/sampling_logp_difference/mean": 0.01445157453417778, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 160.984375, + "completions/mean_terminated_length": 160.984375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.33986949920654297, + "epoch": 0.4584070796460177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08418805660577101, + "kl": 0.04477149248123169, + "learning_rate": 9.506757300194248e-07, + "loss": 0.0005, + "num_tokens": 5970113.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4855422973632812, + "sampling/importance_sampling_ratio/mean": 1.0002464056015015, + "sampling/importance_sampling_ratio/min": 0.547514796257019, + "sampling/sampling_logp_difference/max": 0.6023657917976379, + "sampling/sampling_logp_difference/mean": 0.014705209992825985, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 192.90625, + "completions/mean_terminated_length": 192.90625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.3955267071723938, + "epoch": 0.46017699115044247, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.053332440143998176, + "kl": 0.039812974631786346, + "learning_rate": 9.500046588556761e-07, + "loss": 0.0004, + "num_tokens": 5993611.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4954570531845093, + "sampling/importance_sampling_ratio/mean": 0.9997931122779846, + "sampling/importance_sampling_ratio/min": 0.7331216931343079, + "sampling/sampling_logp_difference/max": 0.402431845664978, + "sampling/sampling_logp_difference/mean": 0.015132974833250046, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 167.609375, + "completions/mean_terminated_length": 167.609375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.3669849634170532, + "epoch": 0.46194690265486726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0387119018057199, + "kl": 0.029329845681786537, + "learning_rate": 9.493292935686894e-07, + "loss": 0.0003, + "num_tokens": 6014706.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6007599830627441, + "sampling/importance_sampling_ratio/mean": 1.000851035118103, + "sampling/importance_sampling_ratio/min": 0.6125900745391846, + "sampling/sampling_logp_difference/max": 0.49005937576293945, + "sampling/sampling_logp_difference/mean": 0.015224466100335121, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1115.0, + "completions/max_terminated_length": 1115.0, + "completions/mean_length": 231.703125, + "completions/mean_terminated_length": 231.703125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.31691044569015503, + "epoch": 0.46371681415929206, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04087936515212419, + "kl": 0.02219213917851448, + "learning_rate": 9.486496406030685e-07, + "loss": 0.0003, + "num_tokens": 6041631.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.333465576171875, + "sampling/importance_sampling_ratio/mean": 1.0001709461212158, + "sampling/importance_sampling_ratio/min": 0.6453405618667603, + "sampling/sampling_logp_difference/max": 0.4379770755767822, + "sampling/sampling_logp_difference/mean": 0.013479698449373245, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 200.1875, + "completions/mean_terminated_length": 200.1875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.45341458916664124, + "epoch": 0.4654867256637168, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03696931697805135, + "kl": 0.041830264031887054, + "learning_rate": 9.479657064443321e-07, + "loss": 0.0004, + "num_tokens": 6066843.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3260716199874878, + "sampling/importance_sampling_ratio/mean": 0.9999263286590576, + "sampling/importance_sampling_ratio/min": 0.684958815574646, + "sampling/sampling_logp_difference/max": 0.37839651107788086, + "sampling/sampling_logp_difference/mean": 0.016536952927708626, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 253.03125, + "completions/mean_terminated_length": 253.03125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.34183597564697266, + "epoch": 0.4672566371681416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0236482152955727, + "kl": 0.02626795694231987, + "learning_rate": 9.472774976188513e-07, + "loss": 0.0003, + "num_tokens": 6094653.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3353275060653687, + "sampling/importance_sampling_ratio/mean": 1.0005013942718506, + "sampling/importance_sampling_ratio/min": 0.7414844036102295, + "sampling/sampling_logp_difference/max": 0.2991011142730713, + "sampling/sampling_logp_difference/mean": 0.012045430950820446, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 192.5625, + "completions/mean_terminated_length": 192.5625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.27034690976142883, + "epoch": 0.4690265486725664, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03322347176542136, + "kl": 0.02082662284374237, + "learning_rate": 9.465850206937887e-07, + "loss": 0.0002, + "num_tokens": 6117521.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3415220975875854, + "sampling/importance_sampling_ratio/mean": 0.9999597072601318, + "sampling/importance_sampling_ratio/min": 0.6079509258270264, + "sampling/sampling_logp_difference/max": 0.49766111373901367, + "sampling/sampling_logp_difference/mean": 0.013065982609987259, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 238.0, + "completions/max_terminated_length": 238.0, + "completions/mean_length": 136.28125, + "completions/mean_terminated_length": 136.28125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.20872843265533447, + "epoch": 0.47079646017699117, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.042854545735950086, + "kl": 0.01808926649391651, + "learning_rate": 9.45888282277034e-07, + "loss": 0.0002, + "num_tokens": 6136003.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3818767070770264, + "sampling/importance_sampling_ratio/mean": 0.9991451501846313, + "sampling/importance_sampling_ratio/min": 0.3858293294906616, + "sampling/sampling_logp_difference/max": 0.9523601531982422, + "sampling/sampling_logp_difference/mean": 0.012654677964746952, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 231.828125, + "completions/mean_terminated_length": 231.828125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.3623976707458496, + "epoch": 0.4725663716814159, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03701270794177717, + "kl": 0.03097117319703102, + "learning_rate": 9.451872890171419e-07, + "loss": 0.0003, + "num_tokens": 6161704.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4505499601364136, + "sampling/importance_sampling_ratio/mean": 1.0002344846725464, + "sampling/importance_sampling_ratio/min": 0.7210965752601624, + "sampling/sampling_logp_difference/max": 0.37194275856018066, + "sampling/sampling_logp_difference/mean": 0.013468829914927483, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 296.28125, + "completions/mean_terminated_length": 296.28125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.5455514192581177, + "epoch": 0.4743362831858407, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.659294837514523, + "kl": 0.041745543479919434, + "learning_rate": 9.444820476032685e-07, + "loss": -0.0268, + "num_tokens": 6194010.0, + "reward": -0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": -0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.4530071020126343, + "sampling/importance_sampling_ratio/mean": 1.0003100633621216, + "sampling/importance_sampling_ratio/min": 0.6299753785133362, + "sampling/sampling_logp_difference/max": 0.46207451820373535, + "sampling/sampling_logp_difference/mean": 0.016169343143701553, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 574.0, + "completions/max_terminated_length": 574.0, + "completions/mean_length": 178.109375, + "completions/mean_terminated_length": 178.109375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.32269760966300964, + "epoch": 0.4761061946902655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035257563406750556, + "kl": 0.01920374296605587, + "learning_rate": 9.437725647651078e-07, + "loss": 0.0002, + "num_tokens": 6218465.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6268205642700195, + "sampling/importance_sampling_ratio/mean": 1.0004115104675293, + "sampling/importance_sampling_ratio/min": 0.6369473934173584, + "sampling/sampling_logp_difference/max": 0.48662757873535156, + "sampling/sampling_logp_difference/mean": 0.015678048133850098, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 190.265625, + "completions/mean_terminated_length": 190.265625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.37728631496429443, + "epoch": 0.4778761061946903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06004114459586334, + "kl": 0.03412202000617981, + "learning_rate": 9.430588472728269e-07, + "loss": 0.0004, + "num_tokens": 6239938.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4148181676864624, + "sampling/importance_sampling_ratio/mean": 0.9995939135551453, + "sampling/importance_sampling_ratio/min": 0.6361550688743591, + "sampling/sampling_logp_difference/max": 0.4523129463195801, + "sampling/sampling_logp_difference/mean": 0.013349948450922966, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 973.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 228.203125, + "completions/mean_terminated_length": 228.203125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.311359167098999, + "epoch": 0.479646017699115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023194905350027488, + "kl": 0.01969325914978981, + "learning_rate": 9.423409019370014e-07, + "loss": 0.0002, + "num_tokens": 6264255.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.578434944152832, + "sampling/importance_sampling_ratio/mean": 0.999988317489624, + "sampling/importance_sampling_ratio/min": 0.6299833655357361, + "sampling/sampling_logp_difference/max": 0.46206188201904297, + "sampling/sampling_logp_difference/mean": 0.014301535673439503, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 153.890625, + "completions/mean_terminated_length": 153.890625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.39541131258010864, + "epoch": 0.4814159292035398, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036914813075702546, + "kl": 0.031673021614551544, + "learning_rate": 9.416187356085512e-07, + "loss": 0.0004, + "num_tokens": 6288504.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.6045210361480713, + "sampling/importance_sampling_ratio/mean": 0.999925971031189, + "sampling/importance_sampling_ratio/min": 0.634973406791687, + "sampling/sampling_logp_difference/max": 0.472825288772583, + "sampling/sampling_logp_difference/mean": 0.01636761799454689, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 154.640625, + "completions/mean_terminated_length": 154.640625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.27730217576026917, + "epoch": 0.4831858407079646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.019819123880060466, + "kl": 0.015538809821009636, + "learning_rate": 9.408923551786742e-07, + "loss": 0.0002, + "num_tokens": 6310129.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.28533136844635, + "sampling/importance_sampling_ratio/mean": 1.0004087686538696, + "sampling/importance_sampling_ratio/min": 0.7605655193328857, + "sampling/sampling_logp_difference/max": 0.2736930251121521, + "sampling/sampling_logp_difference/mean": 0.013058988377451897, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 148.1875, + "completions/mean_terminated_length": 148.1875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.33199161291122437, + "epoch": 0.4849557522123894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025359344994080425, + "kl": 0.019676754251122475, + "learning_rate": 9.40161767578781e-07, + "loss": 0.0002, + "num_tokens": 6330909.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6070911884307861, + "sampling/importance_sampling_ratio/mean": 0.9996774196624756, + "sampling/importance_sampling_ratio/min": 0.6098532676696777, + "sampling/sampling_logp_difference/max": 0.4945368766784668, + "sampling/sampling_logp_difference/mean": 0.013872501440346241, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 119.03125, + "completions/mean_terminated_length": 119.03125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.25699901580810547, + "epoch": 0.48672566371681414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030136405749096117, + "kl": 0.013796394690871239, + "learning_rate": 9.394269797804288e-07, + "loss": 0.0001, + "num_tokens": 6348767.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6629855632781982, + "sampling/importance_sampling_ratio/mean": 1.0000479221343994, + "sampling/importance_sampling_ratio/min": 0.49850544333457947, + "sampling/sampling_logp_difference/max": 0.6961407661437988, + "sampling/sampling_logp_difference/mean": 0.013275440782308578, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 195.703125, + "completions/mean_terminated_length": 195.703125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.4806135892868042, + "epoch": 0.48849557522123893, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027145594527740468, + "kl": 0.03496091812849045, + "learning_rate": 9.386879987952549e-07, + "loss": 0.0004, + "num_tokens": 6380412.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4901208877563477, + "sampling/importance_sampling_ratio/mean": 1.0001537799835205, + "sampling/importance_sampling_ratio/min": 0.6103525161743164, + "sampling/sampling_logp_difference/max": 0.49371862411499023, + "sampling/sampling_logp_difference/mean": 0.01681288704276085, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 214.0, + "completions/max_terminated_length": 214.0, + "completions/mean_length": 120.90625, + "completions/mean_terminated_length": 120.90625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.27076271176338196, + "epoch": 0.4902654867256637, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027781639445950742, + "kl": 0.016093842685222626, + "learning_rate": 9.37944831674909e-07, + "loss": 0.0002, + "num_tokens": 6396918.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.40231192111969, + "sampling/importance_sampling_ratio/mean": 1.0001559257507324, + "sampling/importance_sampling_ratio/min": 0.6182749271392822, + "sampling/sampling_logp_difference/max": 0.4808220863342285, + "sampling/sampling_logp_difference/mean": 0.013468633405864239, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 185.421875, + "completions/mean_terminated_length": 185.421875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.3222809135913849, + "epoch": 0.4920353982300885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02109532999968717, + "kl": 0.021185191348195076, + "learning_rate": 9.371974855109874e-07, + "loss": 0.0002, + "num_tokens": 6420369.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.443979024887085, + "sampling/importance_sampling_ratio/mean": 1.0000238418579102, + "sampling/importance_sampling_ratio/min": 0.6533827781677246, + "sampling/sampling_logp_difference/max": 0.42559218406677246, + "sampling/sampling_logp_difference/mean": 0.012732608243823051, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 167.0625, + "completions/mean_terminated_length": 167.0625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.3757314085960388, + "epoch": 0.49380530973451325, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02220712023150318, + "kl": 0.02302708476781845, + "learning_rate": 9.36445967434964e-07, + "loss": 0.0003, + "num_tokens": 6440965.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3643826246261597, + "sampling/importance_sampling_ratio/mean": 0.9994344115257263, + "sampling/importance_sampling_ratio/min": 0.5260617733001709, + "sampling/sampling_logp_difference/max": 0.6423366069793701, + "sampling/sampling_logp_difference/mean": 0.015564961358904839, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 206.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 101.328125, + "completions/mean_terminated_length": 101.328125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.2622968852519989, + "epoch": 0.49557522123893805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024359950973413567, + "kl": 0.01559748686850071, + "learning_rate": 9.356902846181228e-07, + "loss": 0.0002, + "num_tokens": 6456506.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.435205340385437, + "sampling/importance_sampling_ratio/mean": 1.0001575946807861, + "sampling/importance_sampling_ratio/min": 0.6743903756141663, + "sampling/sampling_logp_difference/max": 0.39394617080688477, + "sampling/sampling_logp_difference/mean": 0.013536439277231693, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 154.34375, + "completions/mean_terminated_length": 154.34375, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.36557501554489136, + "epoch": 0.49734513274336284, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01883840913930749, + "kl": 0.021856961771845818, + "learning_rate": 9.349304442714895e-07, + "loss": 0.0002, + "num_tokens": 6476640.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7520533800125122, + "sampling/importance_sampling_ratio/mean": 1.000203251838684, + "sampling/importance_sampling_ratio/min": 0.7620862126350403, + "sampling/sampling_logp_difference/max": 0.5607883930206299, + "sampling/sampling_logp_difference/mean": 0.015660658478736877, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 220.4375, + "completions/mean_terminated_length": 220.4375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.519733190536499, + "epoch": 0.49911504424778763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02446708363363976, + "kl": 0.03840360417962074, + "learning_rate": 9.341664536457625e-07, + "loss": 0.0004, + "num_tokens": 6510732.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2997714281082153, + "sampling/importance_sampling_ratio/mean": 1.0001707077026367, + "sampling/importance_sampling_ratio/min": 0.6407306790351868, + "sampling/sampling_logp_difference/max": 0.4451460838317871, + "sampling/sampling_logp_difference/mean": 0.01690497063100338, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 234.578125, + "completions/mean_terminated_length": 234.578125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.5728236436843872, + "epoch": 0.5008849557522124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04478807109354219, + "kl": 0.04850665107369423, + "learning_rate": 9.33398320031244e-07, + "loss": 0.0005, + "num_tokens": 6539425.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3944982290267944, + "sampling/importance_sampling_ratio/mean": 0.9997686743736267, + "sampling/importance_sampling_ratio/min": 0.7058351635932922, + "sampling/sampling_logp_difference/max": 0.34837353229522705, + "sampling/sampling_logp_difference/mean": 0.01805371418595314, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 180.921875, + "completions/mean_terminated_length": 180.921875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.4373040795326233, + "epoch": 0.5026548672566372, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02192972516307828, + "kl": 0.02788272686302662, + "learning_rate": 9.3262605075777e-07, + "loss": 0.0004, + "num_tokens": 6563452.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4271368980407715, + "sampling/importance_sampling_ratio/mean": 0.9991888403892517, + "sampling/importance_sampling_ratio/min": 0.7124289274215698, + "sampling/sampling_logp_difference/max": 0.3556702136993408, + "sampling/sampling_logp_difference/mean": 0.016525931656360626, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 197.21875, + "completions/mean_terminated_length": 197.21875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.5013217329978943, + "epoch": 0.504424778761062, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9948517252301762, + "kl": 0.042211972177028656, + "learning_rate": 9.318496531946409e-07, + "loss": -0.0194, + "num_tokens": 6591754.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.4786263704299927, + "sampling/importance_sampling_ratio/mean": 0.9997700452804565, + "sampling/importance_sampling_ratio/min": 0.6789397597312927, + "sampling/sampling_logp_difference/max": 0.3911135196685791, + "sampling/sampling_logp_difference/mean": 0.01624986156821251, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 132.8125, + "completions/mean_terminated_length": 132.8125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.4127649962902069, + "epoch": 0.5061946902654867, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.021955724850407192, + "kl": 0.02443903684616089, + "learning_rate": 9.310691347505505e-07, + "loss": 0.0003, + "num_tokens": 6611518.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3266682624816895, + "sampling/importance_sampling_ratio/mean": 1.0003299713134766, + "sampling/importance_sampling_ratio/min": 0.7654657363891602, + "sampling/sampling_logp_difference/max": 0.2826707363128662, + "sampling/sampling_logp_difference/mean": 0.015683164820075035, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 187.703125, + "completions/mean_terminated_length": 187.703125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.3855421543121338, + "epoch": 0.5079646017699115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025761441639133714, + "kl": 0.03089805133640766, + "learning_rate": 9.30284502873516e-07, + "loss": 0.0003, + "num_tokens": 6633019.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4548776149749756, + "sampling/importance_sampling_ratio/mean": 0.9999523758888245, + "sampling/importance_sampling_ratio/min": 0.6919315457344055, + "sampling/sampling_logp_difference/max": 0.3749217987060547, + "sampling/sampling_logp_difference/mean": 0.013852078467607498, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 678.0, + "completions/max_terminated_length": 678.0, + "completions/mean_length": 167.265625, + "completions/mean_terminated_length": 167.265625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.28932544589042664, + "epoch": 0.5097345132743363, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020763578839902066, + "kl": 0.017403192818164825, + "learning_rate": 9.294957650508064e-07, + "loss": 0.0002, + "num_tokens": 6653372.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3269850015640259, + "sampling/importance_sampling_ratio/mean": 1.000016689300537, + "sampling/importance_sampling_ratio/min": 0.7129645347595215, + "sampling/sampling_logp_difference/max": 0.3383236527442932, + "sampling/sampling_logp_difference/mean": 0.012063910253345966, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 184.140625, + "completions/mean_terminated_length": 184.140625, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.3914494514465332, + "epoch": 0.511504424778761, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02080754388776366, + "kl": 0.020681608468294144, + "learning_rate": 9.287029288088716e-07, + "loss": 0.0003, + "num_tokens": 6677653.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4709348678588867, + "sampling/importance_sampling_ratio/mean": 0.9999104738235474, + "sampling/importance_sampling_ratio/min": 0.6171886920928955, + "sampling/sampling_logp_difference/max": 0.48258042335510254, + "sampling/sampling_logp_difference/mean": 0.014956386759877205, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 175.484375, + "completions/mean_terminated_length": 175.484375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.5061272382736206, + "epoch": 0.5132743362831859, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03207714367449626, + "kl": 0.03939015045762062, + "learning_rate": 9.279060017132697e-07, + "loss": 0.0005, + "num_tokens": 6705876.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.293717384338379, + "sampling/importance_sampling_ratio/mean": 0.9991098642349243, + "sampling/importance_sampling_ratio/min": 0.7004491090774536, + "sampling/sampling_logp_difference/max": 0.3560335636138916, + "sampling/sampling_logp_difference/mean": 0.017809201031923294, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 210.8125, + "completions/mean_terminated_length": 210.8125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.41961824893951416, + "epoch": 0.5150442477876106, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017195812824318982, + "kl": 0.02614808827638626, + "learning_rate": 9.271049913685959e-07, + "loss": 0.0003, + "num_tokens": 6730776.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3036280870437622, + "sampling/importance_sampling_ratio/mean": 0.9997357130050659, + "sampling/importance_sampling_ratio/min": 0.6956069469451904, + "sampling/sampling_logp_difference/max": 0.3629704713821411, + "sampling/sampling_logp_difference/mean": 0.015091790817677975, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 243.984375, + "completions/mean_terminated_length": 243.984375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.513262152671814, + "epoch": 0.5168141592920354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02756305466878137, + "kl": 0.04059458523988724, + "learning_rate": 9.262999054184091e-07, + "loss": 0.0004, + "num_tokens": 6757991.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5239235162734985, + "sampling/importance_sampling_ratio/mean": 1.0002270936965942, + "sampling/importance_sampling_ratio/min": 0.7383068799972534, + "sampling/sampling_logp_difference/max": 0.42128825187683105, + "sampling/sampling_logp_difference/mean": 0.015886511653661728, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 170.0, + "completions/mean_terminated_length": 170.0, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.44291773438453674, + "epoch": 0.5185840707964602, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023537025702967722, + "kl": 0.025342687964439392, + "learning_rate": 9.254907515451591e-07, + "loss": 0.0003, + "num_tokens": 6779511.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4934041500091553, + "sampling/importance_sampling_ratio/mean": 1.000349521636963, + "sampling/importance_sampling_ratio/min": 0.6546944975852966, + "sampling/sampling_logp_difference/max": 0.4235866069793701, + "sampling/sampling_logp_difference/mean": 0.015553380362689495, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 270.359375, + "completions/mean_terminated_length": 270.359375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.6321618556976318, + "epoch": 0.5203539823008849, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6718029852191244, + "kl": 0.05273823440074921, + "learning_rate": 9.246775374701138e-07, + "loss": 0.0073, + "num_tokens": 6830126.0, + "reward": -0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": -0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.437933325767517, + "sampling/importance_sampling_ratio/mean": 0.999913215637207, + "sampling/importance_sampling_ratio/min": 0.6157731413841248, + "sampling/sampling_logp_difference/max": 0.4848766326904297, + "sampling/sampling_logp_difference/mean": 0.018297888338565826, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 186.0, + "completions/max_terminated_length": 186.0, + "completions/mean_length": 113.25, + "completions/mean_terminated_length": 113.25, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.30457520484924316, + "epoch": 0.5221238938053098, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.014786051129669832, + "kl": 0.011042892932891846, + "learning_rate": 9.23860270953285e-07, + "loss": 0.0001, + "num_tokens": 6846686.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.297924518585205, + "sampling/importance_sampling_ratio/mean": 0.9996802806854248, + "sampling/importance_sampling_ratio/min": 0.5011434555053711, + "sampling/sampling_logp_difference/max": 0.6908628940582275, + "sampling/sampling_logp_difference/mean": 0.013864563778042793, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 182.171875, + "completions/mean_terminated_length": 182.171875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.34369033575057983, + "epoch": 0.5238938053097345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029699791224600255, + "kl": 0.023969898000359535, + "learning_rate": 9.230389597933543e-07, + "loss": 0.0003, + "num_tokens": 6868745.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3634611368179321, + "sampling/importance_sampling_ratio/mean": 0.999811589717865, + "sampling/importance_sampling_ratio/min": 0.6313598155975342, + "sampling/sampling_logp_difference/max": 0.45987939834594727, + "sampling/sampling_logp_difference/mean": 0.013427523896098137, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 164.265625, + "completions/mean_terminated_length": 164.265625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.43866345286369324, + "epoch": 0.5256637168141592, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035562733657815176, + "kl": 0.03041796386241913, + "learning_rate": 9.222136118275995e-07, + "loss": 0.0004, + "num_tokens": 6890698.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.515989065170288, + "sampling/importance_sampling_ratio/mean": 1.0003479719161987, + "sampling/importance_sampling_ratio/min": 0.662236213684082, + "sampling/sampling_logp_difference/max": 0.41606807708740234, + "sampling/sampling_logp_difference/mean": 0.015883635729551315, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 173.0, + "completions/max_terminated_length": 173.0, + "completions/mean_length": 112.890625, + "completions/mean_terminated_length": 112.890625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "entropy": 0.27075040340423584, + "epoch": 0.5274336283185841, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.016403369626088807, + "kl": 0.012253494933247566, + "learning_rate": 9.213842349318184e-07, + "loss": 0.0001, + "num_tokens": 6906963.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3670459985733032, + "sampling/importance_sampling_ratio/mean": 0.9997448921203613, + "sampling/importance_sampling_ratio/min": 0.5639070868492126, + "sampling/sampling_logp_difference/max": 0.5728658437728882, + "sampling/sampling_logp_difference/mean": 0.013015816919505596, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 234.375, + "completions/mean_terminated_length": 234.375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.5180535316467285, + "epoch": 0.5292035398230088, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7780875731753081, + "kl": 0.039922792464494705, + "learning_rate": 9.205508370202551e-07, + "loss": 0.0211, + "num_tokens": 6934955.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.3283700942993164, + "sampling/importance_sampling_ratio/mean": 1.0002131462097168, + "sampling/importance_sampling_ratio/min": 0.751277506351471, + "sampling/sampling_logp_difference/max": 0.285980224609375, + "sampling/sampling_logp_difference/mean": 0.016268664970993996, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 190.5625, + "completions/mean_terminated_length": 190.5625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.49133166670799255, + "epoch": 0.5309734513274337, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023107569257414747, + "kl": 0.03035806678235531, + "learning_rate": 9.197134260455233e-07, + "loss": 0.0003, + "num_tokens": 6961407.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3000757694244385, + "sampling/importance_sampling_ratio/mean": 0.9997725486755371, + "sampling/importance_sampling_ratio/min": 0.7143741250038147, + "sampling/sampling_logp_difference/max": 0.3363485336303711, + "sampling/sampling_logp_difference/mean": 0.015370436944067478, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 162.84375, + "completions/mean_terminated_length": 162.84375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.4394567608833313, + "epoch": 0.5327433628318584, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3449971981015074, + "kl": 0.025277450680732727, + "learning_rate": 9.188720099985315e-07, + "loss": -0.0636, + "num_tokens": 6984389.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.3278049230575562, + "sampling/importance_sampling_ratio/mean": 1.0007739067077637, + "sampling/importance_sampling_ratio/min": 0.719417929649353, + "sampling/sampling_logp_difference/max": 0.329312801361084, + "sampling/sampling_logp_difference/mean": 0.015836309641599655, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 190.59375, + "completions/mean_terminated_length": 190.59375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.4766071140766144, + "epoch": 0.5345132743362832, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9641606035355126, + "kl": 0.029689954593777657, + "learning_rate": 9.180265969084056e-07, + "loss": -0.0054, + "num_tokens": 7008811.0, + "reward": 0.03125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.03125, + "rewards/decision_reward_func/std": 1.0074130296707153, + "sampling/importance_sampling_ratio/max": 1.44169020652771, + "sampling/importance_sampling_ratio/mean": 1.0000590085983276, + "sampling/importance_sampling_ratio/min": 0.7592349052429199, + "sampling/sampling_logp_difference/max": 0.3658161163330078, + "sampling/sampling_logp_difference/mean": 0.01634790375828743, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 207.09375, + "completions/mean_terminated_length": 207.09375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.4701969027519226, + "epoch": 0.536283185840708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.023617708610028605, + "kl": 0.029484564438462257, + "learning_rate": 9.171771948424136e-07, + "loss": 0.0003, + "num_tokens": 7034529.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4577841758728027, + "sampling/importance_sampling_ratio/mean": 1.0001473426818848, + "sampling/importance_sampling_ratio/min": 0.6138359308242798, + "sampling/sampling_logp_difference/max": 0.48802757263183594, + "sampling/sampling_logp_difference/mean": 0.016000833362340927, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 123.984375, + "completions/mean_terminated_length": 123.984375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.4232381582260132, + "epoch": 0.5380530973451327, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017957166653677577, + "kl": 0.017481127753853798, + "learning_rate": 9.163238119058871e-07, + "loss": 0.0002, + "num_tokens": 7052288.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4640995264053345, + "sampling/importance_sampling_ratio/mean": 0.9997575879096985, + "sampling/importance_sampling_ratio/min": 0.7615348696708679, + "sampling/sampling_logp_difference/max": 0.3812403678894043, + "sampling/sampling_logp_difference/mean": 0.01718355156481266, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 124.765625, + "completions/mean_terminated_length": 124.765625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.34873998165130615, + "epoch": 0.5398230088495575, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.01970275105905271, + "kl": 0.01254827156662941, + "learning_rate": 9.154664562421453e-07, + "loss": 0.0001, + "num_tokens": 7070497.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5928269624710083, + "sampling/importance_sampling_ratio/mean": 0.9998602271080017, + "sampling/importance_sampling_ratio/min": 0.7438759207725525, + "sampling/sampling_logp_difference/max": 0.46551036834716797, + "sampling/sampling_logp_difference/mean": 0.01575476862490177, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 146.6875, + "completions/mean_terminated_length": 146.6875, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.45946380496025085, + "epoch": 0.5415929203539823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027331330504074755, + "kl": 0.025959448888897896, + "learning_rate": 9.146051360324165e-07, + "loss": 0.0003, + "num_tokens": 7090989.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4168685674667358, + "sampling/importance_sampling_ratio/mean": 1.0000064373016357, + "sampling/importance_sampling_ratio/min": 0.610821545124054, + "sampling/sampling_logp_difference/max": 0.492950439453125, + "sampling/sampling_logp_difference/mean": 0.01741579920053482, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 179.34375, + "completions/mean_terminated_length": 179.34375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.3625985383987427, + "epoch": 0.5433628318584071, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02555287441167141, + "kl": 0.022509675472974777, + "learning_rate": 9.137398594957603e-07, + "loss": 0.0003, + "num_tokens": 7112979.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3600614070892334, + "sampling/importance_sampling_ratio/mean": 1.0002243518829346, + "sampling/importance_sampling_ratio/min": 0.645759105682373, + "sampling/sampling_logp_difference/max": 0.4373287856578827, + "sampling/sampling_logp_difference/mean": 0.014308637008070946, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 201.90625, + "completions/mean_terminated_length": 201.90625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.4243144094944, + "epoch": 0.5451327433628319, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020160252089109758, + "kl": 0.019518980756402016, + "learning_rate": 9.128706348889894e-07, + "loss": 0.0002, + "num_tokens": 7136429.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4510719776153564, + "sampling/importance_sampling_ratio/mean": 0.9998438358306885, + "sampling/importance_sampling_ratio/min": 0.7054146528244019, + "sampling/sampling_logp_difference/max": 0.3723025321960449, + "sampling/sampling_logp_difference/mean": 0.01521989330649376, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 171.28125, + "completions/mean_terminated_length": 171.28125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.4476414620876312, + "epoch": 0.5469026548672566, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1123146738256926, + "kl": 0.02239086851477623, + "learning_rate": 9.1199747050659e-07, + "loss": -0.0148, + "num_tokens": 7159679.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.2853026390075684, + "sampling/importance_sampling_ratio/mean": 0.9998779296875, + "sampling/importance_sampling_ratio/min": 0.6334834098815918, + "sampling/sampling_logp_difference/max": 0.45652151107788086, + "sampling/sampling_logp_difference/mean": 0.01625833287835121, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 192.46875, + "completions/mean_terminated_length": 192.46875, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.3643852472305298, + "epoch": 0.5486725663716814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.017628714255359525, + "kl": 0.01792171411216259, + "learning_rate": 9.111203746806439e-07, + "loss": 0.0002, + "num_tokens": 7182349.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4224846363067627, + "sampling/importance_sampling_ratio/mean": 1.000328540802002, + "sampling/importance_sampling_ratio/min": 0.6199836730957031, + "sampling/sampling_logp_difference/max": 0.47806215286254883, + "sampling/sampling_logp_difference/mean": 0.013797037303447723, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 213.234375, + "completions/mean_terminated_length": 213.234375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.5292292833328247, + "epoch": 0.5504424778761062, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3727710535787903, + "kl": 0.03132692724466324, + "learning_rate": 9.102393557807476e-07, + "loss": -0.0237, + "num_tokens": 7208476.0, + "reward": 0.15625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.4008958339691162, + "sampling/importance_sampling_ratio/mean": 1.0000406503677368, + "sampling/importance_sampling_ratio/min": 0.6428192853927612, + "sampling/sampling_logp_difference/max": 0.4418916702270508, + "sampling/sampling_logp_difference/mean": 0.01771797239780426, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 141.9375, + "completions/mean_terminated_length": 141.9375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.4951961934566498, + "epoch": 0.552212389380531, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1948294530666905, + "kl": 0.02944597788155079, + "learning_rate": 9.093544222139337e-07, + "loss": -0.0143, + "num_tokens": 7227896.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.632109522819519, + "sampling/importance_sampling_ratio/mean": 0.9997971057891846, + "sampling/importance_sampling_ratio/min": 0.6881688237190247, + "sampling/sampling_logp_difference/max": 0.48987340927124023, + "sampling/sampling_logp_difference/mean": 0.016983861103653908, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 216.0, + "completions/max_terminated_length": 216.0, + "completions/mean_length": 113.109375, + "completions/mean_terminated_length": 113.109375, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.3199790418148041, + "epoch": 0.5539823008849557, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022995973721823926, + "kl": 0.013230049051344395, + "learning_rate": 9.084655824245897e-07, + "loss": 0.0001, + "num_tokens": 7245567.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2884671688079834, + "sampling/importance_sampling_ratio/mean": 0.9997365474700928, + "sampling/importance_sampling_ratio/min": 0.6622399091720581, + "sampling/sampling_logp_difference/max": 0.41212737560272217, + "sampling/sampling_logp_difference/mean": 0.01478345226496458, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 179.875, + "completions/mean_terminated_length": 179.875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.4696625769138336, + "epoch": 0.5557522123893806, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1544001954538277, + "kl": 0.03679889440536499, + "learning_rate": 9.075728448943781e-07, + "loss": 0.0253, + "num_tokens": 7268551.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.5777137279510498, + "sampling/importance_sampling_ratio/mean": 0.999577522277832, + "sampling/importance_sampling_ratio/min": 0.6172122955322266, + "sampling/sampling_logp_difference/max": 0.4825422763824463, + "sampling/sampling_logp_difference/mean": 0.015795571729540825, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 280.515625, + "completions/mean_terminated_length": 280.515625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.7630300521850586, + "epoch": 0.5575221238938053, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.219075142426945, + "kl": 0.047929853200912476, + "learning_rate": 9.066762181421552e-07, + "loss": -0.0221, + "num_tokens": 7306632.0, + "reward": 0.34375, + "reward_std": 0.6205305457115173, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.2905977964401245, + "sampling/importance_sampling_ratio/mean": 0.9999635219573975, + "sampling/importance_sampling_ratio/min": 0.69340980052948, + "sampling/sampling_logp_difference/max": 0.3661341667175293, + "sampling/sampling_logp_difference/mean": 0.01918640546500683, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.0, + "completions/max_terminated_length": 454.0, + "completions/mean_length": 166.75, + "completions/mean_terminated_length": 166.75, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.4249437749385834, + "epoch": 0.5592920353982301, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0349278536279007, + "kl": 0.0165907870978117, + "learning_rate": 9.057757107238894e-07, + "loss": -0.0517, + "num_tokens": 7327704.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.4352306127548218, + "sampling/importance_sampling_ratio/mean": 0.9991052150726318, + "sampling/importance_sampling_ratio/min": 0.6202441453933716, + "sampling/sampling_logp_difference/max": 0.4776420593261719, + "sampling/sampling_logp_difference/mean": 0.016329392790794373, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 109.09375, + "completions/mean_terminated_length": 109.09375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.34463635087013245, + "epoch": 0.5610619469026549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03992014966658588, + "kl": 0.014742556028068066, + "learning_rate": 9.048713312325804e-07, + "loss": 0.0001, + "num_tokens": 7344350.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4840832948684692, + "sampling/importance_sampling_ratio/mean": 0.9999971985816956, + "sampling/importance_sampling_ratio/min": 0.61069256067276, + "sampling/sampling_logp_difference/max": 0.493161678314209, + "sampling/sampling_logp_difference/mean": 0.017180006951093674, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 153.765625, + "completions/mean_terminated_length": 153.765625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.45609360933303833, + "epoch": 0.5628318584070796, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1303939749009286, + "kl": 0.04069334641098976, + "learning_rate": 9.039630882981768e-07, + "loss": 0.0028, + "num_tokens": 7365327.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.4444701671600342, + "sampling/importance_sampling_ratio/mean": 1.0001757144927979, + "sampling/importance_sampling_ratio/min": 0.7404763698577881, + "sampling/sampling_logp_difference/max": 0.36774253845214844, + "sampling/sampling_logp_difference/mean": 0.016026653349399567, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 204.34375, + "completions/mean_terminated_length": 204.34375, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.512562096118927, + "epoch": 0.5646017699115045, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8567449108608022, + "kl": 0.024660678580403328, + "learning_rate": 9.030509905874932e-07, + "loss": 0.0184, + "num_tokens": 7391493.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.388451099395752, + "sampling/importance_sampling_ratio/mean": 1.0002787113189697, + "sampling/importance_sampling_ratio/min": 0.6836274862289429, + "sampling/sampling_logp_difference/max": 0.38034212589263916, + "sampling/sampling_logp_difference/mean": 0.017032500356435776, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 416.0, + "completions/max_terminated_length": 416.0, + "completions/mean_length": 200.265625, + "completions/mean_terminated_length": 200.265625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.4987943470478058, + "epoch": 0.5663716814159292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02476306652548267, + "kl": 0.027325624600052834, + "learning_rate": 9.021350468041287e-07, + "loss": 0.0003, + "num_tokens": 7416406.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3389766216278076, + "sampling/importance_sampling_ratio/mean": 1.0004692077636719, + "sampling/importance_sampling_ratio/min": 0.6792095303535461, + "sampling/sampling_logp_difference/max": 0.3868255615234375, + "sampling/sampling_logp_difference/mean": 0.01691562309861183, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 175.34375, + "completions/mean_terminated_length": 175.34375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.5358196496963501, + "epoch": 0.5681415929203539, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.197771469521168, + "kl": 0.03339190408587456, + "learning_rate": 9.012152656883822e-07, + "loss": -0.0202, + "num_tokens": 7442124.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.3223949670791626, + "sampling/importance_sampling_ratio/mean": 0.999968409538269, + "sampling/importance_sampling_ratio/min": 0.6171384453773499, + "sampling/sampling_logp_difference/max": 0.4826619625091553, + "sampling/sampling_logp_difference/mean": 0.01843232661485672, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 140.234375, + "completions/mean_terminated_length": 140.234375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.24724894762039185, + "epoch": 0.5699115044247788, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037065985528952475, + "kl": 0.01480955071747303, + "learning_rate": 9.002916560171712e-07, + "loss": 0.0001, + "num_tokens": 7460347.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.3267300128936768, + "sampling/importance_sampling_ratio/mean": 1.0000386238098145, + "sampling/importance_sampling_ratio/min": 0.6084439158439636, + "sampling/sampling_logp_difference/max": 0.49685049057006836, + "sampling/sampling_logp_difference/mean": 0.01217254064977169, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 254.9375, + "completions/mean_terminated_length": 254.9375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.4838610589504242, + "epoch": 0.5716814159292035, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.4102688538404478, + "kl": 0.02883777767419815, + "learning_rate": 8.993642266039456e-07, + "loss": 0.1031, + "num_tokens": 7489527.0, + "reward": 0.8125, + "reward_std": 0.4973389506340027, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.4121158123016357, + "sampling/importance_sampling_ratio/mean": 1.000025749206543, + "sampling/importance_sampling_ratio/min": 0.5279315114021301, + "sampling/sampling_logp_difference/max": 0.6387887001037598, + "sampling/sampling_logp_difference/mean": 0.01626652479171753, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 176.375, + "completions/mean_terminated_length": 176.375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.580811083316803, + "epoch": 0.5734513274336284, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.7990374585024924, + "kl": 0.0485653281211853, + "learning_rate": 8.984329862986055e-07, + "loss": -0.0115, + "num_tokens": 7514879.0, + "reward": 0.15625, + "reward_std": 0.5827301740646362, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.6363136768341064, + "sampling/importance_sampling_ratio/mean": 0.9998959898948669, + "sampling/importance_sampling_ratio/min": 0.6743044853210449, + "sampling/sampling_logp_difference/max": 0.4924459457397461, + "sampling/sampling_logp_difference/mean": 0.01841394044458866, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 921.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 161.28125, + "completions/mean_terminated_length": 161.28125, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.4510684609413147, + "epoch": 0.5752212389380531, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03822921565943506, + "kl": 0.02629006840288639, + "learning_rate": 8.97497943987416e-07, + "loss": 0.0003, + "num_tokens": 7535505.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.311832070350647, + "sampling/importance_sampling_ratio/mean": 0.9999359250068665, + "sampling/importance_sampling_ratio/min": 0.644705593585968, + "sampling/sampling_logp_difference/max": 0.4389615058898926, + "sampling/sampling_logp_difference/mean": 0.017147788777947426, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 158.0, + "completions/max_terminated_length": 158.0, + "completions/mean_length": 99.234375, + "completions/mean_terminated_length": 99.234375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.231645405292511, + "epoch": 0.5769911504424778, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04359184759156683, + "kl": 0.016105014830827713, + "learning_rate": 8.96559108592922e-07, + "loss": 0.0002, + "num_tokens": 7550720.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.451245903968811, + "sampling/importance_sampling_ratio/mean": 0.9995250105857849, + "sampling/importance_sampling_ratio/min": 0.6763100028038025, + "sampling/sampling_logp_difference/max": 0.39110374450683594, + "sampling/sampling_logp_difference/mean": 0.01318065170198679, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 199.8125, + "completions/mean_terminated_length": 199.8125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.43389004468917847, + "epoch": 0.5787610619469027, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8302330875336456, + "kl": 0.034609388560056686, + "learning_rate": 8.956164890738642e-07, + "loss": 0.0015, + "num_tokens": 7575060.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.3011858463287354, + "sampling/importance_sampling_ratio/mean": 1.000328540802002, + "sampling/importance_sampling_ratio/min": 0.7293118834495544, + "sampling/sampling_logp_difference/max": 0.31565380096435547, + "sampling/sampling_logp_difference/mean": 0.0162799172103405, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 155.09375, + "completions/mean_terminated_length": 155.09375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.27465489506721497, + "epoch": 0.5805309734513274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035554886971738475, + "kl": 0.021240783855319023, + "learning_rate": 8.946700944250924e-07, + "loss": 0.0002, + "num_tokens": 7594906.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2974066734313965, + "sampling/importance_sampling_ratio/mean": 0.9997029304504395, + "sampling/importance_sampling_ratio/min": 0.6262715458869934, + "sampling/sampling_logp_difference/max": 0.4679713249206543, + "sampling/sampling_logp_difference/mean": 0.013311211951076984, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 229.59375, + "completions/mean_terminated_length": 229.59375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.4771236479282379, + "epoch": 0.5823008849557522, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8169328181258667, + "kl": 0.023737013339996338, + "learning_rate": 8.937199336774804e-07, + "loss": -0.0056, + "num_tokens": 7621680.0, + "reward": 0.3125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.3591655492782593, + "sampling/importance_sampling_ratio/mean": 1.00018310546875, + "sampling/importance_sampling_ratio/min": 0.6493704915046692, + "sampling/sampling_logp_difference/max": 0.4317518472671509, + "sampling/sampling_logp_difference/mean": 0.015672830864787102, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1040.0, + "completions/max_terminated_length": 1040.0, + "completions/mean_length": 243.203125, + "completions/mean_terminated_length": 243.203125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.5708993077278137, + "epoch": 0.584070796460177, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.5306653694538026, + "kl": 0.036455169320106506, + "learning_rate": 8.927660158978392e-07, + "loss": -0.1243, + "num_tokens": 7650397.0, + "reward": 0.15625, + "reward_std": 0.6505630612373352, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.6598302125930786, + "sampling/importance_sampling_ratio/mean": 1.0003546476364136, + "sampling/importance_sampling_ratio/min": 0.7050821185112, + "sampling/sampling_logp_difference/max": 0.5067152976989746, + "sampling/sampling_logp_difference/mean": 0.017485732212662697, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 174.65625, + "completions/mean_terminated_length": 174.65625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.39911821484565735, + "epoch": 0.5858407079646017, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027272171052515736, + "kl": 0.017386943101882935, + "learning_rate": 8.918083501888316e-07, + "loss": 0.0002, + "num_tokens": 7673063.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4191452264785767, + "sampling/importance_sampling_ratio/mean": 1.0000393390655518, + "sampling/importance_sampling_ratio/min": 0.6963819861412048, + "sampling/sampling_logp_difference/max": 0.36185693740844727, + "sampling/sampling_logp_difference/mean": 0.015225267969071865, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.0, + "completions/max_terminated_length": 434.0, + "completions/mean_length": 204.546875, + "completions/mean_terminated_length": 204.546875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.5385278463363647, + "epoch": 0.5876106194690266, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1898785104977088, + "kl": 0.04100334271788597, + "learning_rate": 8.908469456888843e-07, + "loss": 0.0137, + "num_tokens": 7700042.0, + "reward": 0.59375, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.2864842414855957, + "sampling/importance_sampling_ratio/mean": 0.999769926071167, + "sampling/importance_sampling_ratio/min": 0.7454503178596497, + "sampling/sampling_logp_difference/max": 0.2937668561935425, + "sampling/sampling_logp_difference/mean": 0.016970038414001465, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 203.734375, + "completions/mean_terminated_length": 203.734375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.40644773840904236, + "epoch": 0.5893805309734513, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8706282295748837, + "kl": 0.021324289962649345, + "learning_rate": 8.898818115721007e-07, + "loss": -0.0221, + "num_tokens": 7723001.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.377809762954712, + "sampling/importance_sampling_ratio/mean": 0.9999908208847046, + "sampling/importance_sampling_ratio/min": 0.6941059231758118, + "sampling/sampling_logp_difference/max": 0.3651306629180908, + "sampling/sampling_logp_difference/mean": 0.014977142214775085, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 152.953125, + "completions/mean_terminated_length": 152.953125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.4270297884941101, + "epoch": 0.5911504424778761, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026822486395567488, + "kl": 0.02106567844748497, + "learning_rate": 8.889129570481741e-07, + "loss": 0.0002, + "num_tokens": 7743942.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3990821838378906, + "sampling/importance_sampling_ratio/mean": 0.9998767375946045, + "sampling/importance_sampling_ratio/min": 0.6077051162719727, + "sampling/sampling_logp_difference/max": 0.4980654716491699, + "sampling/sampling_logp_difference/mean": 0.015651099383831024, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 846.0, + "completions/max_terminated_length": 846.0, + "completions/mean_length": 198.546875, + "completions/mean_terminated_length": 198.546875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.325345516204834, + "epoch": 0.5929203539823009, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9778925706608806, + "kl": 0.013278895057737827, + "learning_rate": 8.879403913622996e-07, + "loss": -0.023, + "num_tokens": 7766473.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4353985786437988, + "sampling/importance_sampling_ratio/mean": 1.0006654262542725, + "sampling/importance_sampling_ratio/min": 0.7160588502883911, + "sampling/sampling_logp_difference/max": 0.36144256591796875, + "sampling/sampling_logp_difference/mean": 0.013793321326375008, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 165.609375, + "completions/mean_terminated_length": 165.609375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.4705030024051666, + "epoch": 0.5946902654867257, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1162423522001792, + "kl": 0.031908683478832245, + "learning_rate": 8.869641237950849e-07, + "loss": -0.0083, + "num_tokens": 7788256.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.6634169816970825, + "sampling/importance_sampling_ratio/mean": 1.000626564025879, + "sampling/importance_sampling_ratio/min": 0.6571769714355469, + "sampling/sampling_logp_difference/max": 0.5088739395141602, + "sampling/sampling_logp_difference/mean": 0.0166355911642313, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 192.59375, + "completions/mean_terminated_length": 192.59375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.5487221479415894, + "epoch": 0.5964601769911504, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3387937200408704, + "kl": 0.03449319303035736, + "learning_rate": 8.859841636624631e-07, + "loss": 0.0164, + "num_tokens": 7811094.0, + "reward": 0.59375, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.4217135906219482, + "sampling/importance_sampling_ratio/mean": 1.0003790855407715, + "sampling/importance_sampling_ratio/min": 0.6393097639083862, + "sampling/sampling_logp_difference/max": 0.44736623764038086, + "sampling/sampling_logp_difference/mean": 0.01947716996073723, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 115.78125, + "completions/mean_terminated_length": 115.78125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.296596884727478, + "epoch": 0.5982300884955752, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033744789320011105, + "kl": 0.018130596727132797, + "learning_rate": 8.850005203156034e-07, + "loss": 0.0002, + "num_tokens": 7828952.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6452924013137817, + "sampling/importance_sampling_ratio/mean": 0.9998478293418884, + "sampling/importance_sampling_ratio/min": 0.6643782258033752, + "sampling/sampling_logp_difference/max": 0.49791812896728516, + "sampling/sampling_logp_difference/mean": 0.014338867738842964, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 241.0, + "completions/max_terminated_length": 241.0, + "completions/mean_length": 138.28125, + "completions/mean_terminated_length": 138.28125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.23987334966659546, + "epoch": 0.6, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028806517551722206, + "kl": 0.017187688499689102, + "learning_rate": 8.84013203140821e-07, + "loss": 0.0002, + "num_tokens": 7847242.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5104879140853882, + "sampling/importance_sampling_ratio/mean": 1.0002355575561523, + "sampling/importance_sampling_ratio/min": 0.6625238060951233, + "sampling/sampling_logp_difference/max": 0.4124326705932617, + "sampling/sampling_logp_difference/mean": 0.011877824552357197, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 118.0, + "completions/mean_terminated_length": 118.0, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.32074832916259766, + "epoch": 0.6017699115044248, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028997403607390542, + "kl": 0.016999173909425735, + "learning_rate": 8.83022221559489e-07, + "loss": 0.0002, + "num_tokens": 7865306.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.600737452507019, + "sampling/importance_sampling_ratio/mean": 0.9993951320648193, + "sampling/importance_sampling_ratio/min": 0.6845306158065796, + "sampling/sampling_logp_difference/max": 0.47046446800231934, + "sampling/sampling_logp_difference/mean": 0.015037331730127335, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 201.984375, + "completions/mean_terminated_length": 201.984375, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "entropy": 0.4528687596321106, + "epoch": 0.6035398230088496, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8374241068347515, + "kl": 0.03223993629217148, + "learning_rate": 8.820275850279472e-07, + "loss": -0.0105, + "num_tokens": 7889417.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.418492317199707, + "sampling/importance_sampling_ratio/mean": 0.9998888969421387, + "sampling/importance_sampling_ratio/min": 0.7561723589897156, + "sampling/sampling_logp_difference/max": 0.3495945930480957, + "sampling/sampling_logp_difference/mean": 0.016138648614287376, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 642.0, + "completions/max_terminated_length": 642.0, + "completions/mean_length": 216.40625, + "completions/mean_terminated_length": 216.40625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.5420851111412048, + "epoch": 0.6053097345132743, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2654400146455447, + "kl": 0.03524719923734665, + "learning_rate": 8.810293030374125e-07, + "loss": 0.0027, + "num_tokens": 7914755.0, + "reward": 0.8125, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.6463134288787842, + "sampling/importance_sampling_ratio/mean": 1.0001258850097656, + "sampling/importance_sampling_ratio/min": 0.6932322382926941, + "sampling/sampling_logp_difference/max": 0.4985384941101074, + "sampling/sampling_logp_difference/mean": 0.017083358019590378, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 154.421875, + "completions/mean_terminated_length": 154.421875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.3786289691925049, + "epoch": 0.6070796460176991, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03575700654546547, + "kl": 0.025098759680986404, + "learning_rate": 8.800273851138882e-07, + "loss": 0.0003, + "num_tokens": 7934798.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3124302625656128, + "sampling/importance_sampling_ratio/mean": 1.0006170272827148, + "sampling/importance_sampling_ratio/min": 0.6380122303962708, + "sampling/sampling_logp_difference/max": 0.4493978023529053, + "sampling/sampling_logp_difference/mean": 0.01563396491110325, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 195.8125, + "completions/mean_terminated_length": 195.8125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.43428149819374084, + "epoch": 0.6088495575221239, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9633281384458705, + "kl": 0.022706853225827217, + "learning_rate": 8.790218408180734e-07, + "loss": -0.0124, + "num_tokens": 7959218.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.5093923807144165, + "sampling/importance_sampling_ratio/mean": 0.9997909665107727, + "sampling/importance_sampling_ratio/min": 0.6833602786064148, + "sampling/sampling_logp_difference/max": 0.41170716285705566, + "sampling/sampling_logp_difference/mean": 0.015266085043549538, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 230.4375, + "completions/mean_terminated_length": 230.4375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.5112296342849731, + "epoch": 0.6106194690265486, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028634278924611033, + "kl": 0.032195795327425, + "learning_rate": 8.780126797452712e-07, + "loss": 0.0003, + "num_tokens": 7986686.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3169021606445312, + "sampling/importance_sampling_ratio/mean": 0.9998841881752014, + "sampling/importance_sampling_ratio/min": 0.6206711530685425, + "sampling/sampling_logp_difference/max": 0.47695398330688477, + "sampling/sampling_logp_difference/mean": 0.016089120879769325, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 277.65625, + "completions/mean_terminated_length": 277.65625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.48142367601394653, + "epoch": 0.6123893805309735, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0819649635422508, + "kl": 0.030770402401685715, + "learning_rate": 8.769999115252975e-07, + "loss": 0.009, + "num_tokens": 8015784.0, + "reward": 0.53125, + "reward_std": 0.48935678601264954, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.424164056777954, + "sampling/importance_sampling_ratio/mean": 0.9999499320983887, + "sampling/importance_sampling_ratio/min": 0.6622363924980164, + "sampling/sampling_logp_difference/max": 0.41213274002075195, + "sampling/sampling_logp_difference/mean": 0.014971626922488213, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 173.34375, + "completions/mean_terminated_length": 173.34375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.43033796548843384, + "epoch": 0.6141592920353982, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1118008381723494, + "kl": 0.036823272705078125, + "learning_rate": 8.759835458223887e-07, + "loss": 0.0226, + "num_tokens": 8037438.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.3416407108306885, + "sampling/importance_sampling_ratio/mean": 0.9997933506965637, + "sampling/importance_sampling_ratio/min": 0.6783764958381653, + "sampling/sampling_logp_difference/max": 0.3880528211593628, + "sampling/sampling_logp_difference/mean": 0.01579831913113594, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 225.734375, + "completions/mean_terminated_length": 225.734375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.528110682964325, + "epoch": 0.6159292035398231, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03858983418164625, + "kl": 0.038228943943977356, + "learning_rate": 8.749635923351106e-07, + "loss": 0.0004, + "num_tokens": 8063229.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4603391885757446, + "sampling/importance_sampling_ratio/mean": 1.0001890659332275, + "sampling/importance_sampling_ratio/min": 0.6262895464897156, + "sampling/sampling_logp_difference/max": 0.467942476272583, + "sampling/sampling_logp_difference/mean": 0.01644430309534073, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 226.9375, + "completions/mean_terminated_length": 226.9375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.4355280101299286, + "epoch": 0.6176991150442478, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8883931500274885, + "kl": 0.03403376042842865, + "learning_rate": 8.739400607962644e-07, + "loss": 0.0072, + "num_tokens": 8089033.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.3030188083648682, + "sampling/importance_sampling_ratio/mean": 1.000458002090454, + "sampling/importance_sampling_ratio/min": 0.6413252353668213, + "sampling/sampling_logp_difference/max": 0.44421863555908203, + "sampling/sampling_logp_difference/mean": 0.013998275622725487, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 177.265625, + "completions/mean_terminated_length": 177.265625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.2729702591896057, + "epoch": 0.6194690265486725, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027398615108238537, + "kl": 0.02054606005549431, + "learning_rate": 8.729129609727946e-07, + "loss": 0.0002, + "num_tokens": 8108938.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4834272861480713, + "sampling/importance_sampling_ratio/mean": 1.0010185241699219, + "sampling/importance_sampling_ratio/min": 0.5620219707489014, + "sampling/sampling_logp_difference/max": 0.5762143135070801, + "sampling/sampling_logp_difference/mean": 0.01200198009610176, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 213.515625, + "completions/mean_terminated_length": 213.515625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.5209338665008545, + "epoch": 0.6212389380530974, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2635862005627876, + "kl": 0.04583410546183586, + "learning_rate": 8.718823026656958e-07, + "loss": -0.0236, + "num_tokens": 8134235.0, + "reward": 0.65625, + "reward_std": 0.47978055477142334, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.6144993305206299, + "sampling/importance_sampling_ratio/mean": 0.9988173246383667, + "sampling/importance_sampling_ratio/min": 0.63479083776474, + "sampling/sampling_logp_difference/max": 0.47902488708496094, + "sampling/sampling_logp_difference/mean": 0.01699173077940941, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 215.1875, + "completions/mean_terminated_length": 215.1875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.46796682476997375, + "epoch": 0.6230088495575221, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9188342619639853, + "kl": 0.0415828675031662, + "learning_rate": 8.708480957099193e-07, + "loss": -0.0114, + "num_tokens": 8157383.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.3758742809295654, + "sampling/importance_sampling_ratio/mean": 1.0000108480453491, + "sampling/importance_sampling_ratio/min": 0.6447747945785522, + "sampling/sampling_logp_difference/max": 0.4388542175292969, + "sampling/sampling_logp_difference/mean": 0.015339828096330166, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 221.234375, + "completions/mean_terminated_length": 221.234375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.4628385305404663, + "epoch": 0.6247787610619469, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1133548655824281, + "kl": 0.040711820125579834, + "learning_rate": 8.698103499742783e-07, + "loss": -0.0059, + "num_tokens": 8181526.0, + "reward": 0.4375, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.4460031986236572, + "sampling/importance_sampling_ratio/mean": 1.0004066228866577, + "sampling/importance_sampling_ratio/min": 0.6387195587158203, + "sampling/sampling_logp_difference/max": 0.4482898712158203, + "sampling/sampling_logp_difference/mean": 0.01546501275151968, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 544.0, + "completions/max_terminated_length": 544.0, + "completions/mean_length": 201.96875, + "completions/mean_terminated_length": 201.96875, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.39816412329673767, + "epoch": 0.6265486725663717, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0763734372573512, + "kl": 0.029497426003217697, + "learning_rate": 8.687690753613554e-07, + "loss": -0.0271, + "num_tokens": 8206212.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.560969352722168, + "sampling/importance_sampling_ratio/mean": 1.0002760887145996, + "sampling/importance_sampling_ratio/min": 0.6801750063896179, + "sampling/sampling_logp_difference/max": 0.44530701637268066, + "sampling/sampling_logp_difference/mean": 0.014909004792571068, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 134.390625, + "completions/mean_terminated_length": 134.390625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.262448251247406, + "epoch": 0.6283185840707964, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05111182214123538, + "kl": 0.025020107626914978, + "learning_rate": 8.677242818074062e-07, + "loss": 0.0003, + "num_tokens": 8224557.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3412717580795288, + "sampling/importance_sampling_ratio/mean": 0.9999301433563232, + "sampling/importance_sampling_ratio/min": 0.6426113247871399, + "sampling/sampling_logp_difference/max": 0.4422152042388916, + "sampling/sampling_logp_difference/mean": 0.0129863191395998, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 217.0625, + "completions/mean_terminated_length": 217.0625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.4070281386375427, + "epoch": 0.6300884955752213, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1382826206720111, + "kl": 0.02475142851471901, + "learning_rate": 8.666759792822661e-07, + "loss": -0.0316, + "num_tokens": 8261377.0, + "reward": -0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.553475260734558, + "sampling/importance_sampling_ratio/mean": 1.000364065170288, + "sampling/importance_sampling_ratio/min": 0.6571546792984009, + "sampling/sampling_logp_difference/max": 0.4404945373535156, + "sampling/sampling_logp_difference/mean": 0.01452541071921587, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 902.0, + "completions/max_terminated_length": 902.0, + "completions/mean_length": 331.96875, + "completions/mean_terminated_length": 331.96875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.39330345392227173, + "epoch": 0.631858407079646, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02467953743492263, + "kl": 0.027912279590964317, + "learning_rate": 8.656241777892542e-07, + "loss": 0.0003, + "num_tokens": 8293039.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.547163963317871, + "sampling/importance_sampling_ratio/mean": 0.9996451139450073, + "sampling/importance_sampling_ratio/min": 0.636603593826294, + "sampling/sampling_logp_difference/max": 0.45160818099975586, + "sampling/sampling_logp_difference/mean": 0.011738335713744164, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 155.1875, + "completions/mean_terminated_length": 155.1875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.27595287561416626, + "epoch": 0.6336283185840708, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026142118969928976, + "kl": 0.018282443284988403, + "learning_rate": 8.645688873650784e-07, + "loss": 0.0002, + "num_tokens": 8312731.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6021919250488281, + "sampling/importance_sampling_ratio/mean": 1.0000102519989014, + "sampling/importance_sampling_ratio/min": 0.6061299443244934, + "sampling/sampling_logp_difference/max": 0.5006608963012695, + "sampling/sampling_logp_difference/mean": 0.012841126881539822, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 172.21875, + "completions/mean_terminated_length": 172.21875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.4368191361427307, + "epoch": 0.6353982300884956, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0767510877429516, + "kl": 0.0403921864926815, + "learning_rate": 8.63510118079739e-07, + "loss": -0.0141, + "num_tokens": 8333945.0, + "reward": 0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.2823578119277954, + "sampling/importance_sampling_ratio/mean": 0.9997091293334961, + "sampling/importance_sampling_ratio/min": 0.6186152696609497, + "sampling/sampling_logp_difference/max": 0.4802718162536621, + "sampling/sampling_logp_difference/mean": 0.015755271539092064, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 181.296875, + "completions/mean_terminated_length": 181.296875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.4091697931289673, + "epoch": 0.6371681415929203, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0322824050142376, + "kl": 0.03155152499675751, + "learning_rate": 8.624478800364331e-07, + "loss": -0.0314, + "num_tokens": 8358892.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5558624267578125, + "sampling/importance_sampling_ratio/mean": 0.9999942779541016, + "sampling/importance_sampling_ratio/min": 0.6341093182563782, + "sampling/sampling_logp_difference/max": 0.4555339813232422, + "sampling/sampling_logp_difference/mean": 0.015035311691462994, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 264.453125, + "completions/mean_terminated_length": 264.453125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.4108230769634247, + "epoch": 0.6389380530973451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6428188955893065, + "kl": 0.03412599489092827, + "learning_rate": 8.613821833714583e-07, + "loss": -0.0075, + "num_tokens": 8387337.0, + "reward": 0.375, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.4704291820526123, + "sampling/importance_sampling_ratio/mean": 1.0001084804534912, + "sampling/importance_sampling_ratio/min": 0.2541913092136383, + "sampling/sampling_logp_difference/max": 1.3696681261062622, + "sampling/sampling_logp_difference/mean": 0.013900469988584518, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1369.0, + "completions/max_terminated_length": 1369.0, + "completions/mean_length": 317.78125, + "completions/mean_terminated_length": 317.78125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.5605416297912598, + "epoch": 0.6407079646017699, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8708143773037155, + "kl": 0.04219198226928711, + "learning_rate": 8.603130382541155e-07, + "loss": 0.0043, + "num_tokens": 8421947.0, + "reward": 0.21875, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.4345859289169312, + "sampling/importance_sampling_ratio/mean": 0.9998886585235596, + "sampling/importance_sampling_ratio/min": 0.7422106862068176, + "sampling/sampling_logp_difference/max": 0.360876202583313, + "sampling/sampling_logp_difference/mean": 0.017081569880247116, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 763.0, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 313.640625, + "completions/mean_terminated_length": 313.640625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.43918853998184204, + "epoch": 0.6424778761061947, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6168597521682825, + "kl": 0.0383281372487545, + "learning_rate": 8.592404548866122e-07, + "loss": 0.0008, + "num_tokens": 8453940.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.4403674602508545, + "sampling/importance_sampling_ratio/mean": 1.0000804662704468, + "sampling/importance_sampling_ratio/min": 0.6487054824829102, + "sampling/sampling_logp_difference/max": 0.43277645111083984, + "sampling/sampling_logp_difference/mean": 0.013403661549091339, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 181.125, + "completions/mean_terminated_length": 181.125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.39058035612106323, + "epoch": 0.6442477876106195, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036823206988136556, + "kl": 0.033125072717666626, + "learning_rate": 8.58164443503965e-07, + "loss": 0.0004, + "num_tokens": 8477036.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4265888929367065, + "sampling/importance_sampling_ratio/mean": 0.9999474883079529, + "sampling/importance_sampling_ratio/min": 0.6711859703063965, + "sampling/sampling_logp_difference/max": 0.3987090587615967, + "sampling/sampling_logp_difference/mean": 0.014679424464702606, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 244.421875, + "completions/mean_terminated_length": 244.421875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.6227244734764099, + "epoch": 0.6460176991150443, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1229622924470637, + "kl": 0.05889035388827324, + "learning_rate": 8.570850143739021e-07, + "loss": -0.0174, + "num_tokens": 8505047.0, + "reward": -0.0625, + "reward_std": 0.5081988573074341, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.4740734100341797, + "sampling/importance_sampling_ratio/mean": 1.0005154609680176, + "sampling/importance_sampling_ratio/min": 0.7484645843505859, + "sampling/sampling_logp_difference/max": 0.3880295753479004, + "sampling/sampling_logp_difference/mean": 0.01888107880949974, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 169.1875, + "completions/mean_terminated_length": 169.1875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.4301868677139282, + "epoch": 0.647787610619469, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2027250960354323, + "kl": 0.042081568390131, + "learning_rate": 8.560021777967648e-07, + "loss": 0.0043, + "num_tokens": 8525699.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.6007353067398071, + "sampling/importance_sampling_ratio/mean": 1.000962495803833, + "sampling/importance_sampling_ratio/min": 0.6136767864227295, + "sampling/sampling_logp_difference/max": 0.48828697204589844, + "sampling/sampling_logp_difference/mean": 0.01758481003344059, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 286.9375, + "completions/mean_terminated_length": 286.9375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.4721408784389496, + "epoch": 0.6495575221238938, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8144029201387767, + "kl": 0.03806294500827789, + "learning_rate": 8.549159441054104e-07, + "loss": 0.0139, + "num_tokens": 8556655.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.3278887271881104, + "sampling/importance_sampling_ratio/mean": 1.0000855922698975, + "sampling/importance_sampling_ratio/min": 0.6211404800415039, + "sampling/sampling_logp_difference/max": 0.4761979579925537, + "sampling/sampling_logp_difference/mean": 0.015013427473604679, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 286.0, + "completions/mean_terminated_length": 286.0, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.42947787046432495, + "epoch": 0.6513274336283186, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9964883662934144, + "kl": 0.03491639345884323, + "learning_rate": 8.538263236651117e-07, + "loss": 0.047, + "num_tokens": 8586303.0, + "reward": 0.40625, + "reward_std": 0.4515564441680908, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.2862749099731445, + "sampling/importance_sampling_ratio/mean": 0.9998199939727783, + "sampling/importance_sampling_ratio/min": 0.6631476879119873, + "sampling/sampling_logp_difference/max": 0.41075754165649414, + "sampling/sampling_logp_difference/mean": 0.013806729577481747, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 273.84375, + "completions/mean_terminated_length": 273.84375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.5510585308074951, + "epoch": 0.6530973451327433, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0826589295928573, + "kl": 0.04940249025821686, + "learning_rate": 8.527333268734606e-07, + "loss": -0.0047, + "num_tokens": 8615221.0, + "reward": 0.75, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.3706663846969604, + "sampling/importance_sampling_ratio/mean": 1.0001425743103027, + "sampling/importance_sampling_ratio/min": 0.7628521919250488, + "sampling/sampling_logp_difference/max": 0.31529712677001953, + "sampling/sampling_logp_difference/mean": 0.01648496463894844, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.0, + "completions/max_terminated_length": 752.0, + "completions/mean_length": 367.125, + "completions/mean_terminated_length": 367.125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.5697993636131287, + "epoch": 0.6548672566371682, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7974082551907156, + "kl": 0.044508419930934906, + "learning_rate": 8.516369641602661e-07, + "loss": 0.0134, + "num_tokens": 8654493.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.354619026184082, + "sampling/importance_sampling_ratio/mean": 1.0000743865966797, + "sampling/importance_sampling_ratio/min": 0.6262628436088562, + "sampling/sampling_logp_difference/max": 0.4679851531982422, + "sampling/sampling_logp_difference/mean": 0.016434911638498306, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 352.453125, + "completions/mean_terminated_length": 352.453125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.38086485862731934, + "epoch": 0.6566371681415929, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6697272735790052, + "kl": 0.027844320982694626, + "learning_rate": 8.505372459874571e-07, + "loss": -0.0038, + "num_tokens": 8688570.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.5612245798110962, + "sampling/importance_sampling_ratio/mean": 0.9996849894523621, + "sampling/importance_sampling_ratio/min": 0.7282742857933044, + "sampling/sampling_logp_difference/max": 0.4454704523086548, + "sampling/sampling_logp_difference/mean": 0.01266162283718586, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 197.625, + "completions/mean_terminated_length": 197.625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.4918649196624756, + "epoch": 0.6584070796460177, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9628604908198412, + "kl": 0.041082967072725296, + "learning_rate": 8.494341828489812e-07, + "loss": 0.0088, + "num_tokens": 8711906.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4009339809417725, + "sampling/importance_sampling_ratio/mean": 0.9996036291122437, + "sampling/importance_sampling_ratio/min": 0.6651098132133484, + "sampling/sampling_logp_difference/max": 0.4078030586242676, + "sampling/sampling_logp_difference/mean": 0.01707690767943859, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 299.40625, + "completions/mean_terminated_length": 299.40625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.663341760635376, + "epoch": 0.6601769911504425, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.1956210092052122, + "kl": 0.05895734950900078, + "learning_rate": 8.483277852707052e-07, + "loss": 0.0174, + "num_tokens": 8744108.0, + "reward": -0.375, + "reward_std": 0.7581988573074341, + "rewards/decision_reward_func/mean": -0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.3005774021148682, + "sampling/importance_sampling_ratio/mean": 0.9997754096984863, + "sampling/importance_sampling_ratio/min": 0.7420384883880615, + "sampling/sampling_logp_difference/max": 0.2983541488647461, + "sampling/sampling_logp_difference/mean": 0.018573878332972527, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 517.0, + "completions/max_terminated_length": 517.0, + "completions/mean_length": 219.6875, + "completions/mean_terminated_length": 219.6875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.549524188041687, + "epoch": 0.6619469026548672, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2155712285760218, + "kl": 0.04660700261592865, + "learning_rate": 8.472180638103143e-07, + "loss": 0.0298, + "num_tokens": 8771112.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.4421229362487793, + "sampling/importance_sampling_ratio/mean": 1.0000449419021606, + "sampling/importance_sampling_ratio/min": 0.7006356716156006, + "sampling/sampling_logp_difference/max": 0.3661162853240967, + "sampling/sampling_logp_difference/mean": 0.017872437834739685, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 262.203125, + "completions/mean_terminated_length": 262.203125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.5852094292640686, + "epoch": 0.6637168141592921, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.3117610178807275, + "kl": 0.05262136831879616, + "learning_rate": 8.461050290572113e-07, + "loss": 0.0303, + "num_tokens": 8800069.0, + "reward": 0.78125, + "reward_std": 0.5281128883361816, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.3352786302566528, + "sampling/importance_sampling_ratio/mean": 0.9999169111251831, + "sampling/importance_sampling_ratio/min": 0.6927358508110046, + "sampling/sampling_logp_difference/max": 0.367106556892395, + "sampling/sampling_logp_difference/mean": 0.017396733164787292, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1007.0, + "completions/max_terminated_length": 1007.0, + "completions/mean_length": 246.453125, + "completions/mean_terminated_length": 246.453125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.38401997089385986, + "epoch": 0.6654867256637168, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7449451717953398, + "kl": 0.03761235252022743, + "learning_rate": 8.449886916324166e-07, + "loss": 0.0005, + "num_tokens": 8826594.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.6086597442626953, + "sampling/importance_sampling_ratio/mean": 1.000878930091858, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.4754014015197754, + "sampling/sampling_logp_difference/mean": 0.013770446181297302, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 143.421875, + "completions/mean_terminated_length": 143.421875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.3362015187740326, + "epoch": 0.6672566371681415, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2989116729881696, + "kl": 0.03373100236058235, + "learning_rate": 8.438690621884649e-07, + "loss": -0.0125, + "num_tokens": 8844765.0, + "reward": 0.21875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.4606289863586426, + "sampling/importance_sampling_ratio/mean": 0.9999160170555115, + "sampling/importance_sampling_ratio/min": 0.6257045269012451, + "sampling/sampling_logp_difference/max": 0.46887707710266113, + "sampling/sampling_logp_difference/mean": 0.01427266001701355, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 163.3125, + "completions/mean_terminated_length": 163.3125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.3471616804599762, + "epoch": 0.6690265486725664, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1550938859904358, + "kl": 0.03041689656674862, + "learning_rate": 8.427461514093055e-07, + "loss": -0.0106, + "num_tokens": 8865377.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.313628911972046, + "sampling/importance_sampling_ratio/mean": 0.9998461604118347, + "sampling/importance_sampling_ratio/min": 0.6218625903129578, + "sampling/sampling_logp_difference/max": 0.4750361442565918, + "sampling/sampling_logp_difference/mean": 0.014026266522705555, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 230.375, + "completions/mean_terminated_length": 230.375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.45404207706451416, + "epoch": 0.6707964601769911, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1282184976345067, + "kl": 0.028066381812095642, + "learning_rate": 8.41619970010199e-07, + "loss": -0.0127, + "num_tokens": 8890969.0, + "reward": 0.28125, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.2957441806793213, + "sampling/importance_sampling_ratio/mean": 1.0000749826431274, + "sampling/importance_sampling_ratio/min": 0.6950652003288269, + "sampling/sampling_logp_difference/max": 0.363749623298645, + "sampling/sampling_logp_difference/mean": 0.015464250929653645, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 166.359375, + "completions/mean_terminated_length": 166.359375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.4227294921875, + "epoch": 0.672566371681416, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9049689389200966, + "kl": 0.03660879656672478, + "learning_rate": 8.404905287376157e-07, + "loss": 0.0145, + "num_tokens": 8912064.0, + "reward": 0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.3383833169937134, + "sampling/importance_sampling_ratio/mean": 0.9998734593391418, + "sampling/importance_sampling_ratio/min": 0.6398648023605347, + "sampling/sampling_logp_difference/max": 0.44649839401245117, + "sampling/sampling_logp_difference/mean": 0.016387417912483215, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 158.6875, + "completions/mean_terminated_length": 158.6875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.37693119049072266, + "epoch": 0.6743362831858407, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02550021586580856, + "kl": 0.0251535065472126, + "learning_rate": 8.393578383691328e-07, + "loss": 0.0003, + "num_tokens": 8933004.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3655242919921875, + "sampling/importance_sampling_ratio/mean": 0.999870777130127, + "sampling/importance_sampling_ratio/min": 0.7550402879714966, + "sampling/sampling_logp_difference/max": 0.3115384578704834, + "sampling/sampling_logp_difference/mean": 0.014674804173409939, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 148.234375, + "completions/mean_terminated_length": 148.234375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.45803314447402954, + "epoch": 0.6761061946902654, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9489645812081653, + "kl": 0.03578249737620354, + "learning_rate": 8.382219097133323e-07, + "loss": 0.0026, + "num_tokens": 8953259.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.621260404586792, + "sampling/importance_sampling_ratio/mean": 0.9990768432617188, + "sampling/importance_sampling_ratio/min": 0.714861273765564, + "sampling/sampling_logp_difference/max": 0.4832038879394531, + "sampling/sampling_logp_difference/mean": 0.01736932247877121, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 182.34375, + "completions/mean_terminated_length": 182.34375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.40728506445884705, + "epoch": 0.6778761061946903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02369537274113039, + "kl": 0.024733269587159157, + "learning_rate": 8.370827536096964e-07, + "loss": 0.0003, + "num_tokens": 8975265.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5361980199813843, + "sampling/importance_sampling_ratio/mean": 0.9993815422058105, + "sampling/importance_sampling_ratio/min": 0.6953165531158447, + "sampling/sampling_logp_difference/max": 0.42931056022644043, + "sampling/sampling_logp_difference/mean": 0.015199595130980015, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 121.671875, + "completions/mean_terminated_length": 121.671875, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.4286119341850281, + "epoch": 0.679646017699115, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3224448549975067, + "kl": 0.03849584236741066, + "learning_rate": 8.359403809285053e-07, + "loss": -0.0127, + "num_tokens": 8994108.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.4355621337890625, + "sampling/importance_sampling_ratio/mean": 0.9998904466629028, + "sampling/importance_sampling_ratio/min": 0.6399005651473999, + "sampling/sampling_logp_difference/max": 0.44644248485565186, + "sampling/sampling_logp_difference/mean": 0.017456576228141785, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 107.328125, + "completions/mean_terminated_length": 107.328125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.4205324053764343, + "epoch": 0.6814159292035398, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.035414373865660514, + "kl": 0.032268770039081573, + "learning_rate": 8.347948025707329e-07, + "loss": 0.0003, + "num_tokens": 9013777.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3150197267532349, + "sampling/importance_sampling_ratio/mean": 0.9994502663612366, + "sampling/importance_sampling_ratio/min": 0.6689095497131348, + "sampling/sampling_logp_difference/max": 0.4021064043045044, + "sampling/sampling_logp_difference/mean": 0.017202619463205338, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 190.765625, + "completions/mean_terminated_length": 190.765625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.46052759885787964, + "epoch": 0.6831858407079646, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9175729001782233, + "kl": 0.035584956407547, + "learning_rate": 8.336460294679431e-07, + "loss": 0.1104, + "num_tokens": 9036690.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.8059618473052979, + "sampling/importance_sampling_ratio/mean": 1.0006980895996094, + "sampling/importance_sampling_ratio/min": 0.7699272036552429, + "sampling/sampling_logp_difference/max": 0.5910933017730713, + "sampling/sampling_logp_difference/mean": 0.016931939870119095, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 174.765625, + "completions/mean_terminated_length": 174.765625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 0.5552585124969482, + "epoch": 0.6849557522123894, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.788329312938343, + "kl": 0.04846896976232529, + "learning_rate": 8.324940725821852e-07, + "loss": -0.0461, + "num_tokens": 9059203.0, + "reward": 0.125, + "reward_std": 0.5351393222808838, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.4157276153564453, + "sampling/importance_sampling_ratio/mean": 1.0004475116729736, + "sampling/importance_sampling_ratio/min": 0.7033500671386719, + "sampling/sampling_logp_difference/max": 0.351900577545166, + "sampling/sampling_logp_difference/mean": 0.018314823508262634, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 138.09375, + "completions/mean_terminated_length": 138.09375, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "entropy": 0.45556119084358215, + "epoch": 0.6867256637168142, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.039107353994337235, + "kl": 0.03828766942024231, + "learning_rate": 8.313389429058895e-07, + "loss": 0.0005, + "num_tokens": 9079785.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4468210935592651, + "sampling/importance_sampling_ratio/mean": 0.9999643564224243, + "sampling/importance_sampling_ratio/min": 0.6927420496940613, + "sampling/sampling_logp_difference/max": 0.3693687915802002, + "sampling/sampling_logp_difference/mean": 0.017646033316850662, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 298.0, + "completions/max_terminated_length": 298.0, + "completions/mean_length": 146.265625, + "completions/mean_terminated_length": 146.265625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.5707560777664185, + "epoch": 0.6884955752212389, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4928868726004139, + "kl": 0.05419418215751648, + "learning_rate": 8.30180651461762e-07, + "loss": -0.0125, + "num_tokens": 9103706.0, + "reward": -0.0625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": -0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.440291166305542, + "sampling/importance_sampling_ratio/mean": 1.0000207424163818, + "sampling/importance_sampling_ratio/min": 0.7370258569717407, + "sampling/sampling_logp_difference/max": 0.36484527587890625, + "sampling/sampling_logp_difference/mean": 0.018560871481895447, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 154.09375, + "completions/mean_terminated_length": 154.09375, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "entropy": 0.5176286101341248, + "epoch": 0.6902654867256637, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1155849207511874, + "kl": 0.04577094689011574, + "learning_rate": 8.290192093026805e-07, + "loss": -0.0055, + "num_tokens": 9126128.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.4447511434555054, + "sampling/importance_sampling_ratio/mean": 1.0000211000442505, + "sampling/importance_sampling_ratio/min": 0.6860484480857849, + "sampling/sampling_logp_difference/max": 0.37680697441101074, + "sampling/sampling_logp_difference/mean": 0.016872867941856384, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 141.9375, + "completions/mean_terminated_length": 141.9375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.4314468502998352, + "epoch": 0.6920353982300885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03694870082996628, + "kl": 0.036875467747449875, + "learning_rate": 8.278546275115869e-07, + "loss": 0.0004, + "num_tokens": 9145948.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5289959907531738, + "sampling/importance_sampling_ratio/mean": 0.9996025562286377, + "sampling/importance_sampling_ratio/min": 0.5001392960548401, + "sampling/sampling_logp_difference/max": 0.692868709564209, + "sampling/sampling_logp_difference/mean": 0.01830694079399109, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 196.015625, + "completions/mean_terminated_length": 196.015625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.5099686980247498, + "epoch": 0.6938053097345133, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.4299765430530396, + "kl": 0.04674728214740753, + "learning_rate": 8.266869172013835e-07, + "loss": 0.0203, + "num_tokens": 9168909.0, + "reward": 0.78125, + "reward_std": 0.5281128883361816, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.4223401546478271, + "sampling/importance_sampling_ratio/mean": 0.9999047517776489, + "sampling/importance_sampling_ratio/min": 0.7822982668876648, + "sampling/sampling_logp_difference/max": 0.35230350494384766, + "sampling/sampling_logp_difference/mean": 0.016416629776358604, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 118.421875, + "completions/mean_terminated_length": 118.421875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 0.43823981285095215, + "epoch": 0.695575221238938, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3434754243547569, + "kl": 0.05030140280723572, + "learning_rate": 8.255160895148262e-07, + "loss": -0.002, + "num_tokens": 9186840.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.331215262413025, + "sampling/importance_sampling_ratio/mean": 0.9995092153549194, + "sampling/importance_sampling_ratio/min": 0.7431110143661499, + "sampling/sampling_logp_difference/max": 0.29690980911254883, + "sampling/sampling_logp_difference/mean": 0.01721595600247383, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 126.15625, + "completions/mean_terminated_length": 126.15625, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "entropy": 0.36454033851623535, + "epoch": 0.6973451327433628, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2443335047884587, + "kl": 0.04676324874162674, + "learning_rate": 8.243421556244178e-07, + "loss": -0.0285, + "num_tokens": 9205954.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.3202030658721924, + "sampling/importance_sampling_ratio/mean": 1.0000194311141968, + "sampling/importance_sampling_ratio/min": 0.7287810444831848, + "sampling/sampling_logp_difference/max": 0.31638193130493164, + "sampling/sampling_logp_difference/mean": 0.014255122281610966, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 132.3125, + "completions/mean_terminated_length": 132.3125, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.3857053518295288, + "epoch": 0.6991150442477876, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036256310135995104, + "kl": 0.0411846861243248, + "learning_rate": 8.231651267323018e-07, + "loss": 0.0004, + "num_tokens": 9223254.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4263430833816528, + "sampling/importance_sampling_ratio/mean": 1.0001716613769531, + "sampling/importance_sampling_ratio/min": 0.7204696536064148, + "sampling/sampling_logp_difference/max": 0.3551138639450073, + "sampling/sampling_logp_difference/mean": 0.014991642907261848, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 148.0, + "completions/max_terminated_length": 148.0, + "completions/mean_length": 99.78125, + "completions/mean_terminated_length": 99.78125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.29990869760513306, + "epoch": 0.7008849557522124, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047019865250950525, + "kl": 0.03183341026306152, + "learning_rate": 8.219850140701556e-07, + "loss": 0.0003, + "num_tokens": 9239544.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5087276697158813, + "sampling/importance_sampling_ratio/mean": 1.000213623046875, + "sampling/importance_sampling_ratio/min": 0.626640260219574, + "sampling/sampling_logp_difference/max": 0.46738266944885254, + "sampling/sampling_logp_difference/mean": 0.015097476541996002, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 202.0, + "completions/max_terminated_length": 202.0, + "completions/mean_length": 109.75, + "completions/mean_terminated_length": 109.75, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.4533824026584625, + "epoch": 0.7026548672566372, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4732923471056585, + "kl": 0.04384038597345352, + "learning_rate": 8.208018288990831e-07, + "loss": 0.0008, + "num_tokens": 9257400.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.7252179384231567, + "sampling/importance_sampling_ratio/mean": 0.9997067451477051, + "sampling/importance_sampling_ratio/min": 0.7307789325714111, + "sampling/sampling_logp_difference/max": 0.5453534126281738, + "sampling/sampling_logp_difference/mean": 0.016809294000267982, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 359.0, + "completions/max_terminated_length": 359.0, + "completions/mean_length": 160.46875, + "completions/mean_terminated_length": 160.46875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.46921366453170776, + "epoch": 0.7044247787610619, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.357055443061283, + "kl": 0.04166853800415993, + "learning_rate": 8.196155825095072e-07, + "loss": 0.0085, + "num_tokens": 9278518.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.536620020866394, + "sampling/importance_sampling_ratio/mean": 1.0007810592651367, + "sampling/importance_sampling_ratio/min": 0.697200357913971, + "sampling/sampling_logp_difference/max": 0.42958521842956543, + "sampling/sampling_logp_difference/mean": 0.016781406477093697, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 160.96875, + "completions/mean_terminated_length": 160.96875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.3876239061355591, + "epoch": 0.7061946902654868, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4759216717531352, + "kl": 0.04244627803564072, + "learning_rate": 8.184262862210624e-07, + "loss": 0.0385, + "num_tokens": 9299732.0, + "reward": 0.90625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.5744906663894653, + "sampling/importance_sampling_ratio/mean": 0.9994622468948364, + "sampling/importance_sampling_ratio/min": 0.6394615769386292, + "sampling/sampling_logp_difference/max": 0.4539318084716797, + "sampling/sampling_logp_difference/mean": 0.015496197156608105, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 125.984375, + "completions/mean_terminated_length": 125.984375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.3927186131477356, + "epoch": 0.7079646017699115, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5864410065005021, + "kl": 0.037036724388599396, + "learning_rate": 8.172339513824862e-07, + "loss": 0.0137, + "num_tokens": 9322035.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.3844969272613525, + "sampling/importance_sampling_ratio/mean": 1.0005230903625488, + "sampling/importance_sampling_ratio/min": 0.7229098677635193, + "sampling/sampling_logp_difference/max": 0.3253368139266968, + "sampling/sampling_logp_difference/mean": 0.014183755964040756, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 167.375, + "completions/mean_terminated_length": 167.375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.5327609181404114, + "epoch": 0.7097345132743362, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5360476310515012, + "kl": 0.04352780431509018, + "learning_rate": 8.160385893715112e-07, + "loss": 0.0377, + "num_tokens": 9343595.0, + "reward": 0.375, + "reward_std": 0.34156501293182373, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.5233222246170044, + "sampling/importance_sampling_ratio/mean": 1.0008280277252197, + "sampling/importance_sampling_ratio/min": 0.6530730128288269, + "sampling/sampling_logp_difference/max": 0.42606639862060547, + "sampling/sampling_logp_difference/mean": 0.01846783608198166, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 124.078125, + "completions/mean_terminated_length": 124.078125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 0.3296247124671936, + "epoch": 0.7115044247787611, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1383720765457108, + "kl": 0.03181477636098862, + "learning_rate": 8.14840211594757e-07, + "loss": 0.0082, + "num_tokens": 9361040.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.298300862312317, + "sampling/importance_sampling_ratio/mean": 0.9994966983795166, + "sampling/importance_sampling_ratio/min": 0.7810361981391907, + "sampling/sampling_logp_difference/max": 0.26105642318725586, + "sampling/sampling_logp_difference/mean": 0.013512702658772469, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 305.0, + "completions/max_terminated_length": 305.0, + "completions/mean_length": 133.34375, + "completions/mean_terminated_length": 133.34375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.4810064435005188, + "epoch": 0.7132743362831858, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3724334344488918, + "kl": 0.05646929517388344, + "learning_rate": 8.136388294876202e-07, + "loss": 0.004, + "num_tokens": 9380198.0, + "reward": 0.65625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.4132767915725708, + "sampling/importance_sampling_ratio/mean": 0.9991341233253479, + "sampling/importance_sampling_ratio/min": 0.643557071685791, + "sampling/sampling_logp_difference/max": 0.44074463844299316, + "sampling/sampling_logp_difference/mean": 0.017204541712999344, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 119.796875, + "completions/mean_terminated_length": 119.796875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.40257346630096436, + "epoch": 0.7150442477876107, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5563109733512384, + "kl": 0.03649323433637619, + "learning_rate": 8.124344545141661e-07, + "loss": 0.0446, + "num_tokens": 9403401.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.6273202896118164, + "sampling/importance_sampling_ratio/mean": 0.9993991255760193, + "sampling/importance_sampling_ratio/min": 0.6329724788665771, + "sampling/sampling_logp_difference/max": 0.4869346618652344, + "sampling/sampling_logp_difference/mean": 0.01654677465558052, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 182.921875, + "completions/mean_terminated_length": 182.921875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.5903284549713135, + "epoch": 0.7168141592920354, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9608264057391309, + "kl": 0.05380180850625038, + "learning_rate": 8.112270981670195e-07, + "loss": 0.0267, + "num_tokens": 9430356.0, + "reward": 0.4375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.3267014026641846, + "sampling/importance_sampling_ratio/mean": 0.9999366998672485, + "sampling/importance_sampling_ratio/min": 0.6254509687423706, + "sampling/sampling_logp_difference/max": 0.4692823886871338, + "sampling/sampling_logp_difference/mean": 0.018537428230047226, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 117.609375, + "completions/mean_terminated_length": 117.609375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.3920556604862213, + "epoch": 0.7185840707964601, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03612522798777008, + "kl": 0.04312152415513992, + "learning_rate": 8.10016771967254e-07, + "loss": 0.0005, + "num_tokens": 9448315.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3156917095184326, + "sampling/importance_sampling_ratio/mean": 0.9998875260353088, + "sampling/importance_sampling_ratio/min": 0.6056219339370728, + "sampling/sampling_logp_difference/max": 0.5014994144439697, + "sampling/sampling_logp_difference/mean": 0.016677189618349075, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 187.65625, + "completions/mean_terminated_length": 187.65625, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "entropy": 0.48553669452667236, + "epoch": 0.720353982300885, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.222016051787163, + "kl": 0.04862046241760254, + "learning_rate": 8.088034874642833e-07, + "loss": -0.0122, + "num_tokens": 9470517.0, + "reward": -0.65625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": -0.65625, + "rewards/decision_reward_func/std": 0.7605084180831909, + "sampling/importance_sampling_ratio/max": 1.3227328062057495, + "sampling/importance_sampling_ratio/mean": 1.000306248664856, + "sampling/importance_sampling_ratio/min": 0.7772423028945923, + "sampling/sampling_logp_difference/max": 0.2796999216079712, + "sampling/sampling_logp_difference/mean": 0.016164880245923996, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 226.0, + "completions/max_terminated_length": 226.0, + "completions/mean_length": 121.640625, + "completions/mean_terminated_length": 121.640625, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.41761600971221924, + "epoch": 0.7221238938053097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03810339140122863, + "kl": 0.04997194930911064, + "learning_rate": 8.0758725623575e-07, + "loss": 0.0005, + "num_tokens": 9488990.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2934409379959106, + "sampling/importance_sampling_ratio/mean": 1.0001788139343262, + "sampling/importance_sampling_ratio/min": 0.735434353351593, + "sampling/sampling_logp_difference/max": 0.30729401111602783, + "sampling/sampling_logp_difference/mean": 0.015452738851308823, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 160.609375, + "completions/mean_terminated_length": 160.609375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.49578094482421875, + "epoch": 0.7238938053097345, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4227949776008897, + "kl": 0.057519737631082535, + "learning_rate": 8.063680898874157e-07, + "loss": -0.0002, + "num_tokens": 9511509.0, + "reward": 0.9375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.2921876907348633, + "sampling/importance_sampling_ratio/mean": 0.9999642372131348, + "sampling/importance_sampling_ratio/min": 0.7029215693473816, + "sampling/sampling_logp_difference/max": 0.3525099754333496, + "sampling/sampling_logp_difference/mean": 0.015130658634006977, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 113.078125, + "completions/mean_terminated_length": 113.078125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.3672482967376709, + "epoch": 0.7256637168141593, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04104218369204801, + "kl": 0.039082761853933334, + "learning_rate": 8.051460000530501e-07, + "loss": 0.0004, + "num_tokens": 9528218.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.295780897140503, + "sampling/importance_sampling_ratio/mean": 0.999136745929718, + "sampling/importance_sampling_ratio/min": 0.6808854341506958, + "sampling/sampling_logp_difference/max": 0.38436126708984375, + "sampling/sampling_logp_difference/mean": 0.016387619078159332, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 144.671875, + "completions/mean_terminated_length": 144.671875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.2750190794467926, + "epoch": 0.727433628318584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02738721989729678, + "kl": 0.03227146714925766, + "learning_rate": 8.039209983943201e-07, + "loss": 0.0003, + "num_tokens": 9547013.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3791532516479492, + "sampling/importance_sampling_ratio/mean": 0.9994186162948608, + "sampling/importance_sampling_ratio/min": 0.7481702566146851, + "sampling/sampling_logp_difference/max": 0.32146966457366943, + "sampling/sampling_logp_difference/mean": 0.012029212899506092, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 240.0, + "completions/max_terminated_length": 240.0, + "completions/mean_length": 126.3125, + "completions/mean_terminated_length": 126.3125, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.5413132905960083, + "epoch": 0.7292035398230089, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04237971922838883, + "kl": 0.07601577043533325, + "learning_rate": 8.026930966006778e-07, + "loss": 0.0008, + "num_tokens": 9567369.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5532801151275635, + "sampling/importance_sampling_ratio/mean": 0.9999905824661255, + "sampling/importance_sampling_ratio/min": 0.6942715644836426, + "sampling/sampling_logp_difference/max": 0.4403688907623291, + "sampling/sampling_logp_difference/mean": 0.017333954572677612, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 132.078125, + "completions/mean_terminated_length": 132.078125, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "entropy": 0.46055909991264343, + "epoch": 0.7309734513274336, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2854979780959792, + "kl": 0.05208270251750946, + "learning_rate": 8.014623063892503e-07, + "loss": -0.0066, + "num_tokens": 9588606.0, + "reward": -0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.306519627571106, + "sampling/importance_sampling_ratio/mean": 0.999424934387207, + "sampling/importance_sampling_ratio/min": 0.608730673789978, + "sampling/sampling_logp_difference/max": 0.49637937545776367, + "sampling/sampling_logp_difference/mean": 0.01695268228650093, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 180.5, + "completions/mean_terminated_length": 180.5, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.6001425981521606, + "epoch": 0.7327433628318584, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.606530499810133, + "kl": 0.06114194542169571, + "learning_rate": 8.002286395047266e-07, + "loss": -0.0075, + "num_tokens": 9617982.0, + "reward": 0.28125, + "reward_std": 0.5457825064659119, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.288370132446289, + "sampling/importance_sampling_ratio/mean": 0.9996581673622131, + "sampling/importance_sampling_ratio/min": 0.7591173052787781, + "sampling/sampling_logp_difference/max": 0.27559900283813477, + "sampling/sampling_logp_difference/mean": 0.01883193850517273, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 147.359375, + "completions/mean_terminated_length": 147.359375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.5782200694084167, + "epoch": 0.7345132743362832, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6151609782878316, + "kl": 0.061108771711587906, + "learning_rate": 7.989921077192463e-07, + "loss": -0.0003, + "num_tokens": 9644149.0, + "reward": 0.53125, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5092495679855347, + "sampling/importance_sampling_ratio/mean": 0.9999904632568359, + "sampling/importance_sampling_ratio/min": 0.6828128099441528, + "sampling/sampling_logp_difference/max": 0.41161251068115234, + "sampling/sampling_logp_difference/mean": 0.018274560570716858, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 171.65625, + "completions/mean_terminated_length": 171.65625, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.37550604343414307, + "epoch": 0.736283185840708, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0364107658945485, + "kl": 0.05150478333234787, + "learning_rate": 7.97752722832287e-07, + "loss": 0.0202, + "num_tokens": 9667007.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.2921267747879028, + "sampling/importance_sampling_ratio/mean": 1.0000576972961426, + "sampling/importance_sampling_ratio/min": 0.7077885866165161, + "sampling/sampling_logp_difference/max": 0.3456099033355713, + "sampling/sampling_logp_difference/mean": 0.013839101418852806, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 129.40625, + "completions/mean_terminated_length": 129.40625, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.48952698707580566, + "epoch": 0.7380530973451327, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3150585729508277, + "kl": 0.05840027332305908, + "learning_rate": 7.965104966705517e-07, + "loss": 0.0002, + "num_tokens": 9687401.0, + "reward": 0.75, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.4359703063964844, + "sampling/importance_sampling_ratio/mean": 1.0009379386901855, + "sampling/importance_sampling_ratio/min": 0.771267294883728, + "sampling/sampling_logp_difference/max": 0.36184072494506836, + "sampling/sampling_logp_difference/mean": 0.017398536205291748, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 133.34375, + "completions/mean_terminated_length": 133.34375, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.5486599802970886, + "epoch": 0.7398230088495575, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1852547391700583, + "kl": 0.0636061578989029, + "learning_rate": 7.952654410878558e-07, + "loss": -0.0088, + "num_tokens": 9708095.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.5659292936325073, + "sampling/importance_sampling_ratio/mean": 1.0008774995803833, + "sampling/importance_sampling_ratio/min": 0.7863955497741699, + "sampling/sampling_logp_difference/max": 0.44847941398620605, + "sampling/sampling_logp_difference/mean": 0.016996072605252266, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 133.0625, + "completions/mean_terminated_length": 133.0625, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "entropy": 0.5455623865127563, + "epoch": 0.7415929203539823, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1828438991449177, + "kl": 0.07124529778957367, + "learning_rate": 7.940175679650145e-07, + "loss": 0.0085, + "num_tokens": 9729603.0, + "reward": 0.0625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.0625, + "rewards/decision_reward_func/std": 1.0059348344802856, + "sampling/importance_sampling_ratio/max": 1.2635778188705444, + "sampling/importance_sampling_ratio/mean": 0.9996267557144165, + "sampling/importance_sampling_ratio/min": 0.778712272644043, + "sampling/sampling_logp_difference/max": 0.25011372566223145, + "sampling/sampling_logp_difference/mean": 0.016824547201395035, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 145.5, + "completions/mean_terminated_length": 145.5, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.34660637378692627, + "epoch": 0.7433628318584071, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0312409799423157, + "kl": 0.038545429706573486, + "learning_rate": 7.927668892097288e-07, + "loss": 0.0003, + "num_tokens": 9748435.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3718312978744507, + "sampling/importance_sampling_ratio/mean": 0.9999306201934814, + "sampling/importance_sampling_ratio/min": 0.7005653381347656, + "sampling/sampling_logp_difference/max": 0.3558676242828369, + "sampling/sampling_logp_difference/mean": 0.014107013121247292, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 181.0, + "completions/max_terminated_length": 181.0, + "completions/mean_length": 98.859375, + "completions/mean_terminated_length": 98.859375, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "entropy": 0.2662732005119324, + "epoch": 0.7451327433628319, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.040150729530619926, + "kl": 0.03974389284849167, + "learning_rate": 7.915134167564723e-07, + "loss": 0.0004, + "num_tokens": 9764202.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.362242579460144, + "sampling/importance_sampling_ratio/mean": 0.9992560148239136, + "sampling/importance_sampling_ratio/min": 0.36203813552856445, + "sampling/sampling_logp_difference/max": 1.0160057544708252, + "sampling/sampling_logp_difference/mean": 0.013069668784737587, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 99.25, + "completions/mean_terminated_length": 99.25, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.3316885828971863, + "epoch": 0.7469026548672566, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05377375733084318, + "kl": 0.038888368755578995, + "learning_rate": 7.902571625663772e-07, + "loss": 0.0004, + "num_tokens": 9780138.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3458665609359741, + "sampling/importance_sampling_ratio/mean": 1.0000932216644287, + "sampling/importance_sampling_ratio/min": 0.660549521446228, + "sampling/sampling_logp_difference/max": 0.4146832227706909, + "sampling/sampling_logp_difference/mean": 0.015012386254966259, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 136.453125, + "completions/mean_terminated_length": 136.453125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.2856818437576294, + "epoch": 0.7486725663716814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05363430229119572, + "kl": 0.03845176845788956, + "learning_rate": 7.8899813862712e-07, + "loss": 0.0004, + "num_tokens": 9797879.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4129092693328857, + "sampling/importance_sampling_ratio/mean": 0.999391496181488, + "sampling/importance_sampling_ratio/min": 0.6080278754234314, + "sampling/sampling_logp_difference/max": 0.49753451347351074, + "sampling/sampling_logp_difference/mean": 0.011145094409584999, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 201.0, + "completions/max_terminated_length": 201.0, + "completions/mean_length": 121.828125, + "completions/mean_terminated_length": 121.828125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.4224778413772583, + "epoch": 0.7504424778761062, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.038346174136042306, + "kl": 0.05765840411186218, + "learning_rate": 7.877363569528075e-07, + "loss": 0.0005, + "num_tokens": 9815260.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.384250521659851, + "sampling/importance_sampling_ratio/mean": 1.0004199743270874, + "sampling/importance_sampling_ratio/min": 0.7318248152732849, + "sampling/sampling_logp_difference/max": 0.32515883445739746, + "sampling/sampling_logp_difference/mean": 0.015470411628484726, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 188.0, + "completions/max_terminated_length": 188.0, + "completions/mean_length": 113.9375, + "completions/mean_terminated_length": 113.9375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.35458582639694214, + "epoch": 0.7522123893805309, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03858518652079706, + "kl": 0.042834073305130005, + "learning_rate": 7.864718295838614e-07, + "loss": 0.0004, + "num_tokens": 9833256.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.295134425163269, + "sampling/importance_sampling_ratio/mean": 1.0006794929504395, + "sampling/importance_sampling_ratio/min": 0.6788650155067444, + "sampling/sampling_logp_difference/max": 0.3873330354690552, + "sampling/sampling_logp_difference/mean": 0.013968059793114662, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 196.34375, + "completions/mean_terminated_length": 196.34375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.5816829204559326, + "epoch": 0.7539823008849558, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.251531006828788, + "kl": 0.061315182596445084, + "learning_rate": 7.852045685869044e-07, + "loss": 0.0473, + "num_tokens": 9860062.0, + "reward": 0.8125, + "reward_std": 0.3943893015384674, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0004279613494873, + "sampling/importance_sampling_ratio/min": 0.6298375129699707, + "sampling/sampling_logp_difference/max": 0.8718147277832031, + "sampling/sampling_logp_difference/mean": 0.017862189561128616, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 130.0, + "completions/max_terminated_length": 130.0, + "completions/mean_length": 91.1875, + "completions/mean_terminated_length": 91.1875, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "entropy": 0.3042639493942261, + "epoch": 0.7557522123893805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.051794052447220504, + "kl": 0.0483333058655262, + "learning_rate": 7.839345860546447e-07, + "loss": 0.0005, + "num_tokens": 9875098.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4351999759674072, + "sampling/importance_sampling_ratio/mean": 1.000496745109558, + "sampling/importance_sampling_ratio/min": 0.6453400254249573, + "sampling/sampling_logp_difference/max": 0.43797802925109863, + "sampling/sampling_logp_difference/mean": 0.014278100803494453, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 228.953125, + "completions/mean_terminated_length": 228.953125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.43442392349243164, + "epoch": 0.7575221238938054, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6040956124876707, + "kl": 0.04339404031634331, + "learning_rate": 7.826618941057597e-07, + "loss": -0.0081, + "num_tokens": 9901031.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.344829797744751, + "sampling/importance_sampling_ratio/mean": 0.9997045993804932, + "sampling/importance_sampling_ratio/min": 0.7581149339675903, + "sampling/sampling_logp_difference/max": 0.29626739025115967, + "sampling/sampling_logp_difference/mean": 0.014257797971367836, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 144.875, + "completions/mean_terminated_length": 144.875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.4813069701194763, + "epoch": 0.7592920353982301, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5604049958307447, + "kl": 0.0463547520339489, + "learning_rate": 7.813865048847818e-07, + "loss": 0.0146, + "num_tokens": 9921391.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6355764865875244, + "sampling/importance_sampling_ratio/mean": 1.0004425048828125, + "sampling/importance_sampling_ratio/min": 0.6171419620513916, + "sampling/sampling_logp_difference/max": 0.49199533462524414, + "sampling/sampling_logp_difference/mean": 0.01544689480215311, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 186.9375, + "completions/mean_terminated_length": 186.9375, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.5194053649902344, + "epoch": 0.7610619469026548, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7701415244045536, + "kl": 0.062382422387599945, + "learning_rate": 7.801084305619818e-07, + "loss": 0.014, + "num_tokens": 9946155.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.811177134513855, + "sampling/importance_sampling_ratio/mean": 1.0002188682556152, + "sampling/importance_sampling_ratio/min": 0.6984061598777771, + "sampling/sampling_logp_difference/max": 0.5939769744873047, + "sampling/sampling_logp_difference/mean": 0.017058037221431732, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 133.421875, + "completions/mean_terminated_length": 133.421875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.32108262181282043, + "epoch": 0.7628318584070797, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02278105974564533, + "kl": 0.023188531398773193, + "learning_rate": 7.788276833332525e-07, + "loss": 0.0002, + "num_tokens": 9964166.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3770564794540405, + "sampling/importance_sampling_ratio/mean": 1.000881314277649, + "sampling/importance_sampling_ratio/min": 0.6873236894607544, + "sampling/sampling_logp_difference/max": 0.37494993209838867, + "sampling/sampling_logp_difference/mean": 0.012426692992448807, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 198.578125, + "completions/mean_terminated_length": 198.578125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.5111083984375, + "epoch": 0.7646017699115044, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2206219434768952, + "kl": 0.05128113925457001, + "learning_rate": 7.775442754199928e-07, + "loss": 0.0149, + "num_tokens": 9989691.0, + "reward": 0.40625, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.65772545337677, + "sampling/importance_sampling_ratio/mean": 0.9998399615287781, + "sampling/importance_sampling_ratio/min": 0.6675968170166016, + "sampling/sampling_logp_difference/max": 0.5054464340209961, + "sampling/sampling_logp_difference/mean": 0.01742260530591011, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 176.34375, + "completions/mean_terminated_length": 176.34375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.5092491507530212, + "epoch": 0.7663716814159292, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.230637386431485, + "kl": 0.04463362321257591, + "learning_rate": 7.76258219068991e-07, + "loss": 0.038, + "num_tokens": 10012513.0, + "reward": 0.5625, + "reward_std": 0.3265564441680908, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.5303585529327393, + "sampling/importance_sampling_ratio/mean": 0.9997532367706299, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.45119690895080566, + "sampling/sampling_logp_difference/mean": 0.016243431717157364, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 141.046875, + "completions/mean_terminated_length": 141.046875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.44375866651535034, + "epoch": 0.768141592920354, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.401467803600503, + "kl": 0.04483048617839813, + "learning_rate": 7.749695265523075e-07, + "loss": -0.0296, + "num_tokens": 10032740.0, + "reward": 0.25, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.4549647569656372, + "sampling/importance_sampling_ratio/mean": 0.9999812841415405, + "sampling/importance_sampling_ratio/min": 0.6262632608413696, + "sampling/sampling_logp_difference/max": 0.4679844379425049, + "sampling/sampling_logp_difference/mean": 0.015169515274465084, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 117.875, + "completions/mean_terminated_length": 117.875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.28713029623031616, + "epoch": 0.7699115044247787, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041039635985315244, + "kl": 0.030406465753912926, + "learning_rate": 7.736782101671586e-07, + "loss": 0.0004, + "num_tokens": 10050092.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4981794357299805, + "sampling/importance_sampling_ratio/mean": 1.0000543594360352, + "sampling/importance_sampling_ratio/min": 0.6956551671028137, + "sampling/sampling_logp_difference/max": 0.4042506217956543, + "sampling/sampling_logp_difference/mean": 0.013050006702542305, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 161.71875, + "completions/mean_terminated_length": 161.71875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.45859166979789734, + "epoch": 0.7716814159292036, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.896432444156014, + "kl": 0.05509470775723457, + "learning_rate": 7.723842822357979e-07, + "loss": -0.0129, + "num_tokens": 10072474.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.2872775793075562, + "sampling/importance_sampling_ratio/mean": 0.9995081424713135, + "sampling/importance_sampling_ratio/min": 0.7153425216674805, + "sampling/sampling_logp_difference/max": 0.334993839263916, + "sampling/sampling_logp_difference/mean": 0.01598282903432846, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 152.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 93.078125, + "completions/mean_terminated_length": 93.078125, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.3131830394268036, + "epoch": 0.7734513274336283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03250132240398681, + "kl": 0.023867376148700714, + "learning_rate": 7.710877551054003e-07, + "loss": 0.0002, + "num_tokens": 10089263.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.301871657371521, + "sampling/importance_sampling_ratio/mean": 1.0001564025878906, + "sampling/importance_sampling_ratio/min": 0.6190708875656128, + "sampling/sampling_logp_difference/max": 0.4795355796813965, + "sampling/sampling_logp_difference/mean": 0.014258707873523235, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 117.578125, + "completions/mean_terminated_length": 117.578125, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.37190645933151245, + "epoch": 0.7752212389380531, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04040951774703129, + "kl": 0.04006647691130638, + "learning_rate": 7.697886411479421e-07, + "loss": 0.0005, + "num_tokens": 10107716.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3976877927780151, + "sampling/importance_sampling_ratio/mean": 0.9995604753494263, + "sampling/importance_sampling_ratio/min": 0.6151885390281677, + "sampling/sampling_logp_difference/max": 0.4858264923095703, + "sampling/sampling_logp_difference/mean": 0.01616573892533779, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 105.859375, + "completions/mean_terminated_length": 105.859375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.3533839285373688, + "epoch": 0.7769911504424779, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2171059916419273, + "kl": 0.031037643551826477, + "learning_rate": 7.684869527600856e-07, + "loss": 0.0118, + "num_tokens": 10124987.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.2914077043533325, + "sampling/importance_sampling_ratio/mean": 1.0000272989273071, + "sampling/importance_sampling_ratio/min": 0.7802943587303162, + "sampling/sampling_logp_difference/max": 0.25573277473449707, + "sampling/sampling_logp_difference/mean": 0.014694828540086746, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 337.0, + "completions/max_terminated_length": 337.0, + "completions/mean_length": 123.15625, + "completions/mean_terminated_length": 123.15625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.35793831944465637, + "epoch": 0.7787610619469026, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1659824024798076, + "kl": 0.03780563920736313, + "learning_rate": 7.671827023630579e-07, + "loss": 0.0051, + "num_tokens": 10141941.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.2982903718948364, + "sampling/importance_sampling_ratio/mean": 0.9997375011444092, + "sampling/importance_sampling_ratio/min": 0.7020929455757141, + "sampling/sampling_logp_difference/max": 0.35368943214416504, + "sampling/sampling_logp_difference/mean": 0.015126901678740978, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 164.71875, + "completions/mean_terminated_length": 164.71875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.4047059714794159, + "epoch": 0.7805309734513274, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8609898222707638, + "kl": 0.03574547544121742, + "learning_rate": 7.658759024025347e-07, + "loss": 0.0028, + "num_tokens": 10163923.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.6026140451431274, + "sampling/importance_sampling_ratio/mean": 0.9995793104171753, + "sampling/importance_sampling_ratio/min": 0.6657065153121948, + "sampling/sampling_logp_difference/max": 0.4716360569000244, + "sampling/sampling_logp_difference/mean": 0.014743344858288765, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 151.5625, + "completions/mean_terminated_length": 151.5625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.33742740750312805, + "epoch": 0.7823008849557522, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026046778829816937, + "kl": 0.019752195104956627, + "learning_rate": 7.645665653485205e-07, + "loss": 0.0002, + "num_tokens": 10183463.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4193668365478516, + "sampling/importance_sampling_ratio/mean": 0.9995388388633728, + "sampling/importance_sampling_ratio/min": 0.7518067359924316, + "sampling/sampling_logp_difference/max": 0.35021090507507324, + "sampling/sampling_logp_difference/mean": 0.01452592946588993, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 307.0, + "completions/max_terminated_length": 307.0, + "completions/mean_length": 135.75, + "completions/mean_terminated_length": 135.75, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.42516061663627625, + "epoch": 0.784070796460177, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1784864186775827, + "kl": 0.03952057659626007, + "learning_rate": 7.632547036952295e-07, + "loss": 0.0116, + "num_tokens": 10202647.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.3767832517623901, + "sampling/importance_sampling_ratio/mean": 1.0001153945922852, + "sampling/importance_sampling_ratio/min": 0.6407551169395447, + "sampling/sampling_logp_difference/max": 0.44510793685913086, + "sampling/sampling_logp_difference/mean": 0.016151659190654755, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 126.921875, + "completions/mean_terminated_length": 126.921875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.3769015669822693, + "epoch": 0.7858407079646018, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2305521421237382, + "kl": 0.0339299738407135, + "learning_rate": 7.619403299609667e-07, + "loss": 0.0263, + "num_tokens": 10221298.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.2958794832229614, + "sampling/importance_sampling_ratio/mean": 0.9998077154159546, + "sampling/importance_sampling_ratio/min": 0.628684401512146, + "sampling/sampling_logp_difference/max": 0.4641258716583252, + "sampling/sampling_logp_difference/mean": 0.015832651406526566, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 152.078125, + "completions/mean_terminated_length": 152.078125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.5478400588035583, + "epoch": 0.7876106194690266, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4222336373618596, + "kl": 0.06323939561843872, + "learning_rate": 7.606234566880088e-07, + "loss": 0.0088, + "num_tokens": 10241543.0, + "reward": 0.5625, + "reward_std": 0.49553054571151733, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.2920238971710205, + "sampling/importance_sampling_ratio/mean": 0.9995840191841125, + "sampling/importance_sampling_ratio/min": 0.7800106406211853, + "sampling/sampling_logp_difference/max": 0.2562098503112793, + "sampling/sampling_logp_difference/mean": 0.019120126962661743, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 112.328125, + "completions/mean_terminated_length": 112.328125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.3097269535064697, + "epoch": 0.7893805309734513, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04644537650187056, + "kl": 0.031152140349149704, + "learning_rate": 7.593040964424835e-07, + "loss": 0.0003, + "num_tokens": 10258044.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3562997579574585, + "sampling/importance_sampling_ratio/mean": 0.99954754114151, + "sampling/importance_sampling_ratio/min": 0.6547458171844482, + "sampling/sampling_logp_difference/max": 0.4235081672668457, + "sampling/sampling_logp_difference/mean": 0.014366969466209412, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 222.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 155.09375, + "completions/mean_terminated_length": 155.09375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.416738897562027, + "epoch": 0.7911504424778761, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0408398377802692, + "kl": 0.06408806890249252, + "learning_rate": 7.579822618142503e-07, + "loss": 0.0155, + "num_tokens": 10278786.0, + "reward": -0.15625, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": -0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.4058080911636353, + "sampling/importance_sampling_ratio/mean": 1.0004191398620605, + "sampling/importance_sampling_ratio/min": 0.7061727643013, + "sampling/sampling_logp_difference/max": 0.34789538383483887, + "sampling/sampling_logp_difference/mean": 0.015817837789654732, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 147.859375, + "completions/mean_terminated_length": 147.859375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.4140564203262329, + "epoch": 0.7929203539823009, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0493445376584571, + "kl": 0.04653691127896309, + "learning_rate": 7.56657965416781e-07, + "loss": -0.0014, + "num_tokens": 10299145.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5432850122451782, + "sampling/importance_sampling_ratio/mean": 1.0006117820739746, + "sampling/importance_sampling_ratio/min": 0.6773489117622375, + "sampling/sampling_logp_difference/max": 0.4339132308959961, + "sampling/sampling_logp_difference/mean": 0.015949150547385216, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 122.0625, + "completions/mean_terminated_length": 122.0625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.3602309823036194, + "epoch": 0.7946902654867256, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3145517594184841, + "kl": 0.04791180044412613, + "learning_rate": 7.553312198870372e-07, + "loss": -0.0096, + "num_tokens": 10319565.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.3871322870254517, + "sampling/importance_sampling_ratio/mean": 0.999633252620697, + "sampling/importance_sampling_ratio/min": 0.7378374338150024, + "sampling/sampling_logp_difference/max": 0.3272385597229004, + "sampling/sampling_logp_difference/mean": 0.015328210778534412, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 141.4375, + "completions/mean_terminated_length": 141.4375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.346060186624527, + "epoch": 0.7964601769911505, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05853215178325867, + "kl": 0.035528093576431274, + "learning_rate": 7.540020378853522e-07, + "loss": 0.0004, + "num_tokens": 10340185.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.326745867729187, + "sampling/importance_sampling_ratio/mean": 1.000696063041687, + "sampling/importance_sampling_ratio/min": 0.668019711971283, + "sampling/sampling_logp_difference/max": 0.40343761444091797, + "sampling/sampling_logp_difference/mean": 0.014905279502272606, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 146.625, + "completions/mean_terminated_length": 146.625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.4376601576805115, + "epoch": 0.7982300884955752, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6127170475491959, + "kl": 0.061810243874788284, + "learning_rate": 7.52670432095309e-07, + "loss": -0.021, + "num_tokens": 10360865.0, + "reward": 0.59375, + "reward_std": 0.497555673122406, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.3121674060821533, + "sampling/importance_sampling_ratio/mean": 1.0003948211669922, + "sampling/importance_sampling_ratio/min": 0.6485395431518555, + "sampling/sampling_logp_difference/max": 0.4330322742462158, + "sampling/sampling_logp_difference/mean": 0.016728384420275688, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 142.09375, + "completions/mean_terminated_length": 142.09375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.4573453962802887, + "epoch": 0.8, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.135755391878933, + "kl": 0.06898960471153259, + "learning_rate": 7.513364152236185e-07, + "loss": 0.0024, + "num_tokens": 10382919.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.532122015953064, + "sampling/importance_sampling_ratio/mean": 1.0002293586730957, + "sampling/importance_sampling_ratio/min": 0.6331928968429565, + "sampling/sampling_logp_difference/max": 0.45698022842407227, + "sampling/sampling_logp_difference/mean": 0.017683709040284157, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 199.0, + "completions/max_terminated_length": 199.0, + "completions/mean_length": 125.828125, + "completions/mean_terminated_length": 125.828125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.36240512132644653, + "epoch": 0.8017699115044248, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.09174085224713212, + "kl": 0.06328853964805603, + "learning_rate": 7.5e-07, + "loss": 0.0007, + "num_tokens": 10401644.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3621101379394531, + "sampling/importance_sampling_ratio/mean": 0.9999123811721802, + "sampling/importance_sampling_ratio/min": 0.6479644775390625, + "sampling/sampling_logp_difference/max": 0.43391942977905273, + "sampling/sampling_logp_difference/mean": 0.014787204563617706, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 150.046875, + "completions/mean_terminated_length": 150.046875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.3293919563293457, + "epoch": 0.8035398230088495, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04347173613789339, + "kl": 0.03755633533000946, + "learning_rate": 7.486611991770585e-07, + "loss": 0.0004, + "num_tokens": 10422335.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3207083940505981, + "sampling/importance_sampling_ratio/mean": 0.9996446371078491, + "sampling/importance_sampling_ratio/min": 0.6304682493209839, + "sampling/sampling_logp_difference/max": 0.4612925052642822, + "sampling/sampling_logp_difference/mean": 0.013409500941634178, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 124.96875, + "completions/mean_terminated_length": 124.96875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.30635595321655273, + "epoch": 0.8053097345132744, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.047622189612138245, + "kl": 0.033256709575653076, + "learning_rate": 7.473200255301634e-07, + "loss": 0.0004, + "num_tokens": 10441773.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5281130075454712, + "sampling/importance_sampling_ratio/mean": 1.0002222061157227, + "sampling/importance_sampling_ratio/min": 0.6043351292610168, + "sampling/sampling_logp_difference/max": 0.5036263465881348, + "sampling/sampling_logp_difference/mean": 0.015033609233796597, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 173.5625, + "completions/mean_terminated_length": 173.5625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.43391820788383484, + "epoch": 0.8070796460176991, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1702017725660032, + "kl": 0.06150873750448227, + "learning_rate": 7.459764918573264e-07, + "loss": -0.0113, + "num_tokens": 10464657.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.4234800338745117, + "sampling/importance_sampling_ratio/mean": 0.9993876218795776, + "sampling/importance_sampling_ratio/min": 0.6902082562446594, + "sampling/sampling_logp_difference/max": 0.3707618713378906, + "sampling/sampling_logp_difference/mean": 0.016423087567090988, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 121.109375, + "completions/mean_terminated_length": 121.109375, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.270940899848938, + "epoch": 0.8088495575221238, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0566857769828913, + "kl": 0.040656525641679764, + "learning_rate": 7.446306109790797e-07, + "loss": 0.0004, + "num_tokens": 10482408.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3243651390075684, + "sampling/importance_sampling_ratio/mean": 0.999417781829834, + "sampling/importance_sampling_ratio/min": 0.6547061800956726, + "sampling/sampling_logp_difference/max": 0.4235687255859375, + "sampling/sampling_logp_difference/mean": 0.012482231482863426, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1014.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 122.40625, + "completions/mean_terminated_length": 122.40625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.27112001180648804, + "epoch": 0.8106194690265487, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04500104098821302, + "kl": 0.025868939235806465, + "learning_rate": 7.432823957383531e-07, + "loss": 0.0003, + "num_tokens": 10500258.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3218246698379517, + "sampling/importance_sampling_ratio/mean": 0.9997153282165527, + "sampling/importance_sampling_ratio/min": 0.695496678352356, + "sampling/sampling_logp_difference/max": 0.36312901973724365, + "sampling/sampling_logp_difference/mean": 0.011474039405584335, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 121.546875, + "completions/mean_terminated_length": 121.546875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.42170801758766174, + "epoch": 0.8123893805309734, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4227323614675174, + "kl": 0.05972127616405487, + "learning_rate": 7.419318590003523e-07, + "loss": -0.0236, + "num_tokens": 10520549.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.2994980812072754, + "sampling/importance_sampling_ratio/mean": 1.0001108646392822, + "sampling/importance_sampling_ratio/min": 0.6151829361915588, + "sampling/sampling_logp_difference/max": 0.48583555221557617, + "sampling/sampling_logp_difference/mean": 0.017824754118919373, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 149.5, + "completions/mean_terminated_length": 149.5, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "entropy": 0.3732178807258606, + "epoch": 0.8141592920353983, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05982100635587919, + "kl": 0.04914889484643936, + "learning_rate": 7.405790136524352e-07, + "loss": 0.0005, + "num_tokens": 10539333.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.672418475151062, + "sampling/importance_sampling_ratio/mean": 0.9997685551643372, + "sampling/importance_sampling_ratio/min": 0.6069730520248413, + "sampling/sampling_logp_difference/max": 0.5142707824707031, + "sampling/sampling_logp_difference/mean": 0.014909964986145496, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 140.828125, + "completions/mean_terminated_length": 140.828125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.34480398893356323, + "epoch": 0.815929203539823, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1062441954754016, + "kl": 0.04970972612500191, + "learning_rate": 7.392238726039897e-07, + "loss": 0.0043, + "num_tokens": 10557898.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.2949442863464355, + "sampling/importance_sampling_ratio/mean": 0.9999631643295288, + "sampling/importance_sampling_ratio/min": 0.47875940799713135, + "sampling/sampling_logp_difference/max": 0.736557126045227, + "sampling/sampling_logp_difference/mean": 0.014596865512430668, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 187.9375, + "completions/mean_terminated_length": 187.9375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.49509763717651367, + "epoch": 0.8176991150442477, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.3417823296242846, + "kl": 0.06408271193504333, + "learning_rate": 7.378664487863102e-07, + "loss": 0.0008, + "num_tokens": 10579830.0, + "reward": 0.28125, + "reward_std": 0.5061737298965454, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.6127060651779175, + "sampling/importance_sampling_ratio/mean": 0.9998746514320374, + "sampling/importance_sampling_ratio/min": 0.7186934947967529, + "sampling/sampling_logp_difference/max": 0.47791361808776855, + "sampling/sampling_logp_difference/mean": 0.016279179602861404, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 124.421875, + "completions/mean_terminated_length": 124.421875, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.38590937852859497, + "epoch": 0.8194690265486726, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04903077285315654, + "kl": 0.05098006874322891, + "learning_rate": 7.365067551524739e-07, + "loss": 0.0007, + "num_tokens": 10599633.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3535634279251099, + "sampling/importance_sampling_ratio/mean": 0.9995144605636597, + "sampling/importance_sampling_ratio/min": 0.6368655562400818, + "sampling/sampling_logp_difference/max": 0.45119667053222656, + "sampling/sampling_logp_difference/mean": 0.01614277996122837, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 121.0, + "completions/max_terminated_length": 121.0, + "completions/mean_length": 86.09375, + "completions/mean_terminated_length": 86.09375, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "entropy": 0.3199424147605896, + "epoch": 0.8212389380530973, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0330539341227297, + "kl": 0.027219675481319427, + "learning_rate": 7.351448046772177e-07, + "loss": 0.0003, + "num_tokens": 10614487.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3655897378921509, + "sampling/importance_sampling_ratio/mean": 0.9999443292617798, + "sampling/importance_sampling_ratio/min": 0.6294812560081482, + "sampling/sampling_logp_difference/max": 0.4628591537475586, + "sampling/sampling_logp_difference/mean": 0.01586727797985077, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 156.625, + "completions/mean_terminated_length": 156.625, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "entropy": 0.4999556243419647, + "epoch": 0.8230088495575221, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4178807890164686, + "kl": 0.05209490656852722, + "learning_rate": 7.33780610356814e-07, + "loss": 0.0135, + "num_tokens": 10637375.0, + "reward": 0.78125, + "reward_std": 0.4101392924785614, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.5122649669647217, + "sampling/importance_sampling_ratio/mean": 1.0001907348632812, + "sampling/importance_sampling_ratio/min": 0.7380893230438232, + "sampling/sampling_logp_difference/max": 0.4136085510253906, + "sampling/sampling_logp_difference/mean": 0.016630396246910095, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 147.515625, + "completions/mean_terminated_length": 147.515625, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.4395650029182434, + "epoch": 0.8247787610619469, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1663733853942424, + "kl": 0.04514811187982559, + "learning_rate": 7.324141852089471e-07, + "loss": -0.065, + "num_tokens": 10658352.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.3001909255981445, + "sampling/importance_sampling_ratio/mean": 0.9995812177658081, + "sampling/importance_sampling_ratio/min": 0.6797474026679993, + "sampling/sampling_logp_difference/max": 0.3860340118408203, + "sampling/sampling_logp_difference/mean": 0.016010360792279243, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 149.375, + "completions/mean_terminated_length": 149.375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.5551590919494629, + "epoch": 0.8265486725663717, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0624761608590831, + "kl": 0.061054326593875885, + "learning_rate": 7.310455422725889e-07, + "loss": 0.0303, + "num_tokens": 10680408.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.2980353832244873, + "sampling/importance_sampling_ratio/mean": 1.0006985664367676, + "sampling/importance_sampling_ratio/min": 0.7282189726829529, + "sampling/sampling_logp_difference/max": 0.3171534538269043, + "sampling/sampling_logp_difference/mean": 0.017895396798849106, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 258.0, + "completions/max_terminated_length": 258.0, + "completions/mean_length": 114.75, + "completions/mean_terminated_length": 114.75, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.4307693839073181, + "epoch": 0.8283185840707965, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3482466982523051, + "kl": 0.051996998488903046, + "learning_rate": 7.296746946078736e-07, + "loss": 0.0043, + "num_tokens": 10698520.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.3564072847366333, + "sampling/importance_sampling_ratio/mean": 0.9999678730964661, + "sampling/importance_sampling_ratio/min": 0.7147090435028076, + "sampling/sampling_logp_difference/max": 0.3358798027038574, + "sampling/sampling_logp_difference/mean": 0.015811704099178314, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 177.46875, + "completions/mean_terminated_length": 177.46875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.40639728307724, + "epoch": 0.8300884955752212, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9309105446327044, + "kl": 0.03369994834065437, + "learning_rate": 7.283016552959744e-07, + "loss": 0.0127, + "num_tokens": 10719158.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.2958474159240723, + "sampling/importance_sampling_ratio/mean": 0.9998207092285156, + "sampling/importance_sampling_ratio/min": 0.7702411413192749, + "sampling/sampling_logp_difference/max": 0.26105165481567383, + "sampling/sampling_logp_difference/mean": 0.013439834117889404, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 153.640625, + "completions/mean_terminated_length": 153.640625, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.4660520553588867, + "epoch": 0.831858407079646, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2011060342315654, + "kl": 0.04497406259179115, + "learning_rate": 7.26926437438978e-07, + "loss": -0.0239, + "num_tokens": 10739599.0, + "reward": 0.8125, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.8125, + "rewards/decision_reward_func/std": 0.5875696539878845, + "sampling/importance_sampling_ratio/max": 1.4329568147659302, + "sampling/importance_sampling_ratio/mean": 1.0004122257232666, + "sampling/importance_sampling_ratio/min": 0.7717304229736328, + "sampling/sampling_logp_difference/max": 0.3597400188446045, + "sampling/sampling_logp_difference/mean": 0.015442630276083946, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 302.0, + "completions/max_terminated_length": 302.0, + "completions/mean_length": 137.578125, + "completions/mean_terminated_length": 137.578125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.42849624156951904, + "epoch": 0.8336283185840708, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.453756598093294, + "kl": 0.04561780393123627, + "learning_rate": 7.255490541597594e-07, + "loss": 0.0486, + "num_tokens": 10765268.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.6207348108291626, + "sampling/importance_sampling_ratio/mean": 1.000959873199463, + "sampling/importance_sampling_ratio/min": 0.6237696409225464, + "sampling/sampling_logp_difference/max": 0.482879638671875, + "sampling/sampling_logp_difference/mean": 0.016448060050606728, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 126.0, + "completions/mean_terminated_length": 126.0, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.30105122923851013, + "epoch": 0.8353982300884956, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03465108632451289, + "kl": 0.02864006534218788, + "learning_rate": 7.241695186018573e-07, + "loss": 0.0003, + "num_tokens": 10781780.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5226410627365112, + "sampling/importance_sampling_ratio/mean": 1.0001323223114014, + "sampling/importance_sampling_ratio/min": 0.672307014465332, + "sampling/sampling_logp_difference/max": 0.42044639587402344, + "sampling/sampling_logp_difference/mean": 0.013548655435442924, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 204.28125, + "completions/mean_terminated_length": 204.28125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.3732837438583374, + "epoch": 0.8371681415929203, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020149862214229524, + "kl": 0.03531515970826149, + "learning_rate": 7.227878439293476e-07, + "loss": 0.0003, + "num_tokens": 10804550.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3706494569778442, + "sampling/importance_sampling_ratio/mean": 1.0007022619247437, + "sampling/importance_sampling_ratio/min": 0.621472179889679, + "sampling/sampling_logp_difference/max": 0.4756641387939453, + "sampling/sampling_logp_difference/mean": 0.012865693308413029, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 155.875, + "completions/mean_terminated_length": 155.875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.585537314414978, + "epoch": 0.8389380530973451, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9360960877970692, + "kl": 0.06327377259731293, + "learning_rate": 7.214040433267198e-07, + "loss": 0.0017, + "num_tokens": 10827886.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.4348258972167969, + "sampling/importance_sampling_ratio/mean": 0.9995330572128296, + "sampling/importance_sampling_ratio/min": 0.6743716597557068, + "sampling/sampling_logp_difference/max": 0.3939739465713501, + "sampling/sampling_logp_difference/mean": 0.01880214363336563, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 150.359375, + "completions/mean_terminated_length": 150.359375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.5375866889953613, + "epoch": 0.8407079646017699, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.147876242707666, + "kl": 0.053672075271606445, + "learning_rate": 7.200181299987482e-07, + "loss": 0.0424, + "num_tokens": 10849205.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.456792950630188, + "sampling/importance_sampling_ratio/mean": 0.9994627833366394, + "sampling/importance_sampling_ratio/min": 0.6453744769096375, + "sampling/sampling_logp_difference/max": 0.4379246234893799, + "sampling/sampling_logp_difference/mean": 0.018585899844765663, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 128.609375, + "completions/mean_terminated_length": 128.609375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.3609449863433838, + "epoch": 0.8424778761061947, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028518032051111687, + "kl": 0.03358837962150574, + "learning_rate": 7.186301171703688e-07, + "loss": 0.0004, + "num_tokens": 10867388.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3706374168395996, + "sampling/importance_sampling_ratio/mean": 1.0000519752502441, + "sampling/importance_sampling_ratio/min": 0.6793352365493774, + "sampling/sampling_logp_difference/max": 0.3866405487060547, + "sampling/sampling_logp_difference/mean": 0.014430605806410313, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 144.96875, + "completions/mean_terminated_length": 144.96875, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.5752789378166199, + "epoch": 0.8442477876106195, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.062236280844594, + "kl": 0.06892964243888855, + "learning_rate": 7.172400180865513e-07, + "loss": -0.0185, + "num_tokens": 10888986.0, + "reward": 0.84375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.84375, + "rewards/decision_reward_func/std": 0.5409794449806213, + "sampling/importance_sampling_ratio/max": 1.2689205408096313, + "sampling/importance_sampling_ratio/mean": 0.9994760751724243, + "sampling/importance_sampling_ratio/min": 0.7767874002456665, + "sampling/sampling_logp_difference/max": 0.2525886297225952, + "sampling/sampling_logp_difference/mean": 0.01805744506418705, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 173.859375, + "completions/mean_terminated_length": 173.859375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.5288122892379761, + "epoch": 0.8460176991150442, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3337386411903083, + "kl": 0.05211670696735382, + "learning_rate": 7.158478460121734e-07, + "loss": 0.0176, + "num_tokens": 10911441.0, + "reward": 0.75, + "reward_std": 0.42078250646591187, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.4720792770385742, + "sampling/importance_sampling_ratio/mean": 0.9999220371246338, + "sampling/importance_sampling_ratio/min": 0.6960464119911194, + "sampling/sampling_logp_difference/max": 0.3866758346557617, + "sampling/sampling_logp_difference/mean": 0.016892949119210243, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.0, + "completions/max_terminated_length": 447.0, + "completions/mean_length": 252.328125, + "completions/mean_terminated_length": 252.328125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.6607073545455933, + "epoch": 0.8477876106194691, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.3976858016934577, + "kl": 0.07387363910675049, + "learning_rate": 7.144536142318944e-07, + "loss": 0.0108, + "num_tokens": 10939126.0, + "reward": 0.28125, + "reward_std": 0.6601393222808838, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.2892944812774658, + "sampling/importance_sampling_ratio/mean": 0.9996263980865479, + "sampling/importance_sampling_ratio/min": 0.6066848635673523, + "sampling/sampling_logp_difference/max": 0.49974584579467773, + "sampling/sampling_logp_difference/mean": 0.018056483939290047, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 144.609375, + "completions/mean_terminated_length": 144.609375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.5071415901184082, + "epoch": 0.8495575221238938, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1595070620281003, + "kl": 0.06269523501396179, + "learning_rate": 7.130573360500276e-07, + "loss": 0.0044, + "num_tokens": 10960237.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.4925907850265503, + "sampling/importance_sampling_ratio/mean": 1.000237226486206, + "sampling/importance_sampling_ratio/min": 0.694139301776886, + "sampling/sampling_logp_difference/max": 0.4005134105682373, + "sampling/sampling_logp_difference/mean": 0.01662774570286274, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 274.0, + "completions/max_terminated_length": 274.0, + "completions/mean_length": 151.71875, + "completions/mean_terminated_length": 151.71875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 0.49337881803512573, + "epoch": 0.8513274336283185, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.036517167279457426, + "kl": 0.05040563642978668, + "learning_rate": 7.116590247904143e-07, + "loss": 0.0005, + "num_tokens": 10979451.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3399325609207153, + "sampling/importance_sampling_ratio/mean": 0.9999549984931946, + "sampling/importance_sampling_ratio/min": 0.7470673322677612, + "sampling/sampling_logp_difference/max": 0.2926192283630371, + "sampling/sampling_logp_difference/mean": 0.016386952251195908, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 160.0, + "completions/mean_terminated_length": 160.0, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.385990709066391, + "epoch": 0.8530973451327434, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02866313871220871, + "kl": 0.043752796947956085, + "learning_rate": 7.10258693796296e-07, + "loss": 0.0005, + "num_tokens": 10999867.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4061437845230103, + "sampling/importance_sampling_ratio/mean": 1.0009467601776123, + "sampling/importance_sampling_ratio/min": 0.7561548352241516, + "sampling/sampling_logp_difference/max": 0.3408510684967041, + "sampling/sampling_logp_difference/mean": 0.013079589232802391, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 636.0, + "completions/max_terminated_length": 636.0, + "completions/mean_length": 223.21875, + "completions/mean_terminated_length": 223.21875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.4007635712623596, + "epoch": 0.8548672566371681, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7446374945017039, + "kl": 0.03801294416189194, + "learning_rate": 7.088563564301873e-07, + "loss": -0.0126, + "num_tokens": 11024857.0, + "reward": 0.5625, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.5625, + "rewards/decision_reward_func/std": 0.8333333730697632, + "sampling/importance_sampling_ratio/max": 1.2873942852020264, + "sampling/importance_sampling_ratio/mean": 0.999816358089447, + "sampling/importance_sampling_ratio/min": 0.7708359956741333, + "sampling/sampling_logp_difference/max": 0.26027965545654297, + "sampling/sampling_logp_difference/mean": 0.013511259108781815, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 253.0, + "completions/max_terminated_length": 253.0, + "completions/mean_length": 141.953125, + "completions/mean_terminated_length": 141.953125, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "entropy": 0.5220211148262024, + "epoch": 0.856637168141593, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5450247512624973, + "kl": 0.07348264753818512, + "learning_rate": 7.074520260737487e-07, + "loss": -0.0014, + "num_tokens": 11045814.0, + "reward": 0.375, + "reward_std": 0.36435678601264954, + "rewards/decision_reward_func/mean": 0.375, + "rewards/decision_reward_func/std": 0.934353232383728, + "sampling/importance_sampling_ratio/max": 1.5953445434570312, + "sampling/importance_sampling_ratio/mean": 1.0001152753829956, + "sampling/importance_sampling_ratio/min": 0.6447339057922363, + "sampling/sampling_logp_difference/max": 0.4670896530151367, + "sampling/sampling_logp_difference/mean": 0.01728680171072483, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 162.0, + "completions/max_terminated_length": 162.0, + "completions/mean_length": 96.9375, + "completions/mean_terminated_length": 96.9375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.32135164737701416, + "epoch": 0.8584070796460177, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04095930272407262, + "kl": 0.036933060735464096, + "learning_rate": 7.06045716127658e-07, + "loss": 0.0004, + "num_tokens": 11062098.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.60867440700531, + "sampling/importance_sampling_ratio/mean": 1.0000686645507812, + "sampling/importance_sampling_ratio/min": 0.6944010853767395, + "sampling/sampling_logp_difference/max": 0.47541046142578125, + "sampling/sampling_logp_difference/mean": 0.01514124684035778, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 151.421875, + "completions/mean_terminated_length": 151.421875, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.42495667934417725, + "epoch": 0.8601769911504424, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03181384667927138, + "kl": 0.04488690197467804, + "learning_rate": 7.04637440011484e-07, + "loss": 0.0005, + "num_tokens": 11082749.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3687118291854858, + "sampling/importance_sampling_ratio/mean": 1.0004634857177734, + "sampling/importance_sampling_ratio/min": 0.6973159909248352, + "sampling/sampling_logp_difference/max": 0.3605165481567383, + "sampling/sampling_logp_difference/mean": 0.015262722969055176, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 916.0, + "completions/max_terminated_length": 916.0, + "completions/mean_length": 200.703125, + "completions/mean_terminated_length": 200.703125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.4573553204536438, + "epoch": 0.8619469026548673, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02711366077742337, + "kl": 0.04593441262841225, + "learning_rate": 7.032272111635565e-07, + "loss": 0.0004, + "num_tokens": 11105466.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.2976807355880737, + "sampling/importance_sampling_ratio/mean": 1.000078797340393, + "sampling/importance_sampling_ratio/min": 0.6960489749908447, + "sampling/sampling_logp_difference/max": 0.362335205078125, + "sampling/sampling_logp_difference/mean": 0.016789868474006653, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 161.4375, + "completions/mean_terminated_length": 161.4375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.5254327058792114, + "epoch": 0.863716814159292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0332785383253741, + "kl": 0.05357012152671814, + "learning_rate": 7.018150430408394e-07, + "loss": 0.0006, + "num_tokens": 11130358.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.4161489009857178, + "sampling/importance_sampling_ratio/mean": 0.9994027018547058, + "sampling/importance_sampling_ratio/min": 0.7706308960914612, + "sampling/sampling_logp_difference/max": 0.34794116020202637, + "sampling/sampling_logp_difference/mean": 0.01691107265651226, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 186.75, + "completions/mean_terminated_length": 186.75, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.5288914442062378, + "epoch": 0.8654867256637168, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9572866540516841, + "kl": 0.05500596761703491, + "learning_rate": 7.004009491188022e-07, + "loss": -0.0156, + "num_tokens": 11154518.0, + "reward": 0.28125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.28125, + "rewards/decision_reward_func/std": 0.9672207236289978, + "sampling/importance_sampling_ratio/max": 1.4253398180007935, + "sampling/importance_sampling_ratio/mean": 1.0004236698150635, + "sampling/importance_sampling_ratio/min": 0.7022836804389954, + "sampling/sampling_logp_difference/max": 0.35441017150878906, + "sampling/sampling_logp_difference/mean": 0.016183752566576004, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 123.421875, + "completions/mean_terminated_length": 123.421875, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "entropy": 0.3059542775154114, + "epoch": 0.8672566371681416, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02822826327338971, + "kl": 0.031212633475661278, + "learning_rate": 6.989849428912907e-07, + "loss": 0.0003, + "num_tokens": 11171825.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2959455251693726, + "sampling/importance_sampling_ratio/mean": 0.9996449947357178, + "sampling/importance_sampling_ratio/min": 0.6889712810516357, + "sampling/sampling_logp_difference/max": 0.3725557327270508, + "sampling/sampling_logp_difference/mean": 0.013596318662166595, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 131.09375, + "completions/mean_terminated_length": 131.09375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.3289196193218231, + "epoch": 0.8690265486725663, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.025211836053442026, + "kl": 0.03459160774946213, + "learning_rate": 6.975670378703992e-07, + "loss": 0.0004, + "num_tokens": 11190055.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.405799150466919, + "sampling/importance_sampling_ratio/mean": 0.9996116161346436, + "sampling/importance_sampling_ratio/min": 0.6268575191497803, + "sampling/sampling_logp_difference/max": 0.46703600883483887, + "sampling/sampling_logp_difference/mean": 0.013733677566051483, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 595.0, + "completions/max_terminated_length": 595.0, + "completions/mean_length": 211.171875, + "completions/mean_terminated_length": 211.171875, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.5561668872833252, + "epoch": 0.8707964601769912, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1722902782288536, + "kl": 0.06487689912319183, + "learning_rate": 6.961472475863405e-07, + "loss": 0.0084, + "num_tokens": 11217634.0, + "reward": 0.09375, + "reward_std": 0.34860679507255554, + "rewards/decision_reward_func/mean": 0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.549800992012024, + "sampling/importance_sampling_ratio/mean": 0.9999545812606812, + "sampling/importance_sampling_ratio/min": 0.7396615743637085, + "sampling/sampling_logp_difference/max": 0.4381265640258789, + "sampling/sampling_logp_difference/mean": 0.016957048326730728, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1571.0, + "completions/max_terminated_length": 1571.0, + "completions/mean_length": 338.65625, + "completions/mean_terminated_length": 338.65625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.4920404851436615, + "epoch": 0.8725663716814159, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.023947735032229, + "kl": 0.03984961286187172, + "learning_rate": 6.947255855873176e-07, + "loss": 0.0137, + "num_tokens": 11248828.0, + "reward": 0.46875, + "reward_std": 0.29578250646591187, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.2872081995010376, + "sampling/importance_sampling_ratio/mean": 1.0005526542663574, + "sampling/importance_sampling_ratio/min": 0.7777249813079834, + "sampling/sampling_logp_difference/max": 0.2524757385253906, + "sampling/sampling_logp_difference/mean": 0.014068529941141605, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 203.03125, + "completions/mean_terminated_length": 203.03125, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.54365473985672, + "epoch": 0.8743362831858407, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8781848845775025, + "kl": 0.04530525952577591, + "learning_rate": 6.93302065439394e-07, + "loss": 0.0259, + "num_tokens": 11275774.0, + "reward": 0.25, + "reward_std": 0.25819888710975647, + "rewards/decision_reward_func/mean": 0.25, + "rewards/decision_reward_func/std": 0.9759001135826111, + "sampling/importance_sampling_ratio/max": 1.4039392471313477, + "sampling/importance_sampling_ratio/mean": 1.000096321105957, + "sampling/importance_sampling_ratio/min": 0.6772672533988953, + "sampling/sampling_logp_difference/max": 0.3896893262863159, + "sampling/sampling_logp_difference/mean": 0.017321724444627762, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 182.0, + "completions/max_terminated_length": 182.0, + "completions/mean_length": 107.1875, + "completions/mean_terminated_length": 107.1875, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.28042668104171753, + "epoch": 0.8761061946902655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026140178793081347, + "kl": 0.025253865867853165, + "learning_rate": 6.918767007263645e-07, + "loss": 0.0003, + "num_tokens": 11291530.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.564685344696045, + "sampling/importance_sampling_ratio/mean": 1.0004124641418457, + "sampling/importance_sampling_ratio/min": 0.7787970900535583, + "sampling/sampling_logp_difference/max": 0.44768476486206055, + "sampling/sampling_logp_difference/mean": 0.012597857043147087, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 184.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 109.5625, + "completions/mean_terminated_length": 109.5625, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "entropy": 0.3203164339065552, + "epoch": 0.8778761061946903, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.029914991404061227, + "kl": 0.029022786766290665, + "learning_rate": 6.904495050496258e-07, + "loss": 0.0003, + "num_tokens": 11309710.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4687899351119995, + "sampling/importance_sampling_ratio/mean": 1.0006183385849, + "sampling/importance_sampling_ratio/min": 0.7097292542457581, + "sampling/sampling_logp_difference/max": 0.3844388723373413, + "sampling/sampling_logp_difference/mean": 0.014193766750395298, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 168.09375, + "completions/mean_terminated_length": 168.09375, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "entropy": 0.48099666833877563, + "epoch": 0.879646017699115, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0155494990382203, + "kl": 0.046392716467380524, + "learning_rate": 6.890204920280457e-07, + "loss": 0.0237, + "num_tokens": 11332868.0, + "reward": 0.125, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.125, + "rewards/decision_reward_func/std": 1.0, + "sampling/importance_sampling_ratio/max": 1.3051731586456299, + "sampling/importance_sampling_ratio/mean": 0.9998812675476074, + "sampling/importance_sampling_ratio/min": 0.6971226334571838, + "sampling/sampling_logp_difference/max": 0.36079394817352295, + "sampling/sampling_logp_difference/mean": 0.015075111761689186, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 154.6875, + "completions/mean_terminated_length": 154.6875, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.4950066804885864, + "epoch": 0.8814159292035398, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0197985747632474, + "kl": 0.06584899127483368, + "learning_rate": 6.875896752978344e-07, + "loss": -0.0003, + "num_tokens": 11355344.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.282453179359436, + "sampling/importance_sampling_ratio/mean": 0.999314546585083, + "sampling/importance_sampling_ratio/min": 0.7079713940620422, + "sampling/sampling_logp_difference/max": 0.34535157680511475, + "sampling/sampling_logp_difference/mean": 0.016382791101932526, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 172.84375, + "completions/mean_terminated_length": 172.84375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.4558072090148926, + "epoch": 0.8831858407079646, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.897514938846476, + "kl": 0.06502187997102737, + "learning_rate": 6.861570685124134e-07, + "loss": 0.0245, + "num_tokens": 11376726.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.3139907121658325, + "sampling/importance_sampling_ratio/mean": 0.9999817609786987, + "sampling/importance_sampling_ratio/min": 0.631839394569397, + "sampling/sampling_logp_difference/max": 0.4591200351715088, + "sampling/sampling_logp_difference/mean": 0.015304536558687687, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 180.578125, + "completions/mean_terminated_length": 180.578125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.5417348146438599, + "epoch": 0.8849557522123894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027568689738492257, + "kl": 0.05047761648893356, + "learning_rate": 6.847226853422861e-07, + "loss": 0.0005, + "num_tokens": 11403035.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3918542861938477, + "sampling/importance_sampling_ratio/mean": 1.0008066892623901, + "sampling/importance_sampling_ratio/min": 0.6774313449859619, + "sampling/sampling_logp_difference/max": 0.38944709300994873, + "sampling/sampling_logp_difference/mean": 0.01726376637816429, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 114.390625, + "completions/mean_terminated_length": 114.390625, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "entropy": 0.4357292354106903, + "epoch": 0.8867256637168142, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3679384762315754, + "kl": 0.056346792727708817, + "learning_rate": 6.832865394749065e-07, + "loss": 0.0006, + "num_tokens": 11422356.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.2793991565704346, + "sampling/importance_sampling_ratio/mean": 0.9996085166931152, + "sampling/importance_sampling_ratio/min": 0.6171379685401917, + "sampling/sampling_logp_difference/max": 0.4826626777648926, + "sampling/sampling_logp_difference/mean": 0.016202447935938835, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 343.0, + "completions/max_terminated_length": 343.0, + "completions/mean_length": 128.390625, + "completions/mean_terminated_length": 128.390625, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.4350653886795044, + "epoch": 0.8884955752212389, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030346917844838946, + "kl": 0.0385807603597641, + "learning_rate": 6.818486446145486e-07, + "loss": 0.0004, + "num_tokens": 11441917.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.406125783920288, + "sampling/importance_sampling_ratio/mean": 0.9999099969863892, + "sampling/importance_sampling_ratio/min": 0.6496437191963196, + "sampling/sampling_logp_difference/max": 0.43133115768432617, + "sampling/sampling_logp_difference/mean": 0.017174214124679565, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 116.765625, + "completions/mean_terminated_length": 116.765625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.45700663328170776, + "epoch": 0.8902654867256637, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030081798969186637, + "kl": 0.04402727261185646, + "learning_rate": 6.804090144821772e-07, + "loss": 0.0005, + "num_tokens": 11459886.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4959650039672852, + "sampling/importance_sampling_ratio/mean": 1.0003783702850342, + "sampling/importance_sampling_ratio/min": 0.73115074634552, + "sampling/sampling_logp_difference/max": 0.4027714729309082, + "sampling/sampling_logp_difference/mean": 0.017588015645742416, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 143.5, + "completions/mean_terminated_length": 143.5, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "entropy": 0.43855568766593933, + "epoch": 0.8920353982300885, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03683240239684041, + "kl": 0.04783044010400772, + "learning_rate": 6.789676628153143e-07, + "loss": 0.0005, + "num_tokens": 11480830.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.3063552379608154, + "sampling/importance_sampling_ratio/mean": 1.0003074407577515, + "sampling/importance_sampling_ratio/min": 0.7497009038925171, + "sampling/sampling_logp_difference/max": 0.28808093070983887, + "sampling/sampling_logp_difference/mean": 0.014854389242827892, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 145.234375, + "completions/mean_terminated_length": 145.234375, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.38934558629989624, + "epoch": 0.8938053097345132, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024098191218664168, + "kl": 0.031024843454360962, + "learning_rate": 6.775246033679104e-07, + "loss": 0.0003, + "num_tokens": 11500669.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5753343105316162, + "sampling/importance_sampling_ratio/mean": 1.0002654790878296, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.4544675350189209, + "sampling/sampling_logp_difference/mean": 0.014444435946643353, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 155.1875, + "completions/mean_terminated_length": 155.1875, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.5077278017997742, + "epoch": 0.8955752212389381, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.033627166083113544, + "kl": 0.06282509118318558, + "learning_rate": 6.76079849910212e-07, + "loss": 0.0007, + "num_tokens": 11520953.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.0, + "rewards/decision_reward_func/std": 1.0079052448272705, + "sampling/importance_sampling_ratio/max": 1.469017744064331, + "sampling/importance_sampling_ratio/mean": 0.9998058080673218, + "sampling/importance_sampling_ratio/min": 0.6368654370307922, + "sampling/sampling_logp_difference/max": 0.45119690895080566, + "sampling/sampling_logp_difference/mean": 0.016574004665017128, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 153.0, + "completions/max_terminated_length": 153.0, + "completions/mean_length": 97.96875, + "completions/mean_terminated_length": 97.96875, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.24034933745861053, + "epoch": 0.8973451327433628, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030981095291774924, + "kl": 0.020224817097187042, + "learning_rate": 6.746334162286307e-07, + "loss": 0.0002, + "num_tokens": 11536951.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6272845268249512, + "sampling/importance_sampling_ratio/mean": 1.00011146068573, + "sampling/importance_sampling_ratio/min": 0.43750959634780884, + "sampling/sampling_logp_difference/max": 0.826656699180603, + "sampling/sampling_logp_difference/mean": 0.013065225444734097, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 130.84375, + "completions/mean_terminated_length": 130.84375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.3243350386619568, + "epoch": 0.8991150442477877, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022043362891743003, + "kl": 0.029396357014775276, + "learning_rate": 6.731853161256113e-07, + "loss": 0.0003, + "num_tokens": 11554589.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5598468780517578, + "sampling/importance_sampling_ratio/mean": 1.000610589981079, + "sampling/importance_sampling_ratio/min": 0.6803026795387268, + "sampling/sampling_logp_difference/max": 0.44458770751953125, + "sampling/sampling_logp_difference/mean": 0.013580295257270336, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 135.4375, + "completions/mean_terminated_length": 135.4375, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.4605197608470917, + "epoch": 0.9008849557522124, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1726230337445993, + "kl": 0.052446477115154266, + "learning_rate": 6.717355634195004e-07, + "loss": -0.0156, + "num_tokens": 11573881.0, + "reward": 0.34375, + "reward_std": 0.23935678601264954, + "rewards/decision_reward_func/mean": 0.34375, + "rewards/decision_reward_func/std": 0.9464847445487976, + "sampling/importance_sampling_ratio/max": 1.595988154411316, + "sampling/importance_sampling_ratio/mean": 0.9996259212493896, + "sampling/importance_sampling_ratio/min": 0.7329869270324707, + "sampling/sampling_logp_difference/max": 0.46749305725097656, + "sampling/sampling_logp_difference/mean": 0.016616176813840866, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 171.1875, + "completions/mean_terminated_length": 171.1875, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.5736621022224426, + "epoch": 0.9026548672566371, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.929166370597461, + "kl": 0.08164776861667633, + "learning_rate": 6.70284171944414e-07, + "loss": -0.0152, + "num_tokens": 11595221.0, + "reward": 0.40625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.40625, + "rewards/decision_reward_func/std": 0.9209855198860168, + "sampling/importance_sampling_ratio/max": 1.3658151626586914, + "sampling/importance_sampling_ratio/mean": 0.9998589754104614, + "sampling/importance_sampling_ratio/min": 0.6254509687423706, + "sampling/sampling_logp_difference/max": 0.4692823886871338, + "sampling/sampling_logp_difference/mean": 0.01716122031211853, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 120.90625, + "completions/mean_terminated_length": 120.90625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "entropy": 0.4975433349609375, + "epoch": 0.904424778761062, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4552774074894985, + "kl": 0.06420423090457916, + "learning_rate": 6.688311555501063e-07, + "loss": 0.0125, + "num_tokens": 11614815.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.4139158725738525, + "sampling/importance_sampling_ratio/mean": 0.9999160766601562, + "sampling/importance_sampling_ratio/min": 0.6870979070663452, + "sampling/sampling_logp_difference/max": 0.3752784729003906, + "sampling/sampling_logp_difference/mean": 0.018069611862301826, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 158.21875, + "completions/mean_terminated_length": 158.21875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.48136845231056213, + "epoch": 0.9061946902654867, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0306633495867539, + "kl": 0.04570081830024719, + "learning_rate": 6.673765281018372e-07, + "loss": 0.0012, + "num_tokens": 11634749.0, + "reward": -0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.2743909358978271, + "sampling/importance_sampling_ratio/mean": 0.9994518756866455, + "sampling/importance_sampling_ratio/min": 0.7109566926956177, + "sampling/sampling_logp_difference/max": 0.3411438465118408, + "sampling/sampling_logp_difference/mean": 0.017170319333672523, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 159.078125, + "completions/mean_terminated_length": 159.078125, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "entropy": 0.40064916014671326, + "epoch": 0.9079646017699115, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1565868940811759, + "kl": 0.03733426332473755, + "learning_rate": 6.659203034802396e-07, + "loss": 0.0168, + "num_tokens": 11656114.0, + "reward": 0.875, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.875, + "rewards/decision_reward_func/std": 0.48795005679130554, + "sampling/importance_sampling_ratio/max": 1.2868289947509766, + "sampling/importance_sampling_ratio/mean": 1.0002918243408203, + "sampling/importance_sampling_ratio/min": 0.7122735977172852, + "sampling/sampling_logp_difference/max": 0.3392932415008545, + "sampling/sampling_logp_difference/mean": 0.014757532626390457, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 123.796875, + "completions/mean_terminated_length": 123.796875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.5170900821685791, + "epoch": 0.9097345132743363, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2877298459068507, + "kl": 0.07492925971746445, + "learning_rate": 6.644624955811873e-07, + "loss": 0.0121, + "num_tokens": 11682117.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.297922134399414, + "sampling/importance_sampling_ratio/mean": 0.9996405839920044, + "sampling/importance_sampling_ratio/min": 0.6141785383224487, + "sampling/sampling_logp_difference/max": 0.4874696731567383, + "sampling/sampling_logp_difference/mean": 0.01811598241329193, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 142.3125, + "completions/mean_terminated_length": 142.3125, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.3735528290271759, + "epoch": 0.911504424778761, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02537209831694192, + "kl": 0.025548133999109268, + "learning_rate": 6.630031183156627e-07, + "loss": 0.0002, + "num_tokens": 11700953.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5218216180801392, + "sampling/importance_sampling_ratio/mean": 0.9998520612716675, + "sampling/importance_sampling_ratio/min": 0.6755814552307129, + "sampling/sampling_logp_difference/max": 0.4199080467224121, + "sampling/sampling_logp_difference/mean": 0.015386695973575115, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 145.875, + "completions/mean_terminated_length": 145.875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.4176895022392273, + "epoch": 0.9132743362831859, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.028163093425068782, + "kl": 0.03728806972503662, + "learning_rate": 6.61542185609623e-07, + "loss": 0.0004, + "num_tokens": 11722417.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6107532978057861, + "sampling/importance_sampling_ratio/mean": 0.9997249841690063, + "sampling/importance_sampling_ratio/min": 0.7600250244140625, + "sampling/sampling_logp_difference/max": 0.4767019748687744, + "sampling/sampling_logp_difference/mean": 0.015322001650929451, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 166.34375, + "completions/mean_terminated_length": 166.34375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.3786749839782715, + "epoch": 0.9150442477876106, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03326183617417285, + "kl": 0.03697134554386139, + "learning_rate": 6.60079711403869e-07, + "loss": 0.0004, + "num_tokens": 11744871.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3370469808578491, + "sampling/importance_sampling_ratio/mean": 1.000791072845459, + "sampling/importance_sampling_ratio/min": 0.7023168206214905, + "sampling/sampling_logp_difference/max": 0.35337066650390625, + "sampling/sampling_logp_difference/mean": 0.013923844322562218, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 175.65625, + "completions/mean_terminated_length": 175.65625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.26630160212516785, + "epoch": 0.9168141592920354, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.022789951115606263, + "kl": 0.02661610022187233, + "learning_rate": 6.586157096539104e-07, + "loss": 0.0003, + "num_tokens": 11765681.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.468764305114746, + "sampling/importance_sampling_ratio/mean": 0.9996398091316223, + "sampling/importance_sampling_ratio/min": 0.6202806830406189, + "sampling/sampling_logp_difference/max": 0.4775831699371338, + "sampling/sampling_logp_difference/mean": 0.01199787575751543, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 133.609375, + "completions/mean_terminated_length": 133.609375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "entropy": 0.3493505120277405, + "epoch": 0.9185840707964602, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03952156425404006, + "kl": 0.0363231897354126, + "learning_rate": 6.571501943298335e-07, + "loss": 0.0004, + "num_tokens": 11783624.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.530820608139038, + "sampling/importance_sampling_ratio/mean": 1.000678539276123, + "sampling/importance_sampling_ratio/min": 0.6168789267539978, + "sampling/sampling_logp_difference/max": 0.48308253288269043, + "sampling/sampling_logp_difference/mean": 0.015622604638338089, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 171.109375, + "completions/mean_terminated_length": 171.109375, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.3881581425666809, + "epoch": 0.9203539823008849, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.020049575859725548, + "kl": 0.03495844081044197, + "learning_rate": 6.556831794161677e-07, + "loss": 0.0003, + "num_tokens": 11804463.0, + "reward": -0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": -0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5633995532989502, + "sampling/importance_sampling_ratio/mean": 1.000249981880188, + "sampling/importance_sampling_ratio/min": 0.679577648639679, + "sampling/sampling_logp_difference/max": 0.44686269760131836, + "sampling/sampling_logp_difference/mean": 0.01593586802482605, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 639.0, + "completions/max_terminated_length": 639.0, + "completions/mean_length": 203.609375, + "completions/mean_terminated_length": 203.609375, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.3289709985256195, + "epoch": 0.9221238938053097, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.030300394275281445, + "kl": 0.03109907917678356, + "learning_rate": 6.542146789117523e-07, + "loss": 0.0003, + "num_tokens": 11826758.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5853697061538696, + "sampling/importance_sampling_ratio/mean": 0.9997753500938416, + "sampling/importance_sampling_ratio/min": 0.7064435482025146, + "sampling/sampling_logp_difference/max": 0.4608175754547119, + "sampling/sampling_logp_difference/mean": 0.013363179750740528, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 132.546875, + "completions/mean_terminated_length": 132.546875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.3163865804672241, + "epoch": 0.9238938053097345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.027720750192685717, + "kl": 0.03124506026506424, + "learning_rate": 6.527447068296025e-07, + "loss": 0.0003, + "num_tokens": 11844713.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.4982117414474487, + "sampling/importance_sampling_ratio/mean": 0.9996817111968994, + "sampling/importance_sampling_ratio/min": 0.7301310896873474, + "sampling/sampling_logp_difference/max": 0.404272198677063, + "sampling/sampling_logp_difference/mean": 0.01464778184890747, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 144.140625, + "completions/mean_terminated_length": 144.140625, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "entropy": 0.4128933846950531, + "epoch": 0.9256637168141593, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1419393301748153, + "kl": 0.042740676552057266, + "learning_rate": 6.512732771967758e-07, + "loss": -0.0221, + "num_tokens": 11865010.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.326668381690979, + "sampling/importance_sampling_ratio/mean": 1.0002739429473877, + "sampling/importance_sampling_ratio/min": 0.6690137982368469, + "sampling/sampling_logp_difference/max": 0.4019505977630615, + "sampling/sampling_logp_difference/mean": 0.016076423227787018, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 227.734375, + "completions/mean_terminated_length": 227.734375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.4210180640220642, + "epoch": 0.9274336283185841, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02338603654633819, + "kl": 0.03688063472509384, + "learning_rate": 6.498004040542384e-07, + "loss": 0.0004, + "num_tokens": 11895681.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5071555376052856, + "sampling/importance_sampling_ratio/mean": 0.9997272491455078, + "sampling/importance_sampling_ratio/min": 0.6994218230247498, + "sampling/sampling_logp_difference/max": 0.4102240800857544, + "sampling/sampling_logp_difference/mean": 0.01420167088508606, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 153.625, + "completions/mean_terminated_length": 153.625, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.3878311514854431, + "epoch": 0.9292035398230089, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02325109143409543, + "kl": 0.031965360045433044, + "learning_rate": 6.483261014567311e-07, + "loss": 0.0003, + "num_tokens": 11916889.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2996877431869507, + "sampling/importance_sampling_ratio/mean": 0.9991119503974915, + "sampling/importance_sampling_ratio/min": 0.6263453960418701, + "sampling/sampling_logp_difference/max": 0.467853307723999, + "sampling/sampling_logp_difference/mean": 0.015429418534040451, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 318.0, + "completions/max_terminated_length": 318.0, + "completions/mean_length": 167.90625, + "completions/mean_terminated_length": 167.90625, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.47103917598724365, + "epoch": 0.9309734513274336, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.026374450972294128, + "kl": 0.03843798488378525, + "learning_rate": 6.468503834726349e-07, + "loss": 0.0004, + "num_tokens": 11939987.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.2908649444580078, + "sampling/importance_sampling_ratio/mean": 1.0000659227371216, + "sampling/importance_sampling_ratio/min": 0.6879383325576782, + "sampling/sampling_logp_difference/max": 0.3740561008453369, + "sampling/sampling_logp_difference/mean": 0.017067167907953262, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 173.671875, + "completions/mean_terminated_length": 173.671875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.44619613885879517, + "epoch": 0.9327433628318584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03301712563254451, + "kl": 0.053856320679187775, + "learning_rate": 6.453732641838371e-07, + "loss": 0.0006, + "num_tokens": 11962830.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.463612675666809, + "sampling/importance_sampling_ratio/mean": 0.9998467564582825, + "sampling/importance_sampling_ratio/min": 0.6220102906227112, + "sampling/sampling_logp_difference/max": 0.47479867935180664, + "sampling/sampling_logp_difference/mean": 0.016407469287514687, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 887.0, + "completions/max_terminated_length": 887.0, + "completions/mean_length": 241.078125, + "completions/mean_terminated_length": 241.078125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.5302072763442993, + "epoch": 0.9345132743362832, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0162961773695491, + "kl": 0.05674497038125992, + "learning_rate": 6.438947576855966e-07, + "loss": 0.065, + "num_tokens": 11995027.0, + "reward": 0.3125, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.3125, + "rewards/decision_reward_func/std": 0.9574271440505981, + "sampling/importance_sampling_ratio/max": 1.4823534488677979, + "sampling/importance_sampling_ratio/mean": 1.0003397464752197, + "sampling/importance_sampling_ratio/min": 0.6627068519592285, + "sampling/sampling_logp_difference/max": 0.4114224910736084, + "sampling/sampling_logp_difference/mean": 0.016194399446249008, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 216.375, + "completions/mean_terminated_length": 216.375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.5207070708274841, + "epoch": 0.9362831858407079, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8818870799114665, + "kl": 0.05992306023836136, + "learning_rate": 6.424148780864103e-07, + "loss": -0.0149, + "num_tokens": 12021675.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.5495630502700806, + "sampling/importance_sampling_ratio/mean": 1.0000014305114746, + "sampling/importance_sampling_ratio/min": 0.7507371306419373, + "sampling/sampling_logp_difference/max": 0.4379730224609375, + "sampling/sampling_logp_difference/mean": 0.016500603407621384, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 143.53125, + "completions/mean_terminated_length": 143.53125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.45067259669303894, + "epoch": 0.9380530973451328, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11701618369725716, + "kl": 0.059737443923950195, + "learning_rate": 6.409336395078771e-07, + "loss": 0.0007, + "num_tokens": 12042589.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3233445882797241, + "sampling/importance_sampling_ratio/mean": 0.9995430707931519, + "sampling/importance_sampling_ratio/min": 0.6298382878303528, + "sampling/sampling_logp_difference/max": 0.4622921943664551, + "sampling/sampling_logp_difference/mean": 0.017670437693595886, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 158.046875, + "completions/mean_terminated_length": 158.046875, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "entropy": 0.4408401846885681, + "epoch": 0.9398230088495575, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.181717859164751, + "kl": 0.058012235909700394, + "learning_rate": 6.394510560845636e-07, + "loss": 0.0071, + "num_tokens": 12069264.0, + "reward": 0.625, + "reward_std": 0.22360679507255554, + "rewards/decision_reward_func/mean": 0.625, + "rewards/decision_reward_func/std": 0.7867957949638367, + "sampling/importance_sampling_ratio/max": 1.8133851289749146, + "sampling/importance_sampling_ratio/mean": 1.000122308731079, + "sampling/importance_sampling_ratio/min": 0.6903498768806458, + "sampling/sampling_logp_difference/max": 0.5951954126358032, + "sampling/sampling_logp_difference/mean": 0.016324251890182495, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 186.71875, + "completions/mean_terminated_length": 186.71875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.4281838536262512, + "epoch": 0.9415929203539823, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.96229328750554, + "kl": 0.04793298989534378, + "learning_rate": 6.379671419638702e-07, + "loss": 0.0329, + "num_tokens": 12093038.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.2909739017486572, + "sampling/importance_sampling_ratio/mean": 1.000369906425476, + "sampling/importance_sampling_ratio/min": 0.6395174860954285, + "sampling/sampling_logp_difference/max": 0.447041392326355, + "sampling/sampling_logp_difference/mean": 0.01448612567037344, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 141.984375, + "completions/mean_terminated_length": 141.984375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.39643681049346924, + "epoch": 0.9433628318584071, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0649369196683396, + "kl": 0.04488767683506012, + "learning_rate": 6.364819113058951e-07, + "loss": -0.0014, + "num_tokens": 12122205.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.6062982082366943, + "sampling/importance_sampling_ratio/mean": 0.9994792938232422, + "sampling/importance_sampling_ratio/min": 0.6171591281890869, + "sampling/sampling_logp_difference/max": 0.48262834548950195, + "sampling/sampling_logp_difference/mean": 0.015572316013276577, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 160.390625, + "completions/mean_terminated_length": 160.390625, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.37770381569862366, + "epoch": 0.9451327433628318, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.086701832076824, + "kl": 0.03709157556295395, + "learning_rate": 6.349953782832991e-07, + "loss": 0.0184, + "num_tokens": 12143590.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.2991387844085693, + "sampling/importance_sampling_ratio/mean": 0.999364972114563, + "sampling/importance_sampling_ratio/min": 0.6300749778747559, + "sampling/sampling_logp_difference/max": 0.461916446685791, + "sampling/sampling_logp_difference/mean": 0.015483014285564423, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 194.390625, + "completions/mean_terminated_length": 194.390625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.4973522126674652, + "epoch": 0.9469026548672567, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.2320857968486798, + "kl": 0.08781930059194565, + "learning_rate": 6.335075570811708e-07, + "loss": 0.0507, + "num_tokens": 12168047.0, + "reward": 0.75, + "reward_std": 0.44091323018074036, + "rewards/decision_reward_func/mean": 0.75, + "rewards/decision_reward_func/std": 0.6666666865348816, + "sampling/importance_sampling_ratio/max": 1.3621286153793335, + "sampling/importance_sampling_ratio/mean": 0.9995167851448059, + "sampling/importance_sampling_ratio/min": 0.6857494115829468, + "sampling/sampling_logp_difference/max": 0.3772430419921875, + "sampling/sampling_logp_difference/mean": 0.015815015882253647, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 144.0, + "completions/max_terminated_length": 144.0, + "completions/mean_length": 102.984375, + "completions/mean_terminated_length": 102.984375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.2278555929660797, + "epoch": 0.9486725663716814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07993346210059651, + "kl": 0.033862821757793427, + "learning_rate": 6.320184618968914e-07, + "loss": 0.0003, + "num_tokens": 12184318.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6206591129302979, + "sampling/importance_sampling_ratio/mean": 1.0003883838653564, + "sampling/importance_sampling_ratio/min": 0.6372146010398865, + "sampling/sampling_logp_difference/max": 0.4828329086303711, + "sampling/sampling_logp_difference/mean": 0.012828746810555458, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 263.0, + "completions/max_terminated_length": 263.0, + "completions/mean_length": 157.625, + "completions/mean_terminated_length": 157.625, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.43120914697647095, + "epoch": 0.9504424778761061, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0956675860374605, + "kl": 0.06771903485059738, + "learning_rate": 6.305281069399988e-07, + "loss": -0.017, + "num_tokens": 12205334.0, + "reward": 0.1875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.1875, + "rewards/decision_reward_func/std": 0.9900296926498413, + "sampling/importance_sampling_ratio/max": 1.5264860391616821, + "sampling/importance_sampling_ratio/mean": 1.000443458557129, + "sampling/importance_sampling_ratio/min": 0.7132288813591003, + "sampling/sampling_logp_difference/max": 0.42296838760375977, + "sampling/sampling_logp_difference/mean": 0.01592453010380268, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 170.265625, + "completions/mean_terminated_length": 170.265625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "entropy": 0.39343422651290894, + "epoch": 0.952212389380531, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.05083102662365012, + "kl": 0.05068902671337128, + "learning_rate": 6.290365064320519e-07, + "loss": 0.0006, + "num_tokens": 12227047.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.4352738857269287, + "sampling/importance_sampling_ratio/mean": 1.0004734992980957, + "sampling/importance_sampling_ratio/min": 0.6985859274864197, + "sampling/sampling_logp_difference/max": 0.36135566234588623, + "sampling/sampling_logp_difference/mean": 0.014874329790472984, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 160.9375, + "completions/mean_terminated_length": 160.9375, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "entropy": 0.48297345638275146, + "epoch": 0.9539823008849557, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.179285871703919, + "kl": 0.06974999606609344, + "learning_rate": 6.275436746064956e-07, + "loss": -0.0058, + "num_tokens": 12248483.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.5349164009094238, + "sampling/importance_sampling_ratio/mean": 0.9997453689575195, + "sampling/importance_sampling_ratio/min": 0.662337064743042, + "sampling/sampling_logp_difference/max": 0.42847585678100586, + "sampling/sampling_logp_difference/mean": 0.01870621182024479, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 150.15625, + "completions/mean_terminated_length": 150.15625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.37128764390945435, + "epoch": 0.9557522123893806, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1728417499746477, + "kl": 0.061821479350328445, + "learning_rate": 6.260496257085239e-07, + "loss": 0.0028, + "num_tokens": 12268237.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.2984933853149414, + "sampling/importance_sampling_ratio/mean": 0.9998453259468079, + "sampling/importance_sampling_ratio/min": 0.6117988228797913, + "sampling/sampling_logp_difference/max": 0.491351842880249, + "sampling/sampling_logp_difference/mean": 0.015235353261232376, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 173.34375, + "completions/mean_terminated_length": 173.34375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.394723117351532, + "epoch": 0.9575221238938053, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9425096043335918, + "kl": 0.06985059380531311, + "learning_rate": 6.245543739949453e-07, + "loss": -0.0066, + "num_tokens": 12290307.0, + "reward": 0.6875, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.6875, + "rewards/decision_reward_func/std": 0.7319250702857971, + "sampling/importance_sampling_ratio/max": 1.50187349319458, + "sampling/importance_sampling_ratio/mean": 1.0000241994857788, + "sampling/importance_sampling_ratio/min": 0.6360552906990051, + "sampling/sampling_logp_difference/max": 0.4524698257446289, + "sampling/sampling_logp_difference/mean": 0.016086284071207047, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 154.28125, + "completions/mean_terminated_length": 154.28125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.3369419574737549, + "epoch": 0.95929203539823, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.037734265261044175, + "kl": 0.029881250113248825, + "learning_rate": 6.230579337340456e-07, + "loss": 0.0004, + "num_tokens": 12310821.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6232856512069702, + "sampling/importance_sampling_ratio/mean": 1.00018310546875, + "sampling/importance_sampling_ratio/min": 0.6511728763580322, + "sampling/sampling_logp_difference/max": 0.4844522476196289, + "sampling/sampling_logp_difference/mean": 0.015162945725023746, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 221.0, + "completions/max_terminated_length": 221.0, + "completions/mean_length": 125.578125, + "completions/mean_terminated_length": 125.578125, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.2706543505191803, + "epoch": 0.9610619469026549, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06230287677510357, + "kl": 0.030617382377386093, + "learning_rate": 6.215603192054521e-07, + "loss": 0.0003, + "num_tokens": 12328346.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 0.5, + "rewards/decision_reward_func/std": 0.8728715777397156, + "sampling/importance_sampling_ratio/max": 1.5816904306411743, + "sampling/importance_sampling_ratio/mean": 0.9994117021560669, + "sampling/importance_sampling_ratio/min": 0.43092525005340576, + "sampling/sampling_logp_difference/max": 0.8418207168579102, + "sampling/sampling_logp_difference/mean": 0.014314708299934864, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 116.78125, + "completions/mean_terminated_length": 116.78125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.21982413530349731, + "epoch": 0.9628318584070796, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03639281547593844, + "kl": 0.022563684731721878, + "learning_rate": 6.200615446999981e-07, + "loss": 0.0002, + "num_tokens": 12345772.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6089556217193604, + "sampling/importance_sampling_ratio/mean": 1.0011626482009888, + "sampling/importance_sampling_ratio/min": 0.644199788570404, + "sampling/sampling_logp_difference/max": 0.4755852222442627, + "sampling/sampling_logp_difference/mean": 0.01181740127503872, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 186.609375, + "completions/mean_terminated_length": 186.609375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.4190259277820587, + "epoch": 0.9646017699115044, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9419568150118021, + "kl": 0.07190361618995667, + "learning_rate": 6.185616245195848e-07, + "loss": 0.0066, + "num_tokens": 12370819.0, + "reward": 0.59375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.59375, + "rewards/decision_reward_func/std": 0.8110105991363525, + "sampling/importance_sampling_ratio/max": 1.3118776082992554, + "sampling/importance_sampling_ratio/mean": 1.00010347366333, + "sampling/importance_sampling_ratio/min": 0.6482194662094116, + "sampling/sampling_logp_difference/max": 0.4335259199142456, + "sampling/sampling_logp_difference/mean": 0.01625240594148636, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 217.6875, + "completions/mean_terminated_length": 217.6875, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.4670265316963196, + "epoch": 0.9663716814159292, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8005211649308785, + "kl": 0.050225675106048584, + "learning_rate": 6.170605729770469e-07, + "loss": 0.0043, + "num_tokens": 12398863.0, + "reward": -0.09375, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": -0.09375, + "rewards/decision_reward_func/std": 1.003466248512268, + "sampling/importance_sampling_ratio/max": 1.405803918838501, + "sampling/importance_sampling_ratio/mean": 0.9996092319488525, + "sampling/importance_sampling_ratio/min": 0.6845299601554871, + "sampling/sampling_logp_difference/max": 0.37902283668518066, + "sampling/sampling_logp_difference/mean": 0.01664130762219429, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 162.125, + "completions/mean_terminated_length": 162.125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "entropy": 0.32101893424987793, + "epoch": 0.968141592920354, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1268576328489324, + "kl": 0.04037083685398102, + "learning_rate": 6.155584043960143e-07, + "loss": -0.0284, + "num_tokens": 12418791.0, + "reward": 0.53125, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.53125, + "rewards/decision_reward_func/std": 0.8539125919342041, + "sampling/importance_sampling_ratio/max": 1.5744267702102661, + "sampling/importance_sampling_ratio/mean": 0.9996744394302368, + "sampling/importance_sampling_ratio/min": 0.6205015778541565, + "sampling/sampling_logp_difference/max": 0.47722721099853516, + "sampling/sampling_logp_difference/mean": 0.013379833661019802, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 309.0, + "completions/max_terminated_length": 309.0, + "completions/mean_length": 149.8125, + "completions/mean_terminated_length": 149.8125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.38839036226272583, + "epoch": 0.9699115044247788, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1794775467338567, + "kl": 0.05261142924427986, + "learning_rate": 6.140551331107766e-07, + "loss": -0.0047, + "num_tokens": 12439211.0, + "reward": 0.46875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.46875, + "rewards/decision_reward_func/std": 0.8903138637542725, + "sampling/importance_sampling_ratio/max": 1.2884020805358887, + "sampling/importance_sampling_ratio/mean": 1.000056505203247, + "sampling/importance_sampling_ratio/min": 0.7180588245391846, + "sampling/sampling_logp_difference/max": 0.331203818321228, + "sampling/sampling_logp_difference/mean": 0.015212539583444595, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.0, + "completions/max_terminated_length": 578.0, + "completions/mean_length": 180.890625, + "completions/mean_terminated_length": 180.890625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.34552568197250366, + "epoch": 0.9716814159292035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.02976989825328661, + "kl": 0.0316726416349411, + "learning_rate": 6.125507734661458e-07, + "loss": 0.0003, + "num_tokens": 12463476.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5811829566955566, + "sampling/importance_sampling_ratio/mean": 0.9996100664138794, + "sampling/importance_sampling_ratio/min": 0.6776725649833679, + "sampling/sampling_logp_difference/max": 0.4581732749938965, + "sampling/sampling_logp_difference/mean": 0.014589247293770313, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 118.328125, + "completions/mean_terminated_length": 118.328125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "entropy": 0.3022470474243164, + "epoch": 0.9734513274336283, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.041772460103968415, + "kl": 0.028836343437433243, + "learning_rate": 6.110453398173187e-07, + "loss": 0.0003, + "num_tokens": 12480249.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.3200703859329224, + "sampling/importance_sampling_ratio/mean": 0.9993069767951965, + "sampling/importance_sampling_ratio/min": 0.6562559604644775, + "sampling/sampling_logp_difference/max": 0.4212043285369873, + "sampling/sampling_logp_difference/mean": 0.01448759064078331, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 140.515625, + "completions/mean_terminated_length": 140.515625, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "entropy": 0.36306560039520264, + "epoch": 0.9752212389380531, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.6733941874735485, + "kl": 0.059806544333696365, + "learning_rate": 6.095388465297418e-07, + "loss": 0.0431, + "num_tokens": 12499754.0, + "reward": 0.21875, + "reward_std": 0.4629635810852051, + "rewards/decision_reward_func/mean": 0.21875, + "rewards/decision_reward_func/std": 0.983494758605957, + "sampling/importance_sampling_ratio/max": 1.4759948253631592, + "sampling/importance_sampling_ratio/mean": 1.0003119707107544, + "sampling/importance_sampling_ratio/min": 0.6957031488418579, + "sampling/sampling_logp_difference/max": 0.3893321752548218, + "sampling/sampling_logp_difference/mean": 0.014819087460637093, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 170.703125, + "completions/mean_terminated_length": 170.703125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "entropy": 0.5047581791877747, + "epoch": 0.9769911504424779, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.1656785513547003, + "kl": 0.0593656450510025, + "learning_rate": 6.080313079789723e-07, + "loss": -0.0437, + "num_tokens": 12523159.0, + "reward": 0.4375, + "reward_std": 0.25, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.444342017173767, + "sampling/importance_sampling_ratio/mean": 0.9997916221618652, + "sampling/importance_sampling_ratio/min": 0.6242204308509827, + "sampling/sampling_logp_difference/max": 0.4712517261505127, + "sampling/sampling_logp_difference/mean": 0.017720595002174377, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 159.359375, + "completions/mean_terminated_length": 159.359375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.39272868633270264, + "epoch": 0.9787610619469026, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0153691210471831, + "kl": 0.051775217056274414, + "learning_rate": 6.065227385505421e-07, + "loss": 0.0171, + "num_tokens": 12543166.0, + "reward": 0.71875, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.71875, + "rewards/decision_reward_func/std": 0.7007648944854736, + "sampling/importance_sampling_ratio/max": 1.546797752380371, + "sampling/importance_sampling_ratio/mean": 0.9998806715011597, + "sampling/importance_sampling_ratio/min": 0.7299336194992065, + "sampling/sampling_logp_difference/max": 0.4361867904663086, + "sampling/sampling_logp_difference/mean": 0.015707682818174362, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 151.640625, + "completions/mean_terminated_length": 151.640625, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.33292603492736816, + "epoch": 0.9805309734513274, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0281427467866011, + "kl": 0.025199301540851593, + "learning_rate": 6.050131526398201e-07, + "loss": 0.0002, + "num_tokens": 12562823.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5091066360473633, + "sampling/importance_sampling_ratio/mean": 1.000128984451294, + "sampling/importance_sampling_ratio/min": 0.6200224161148071, + "sampling/sampling_logp_difference/max": 0.4779996871948242, + "sampling/sampling_logp_difference/mean": 0.015259217470884323, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 160.125, + "completions/mean_terminated_length": 160.125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.48752403259277344, + "epoch": 0.9823008849557522, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.559147835291522, + "kl": 0.05512962117791176, + "learning_rate": 6.035025646518746e-07, + "loss": 0.0112, + "num_tokens": 12585983.0, + "reward": 0.15625, + "reward_std": 0.47978055477142334, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.3810150623321533, + "sampling/importance_sampling_ratio/mean": 0.9998184442520142, + "sampling/importance_sampling_ratio/min": 0.6988722681999207, + "sampling/sampling_logp_difference/max": 0.35828733444213867, + "sampling/sampling_logp_difference/mean": 0.017440207302570343, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 170.625, + "completions/mean_terminated_length": 170.625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.39747118949890137, + "epoch": 0.984070796460177, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.829652216596529, + "kl": 0.05210302025079727, + "learning_rate": 6.019909890013366e-07, + "loss": -0.0149, + "num_tokens": 12606711.0, + "reward": 0.96875, + "reward_std": 0.125, + "rewards/decision_reward_func/mean": 0.96875, + "rewards/decision_reward_func/std": 0.25, + "sampling/importance_sampling_ratio/max": 1.6040233373641968, + "sampling/importance_sampling_ratio/mean": 0.9997381567955017, + "sampling/importance_sampling_ratio/min": 0.6278918981552124, + "sampling/sampling_logp_difference/max": 0.4725151062011719, + "sampling/sampling_logp_difference/mean": 0.015872985124588013, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 195.75, + "completions/mean_terminated_length": 195.75, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.5959153175354004, + "epoch": 0.9858407079646018, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.5838772016409295, + "kl": 0.06155340000987053, + "learning_rate": 6.004784401122612e-07, + "loss": 0.0408, + "num_tokens": 12634791.0, + "reward": 0.90625, + "reward_std": 0.375, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.32803213596344, + "sampling/importance_sampling_ratio/mean": 0.9998672008514404, + "sampling/importance_sampling_ratio/min": 0.608726441860199, + "sampling/sampling_logp_difference/max": 0.4963862895965576, + "sampling/sampling_logp_difference/mean": 0.019085844978690147, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.0, + "completions/max_terminated_length": 471.0, + "completions/mean_length": 179.75, + "completions/mean_terminated_length": 179.75, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.3235132098197937, + "epoch": 0.9876106194690265, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9844865975021971, + "kl": 0.02561378851532936, + "learning_rate": 5.98964932417991e-07, + "loss": -0.0045, + "num_tokens": 12656567.0, + "reward": 0.78125, + "reward_std": 0.2561737596988678, + "rewards/decision_reward_func/mean": 0.78125, + "rewards/decision_reward_func/std": 0.6291528940200806, + "sampling/importance_sampling_ratio/max": 1.4618462324142456, + "sampling/importance_sampling_ratio/mean": 0.999975323677063, + "sampling/importance_sampling_ratio/min": 0.7354151606559753, + "sampling/sampling_logp_difference/max": 0.3797001838684082, + "sampling/sampling_logp_difference/mean": 0.013623690232634544, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 184.921875, + "completions/mean_terminated_length": 184.921875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.5242243409156799, + "epoch": 0.9893805309734514, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.3583336237942298, + "kl": 0.04835689067840576, + "learning_rate": 5.974504803610178e-07, + "loss": -0.0223, + "num_tokens": 12680434.0, + "reward": 0.4375, + "reward_std": 0.47360679507255554, + "rewards/decision_reward_func/mean": 0.4375, + "rewards/decision_reward_func/std": 0.9063270092010498, + "sampling/importance_sampling_ratio/max": 1.4175808429718018, + "sampling/importance_sampling_ratio/mean": 1.000478982925415, + "sampling/importance_sampling_ratio/min": 0.7157229781150818, + "sampling/sampling_logp_difference/max": 0.3489518165588379, + "sampling/sampling_logp_difference/mean": 0.017108961939811707, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 415.0, + "completions/max_terminated_length": 415.0, + "completions/mean_length": 198.859375, + "completions/mean_terminated_length": 198.859375, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "entropy": 0.46707969903945923, + "epoch": 0.9911504424778761, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8811747500237802, + "kl": 0.04385651648044586, + "learning_rate": 5.959350983928445e-07, + "loss": 0.0206, + "num_tokens": 12705833.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.2786628007888794, + "sampling/importance_sampling_ratio/mean": 0.9998224377632141, + "sampling/importance_sampling_ratio/min": 0.6214916110038757, + "sampling/sampling_logp_difference/max": 0.475632905960083, + "sampling/sampling_logp_difference/mean": 0.016529597342014313, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 157.15625, + "completions/mean_terminated_length": 157.15625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.48076122999191284, + "epoch": 0.9929203539823008, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.031460896412224985, + "kl": 0.052566226571798325, + "learning_rate": 5.944188009738483e-07, + "loss": 0.0005, + "num_tokens": 12728851.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6598697900772095, + "sampling/importance_sampling_ratio/mean": 1.0003383159637451, + "sampling/importance_sampling_ratio/min": 0.7128937244415283, + "sampling/sampling_logp_difference/max": 0.5067391395568848, + "sampling/sampling_logp_difference/mean": 0.016861015930771828, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 138.59375, + "completions/mean_terminated_length": 138.59375, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "entropy": 0.3797007203102112, + "epoch": 0.9946902654867257, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2475786187580753, + "kl": 0.037923719733953476, + "learning_rate": 5.929016025731413e-07, + "loss": 0.0133, + "num_tokens": 12747833.0, + "reward": 0.9375, + "reward_std": 0.17078250646591187, + "rewards/decision_reward_func/mean": 0.9375, + "rewards/decision_reward_func/std": 0.35073620080947876, + "sampling/importance_sampling_ratio/max": 1.56224524974823, + "sampling/importance_sampling_ratio/mean": 0.9994237422943115, + "sampling/importance_sampling_ratio/min": 0.657938539981842, + "sampling/sampling_logp_difference/max": 0.4461240768432617, + "sampling/sampling_logp_difference/mean": 0.014938208274543285, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 150.84375, + "completions/mean_terminated_length": 150.84375, + "completions/min_length": 65.0, + "completions/min_terminated_length": 65.0, + "entropy": 0.27015700936317444, + "epoch": 0.9964601769911504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.024702193806412247, + "kl": 0.02522423304617405, + "learning_rate": 5.913835176684334e-07, + "loss": 0.0002, + "num_tokens": 12767247.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.5145760774612427, + "sampling/importance_sampling_ratio/mean": 1.000138521194458, + "sampling/importance_sampling_ratio/min": 0.7146693468093872, + "sampling/sampling_logp_difference/max": 0.41513562202453613, + "sampling/sampling_logp_difference/mean": 0.012315331026911736, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 258.828125, + "completions/mean_terminated_length": 258.828125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.6288559436798096, + "epoch": 0.9982300884955753, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.2643571409135848, + "kl": 0.06781111657619476, + "learning_rate": 5.89864560745894e-07, + "loss": 0.0007, + "num_tokens": 12794596.0, + "reward": 0.15625, + "reward_std": 0.6601393222808838, + "rewards/decision_reward_func/mean": 0.15625, + "rewards/decision_reward_func/std": 0.9955257177352905, + "sampling/importance_sampling_ratio/max": 1.282141923904419, + "sampling/importance_sampling_ratio/mean": 1.0002317428588867, + "sampling/importance_sampling_ratio/min": 0.6257727146148682, + "sampling/sampling_logp_difference/max": 0.4687681198120117, + "sampling/sampling_logp_difference/mean": 0.018289338797330856, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 201.375, + "completions/mean_terminated_length": 201.375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.41560235619544983, + "epoch": 1.0, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8499870699291785, + "kl": 0.036372072994709015, + "learning_rate": 5.883447463000135e-07, + "loss": -0.0161, + "num_tokens": 12817276.0, + "reward": 0.90625, + "reward_std": 0.20155644416809082, + "rewards/decision_reward_func/mean": 0.90625, + "rewards/decision_reward_func/std": 0.42608407139778137, + "sampling/importance_sampling_ratio/max": 1.5545761585235596, + "sampling/importance_sampling_ratio/mean": 1.0002684593200684, + "sampling/importance_sampling_ratio/min": 0.6176016330718994, + "sampling/sampling_logp_difference/max": 0.48191165924072266, + "sampling/sampling_logp_difference/mean": 0.014586273580789566, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 112.84375, + "completions/mean_terminated_length": 112.84375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.2689701318740845, + "epoch": 1.0017699115044247, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03643559393027775, + "kl": 0.027217669412493706, + "learning_rate": 5.868240888334652e-07, + "loss": 0.0003, + "num_tokens": 12833986.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/decision_reward_func/mean": 1.0, + "rewards/decision_reward_func/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.513666033744812, + "sampling/importance_sampling_ratio/mean": 1.000664472579956, + "sampling/importance_sampling_ratio/min": 0.6641154885292053, + "sampling/sampling_logp_difference/max": 0.4145345687866211, + "sampling/sampling_logp_difference/mean": 0.012834830209612846, + "step": 566 + } + ], + "logging_steps": 1, + "max_steps": 1130, + "num_input_tokens_seen": 12833986, + "num_train_epochs": 2, + "save_steps": 283, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}