{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00936, "eval_steps": 500, "global_step": 468, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 1667.71875, "completions/mean_terminated_length": 1667.71875, "completions/min_length": 1335.0, "completions/min_terminated_length": 1335.0, "entropy": 0.47570936381816864, "epoch": 2e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.7817060947418213, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0175, "num_tokens": 73979.0, "reward": -8.643012046813965, "reward_std": 11.93884563446045, "rewards/rollout_reward_func/mean": -8.643012046813965, "rewards/rollout_reward_func/std": 13.176301956176758, "sampling/importance_sampling_ratio/max": 1.8267614841461182, "sampling/importance_sampling_ratio/mean": 1.0556937456130981, "sampling/importance_sampling_ratio/min": 0.6958155035972595, "sampling/sampling_logp_difference/max": 0.4538118839263916, "sampling/sampling_logp_difference/mean": 0.018563803285360336, "step": 1, "step_time": 36.57323472299959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.47570936381816864, "epoch": 4e-05, "grad_norm": 1.778490424156189, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": 0.0175, "step": 2, "step_time": 5.7524538550001125 }, { "clip_ratio/high_max": 0.007694128900766373, "clip_ratio/high_mean": 0.0038470644503831863, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038470644503831863, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 1539.0625, "completions/mean_terminated_length": 1539.0625, "completions/min_length": 1102.0, "completions/min_terminated_length": 1102.0, "entropy": 0.48639967665076256, "epoch": 6e-05, "frac_reward_zero_std": 0.0, "grad_norm": 2.348313331604004, "kl": 0.0009020493234856986, "learning_rate": 5.714285714285715e-07, "loss": -0.0061, "num_tokens": 143940.0, "reward": -12.985191345214844, "reward_std": 9.043848037719727, "rewards/rollout_reward_func/mean": -12.985191345214844, "rewards/rollout_reward_func/std": 13.16507339477539, "sampling/importance_sampling_ratio/max": 1.4940505027770996, "sampling/importance_sampling_ratio/mean": 0.9786970019340515, "sampling/importance_sampling_ratio/min": 0.578092634677887, "sampling/sampling_logp_difference/max": 0.47244715690612793, "sampling/sampling_logp_difference/mean": 0.020807698369026184, "step": 3, "step_time": 33.27629456400018 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.4873173348605633, "epoch": 8e-05, "grad_norm": 2.1872665882110596, "kl": 0.0008225523779401556, "learning_rate": 8.571428571428572e-07, "loss": -0.0047, "step": 4, "step_time": 5.767302959000062 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 1677.5, "completions/mean_terminated_length": 1677.5, "completions/min_length": 1286.0, "completions/min_terminated_length": 1286.0, "entropy": 0.5088205523788929, "epoch": 0.0001, "frac_reward_zero_std": 0.0, "grad_norm": 2.25364089012146, "kl": 0.0010678053367882967, "learning_rate": 1.142857142857143e-06, "loss": 0.0065, "num_tokens": 218324.0, "reward": -10.823358535766602, "reward_std": 13.736562728881836, "rewards/rollout_reward_func/mean": -10.823358535766602, "rewards/rollout_reward_func/std": 15.484944343566895, "sampling/importance_sampling_ratio/max": 1.4805132150650024, "sampling/importance_sampling_ratio/mean": 1.0641556978225708, "sampling/importance_sampling_ratio/min": 0.568811297416687, "sampling/sampling_logp_difference/max": 0.2526984214782715, "sampling/sampling_logp_difference/mean": 0.023618247359991074, "step": 5, "step_time": 36.10746106100078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5093838162720203, "epoch": 0.00012, "grad_norm": 2.288219928741455, "kl": 0.001079607754945755, "learning_rate": 1.4285714285714286e-06, "loss": 0.0054, "step": 6, "step_time": 5.795254830000886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003738839295692742, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003738839295692742, "completions/clipped_ratio": 0.0, "completions/max_length": 1815.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 1591.78125, "completions/mean_terminated_length": 1591.78125, "completions/min_length": 1105.0, "completions/min_terminated_length": 1105.0, "entropy": 0.46493203938007355, "epoch": 0.00014, "frac_reward_zero_std": 0.0, "grad_norm": 2.118450164794922, "kl": 0.0011459436791483313, "learning_rate": 1.7142857142857145e-06, "loss": 0.0396, "num_tokens": 289395.0, "reward": -10.49427318572998, "reward_std": 11.239557266235352, "rewards/rollout_reward_func/mean": -10.49427318572998, "rewards/rollout_reward_func/std": 15.952676773071289, "sampling/importance_sampling_ratio/max": 1.448197603225708, "sampling/importance_sampling_ratio/mean": 1.0151325464248657, "sampling/importance_sampling_ratio/min": 7.746867383695566e-12, "sampling/sampling_logp_difference/max": 24.934412002563477, "sampling/sampling_logp_difference/mean": 0.07075877487659454, "step": 7, "step_time": 33.41762578200087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.46468354761600494, "epoch": 0.00016, "grad_norm": 2.175487995147705, "kl": 0.0008941343548940495, "learning_rate": 2.0000000000000003e-06, "loss": 0.0395, "step": 8, "step_time": 5.792184412999632 }, { "clip_ratio/high_max": 0.017968750093132257, "clip_ratio/high_mean": 0.008984375046566129, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008984375046566129, "completions/clipped_ratio": 0.0, "completions/max_length": 1826.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 1506.34375, "completions/mean_terminated_length": 1506.34375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "entropy": 0.49622904509305954, "epoch": 0.00018, "frac_reward_zero_std": 0.0, "grad_norm": 1.7127985954284668, "kl": 0.0012855095374106895, "learning_rate": 2.285714285714286e-06, "loss": 0.0714, "num_tokens": 358179.0, "reward": -11.01856803894043, "reward_std": 10.841419219970703, "rewards/rollout_reward_func/mean": -11.01856803894043, "rewards/rollout_reward_func/std": 16.40532112121582, "sampling/importance_sampling_ratio/max": 1.4218463897705078, "sampling/importance_sampling_ratio/mean": 0.9850149750709534, "sampling/importance_sampling_ratio/min": 0.4642792344093323, "sampling/sampling_logp_difference/max": 0.4877281188964844, "sampling/sampling_logp_difference/mean": 0.023507963865995407, "step": 9, "step_time": 32.45201305700084 }, { "clip_ratio/high_max": 0.021875000093132257, "clip_ratio/high_mean": 0.010937500046566129, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012890625046566129, "entropy": 0.494538277387619, "epoch": 0.0002, "grad_norm": 1.6264572143554688, "kl": 0.0011894339404534549, "learning_rate": 2.571428571428571e-06, "loss": 0.0722, "step": 10, "step_time": 6.248037073999512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0018382353009656072, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018382353009656072, "completions/clipped_ratio": 0.0, "completions/max_length": 1761.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 1599.1875, "completions/mean_terminated_length": 1599.1875, "completions/min_length": 1161.0, "completions/min_terminated_length": 1161.0, "entropy": 0.4714800976216793, "epoch": 0.00022, "frac_reward_zero_std": 0.0, "grad_norm": 1.9238476753234863, "kl": 0.0010464704500918742, "learning_rate": 2.8571428571428573e-06, "loss": 0.0011, "num_tokens": 429182.0, "reward": -7.59177303314209, "reward_std": 6.999079704284668, "rewards/rollout_reward_func/mean": -7.59177303314209, "rewards/rollout_reward_func/std": 11.693867683410645, "sampling/importance_sampling_ratio/max": 1.6081898212432861, "sampling/importance_sampling_ratio/mean": 1.0232391357421875, "sampling/importance_sampling_ratio/min": 0.7353501915931702, "sampling/sampling_logp_difference/max": 0.3081374168395996, "sampling/sampling_logp_difference/mean": 0.018512647598981857, "step": 11, "step_time": 36.059418668000035 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.4721835069358349, "epoch": 0.00024, "grad_norm": 1.821286916732788, "kl": 0.0016941909561865032, "learning_rate": 3.142857142857143e-06, "loss": -0.0004, "step": 12, "step_time": 5.634688684001048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1825.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 1668.0625, "completions/mean_terminated_length": 1668.0625, "completions/min_length": 1357.0, "completions/min_terminated_length": 1357.0, "entropy": 0.44804077222943306, "epoch": 0.00026, "frac_reward_zero_std": 0.0, "grad_norm": 1.4859932661056519, "kl": 0.0011393425811547786, "learning_rate": 3.428571428571429e-06, "loss": -0.053, "num_tokens": 503173.0, "reward": -3.6963202953338623, "reward_std": 10.114439964294434, "rewards/rollout_reward_func/mean": -3.6963202953338623, "rewards/rollout_reward_func/std": 14.7977876663208, "sampling/importance_sampling_ratio/max": 1.287428855895996, "sampling/importance_sampling_ratio/mean": 0.9384276866912842, "sampling/importance_sampling_ratio/min": 1.816465272468093e-13, "sampling/sampling_logp_difference/max": 28.412260055541992, "sampling/sampling_logp_difference/mean": 0.07413282990455627, "step": 13, "step_time": 34.34775389900187 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.4483692906796932, "epoch": 0.00028, "grad_norm": 1.5191307067871094, "kl": 0.0010726663895184174, "learning_rate": 3.7142857142857146e-06, "loss": -0.0537, "step": 14, "step_time": 5.816520962000141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.03125, "completions/max_length": 1825.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 1648.125, "completions/mean_terminated_length": 1646.51611328125, "completions/min_length": 1169.0, "completions/min_terminated_length": 1169.0, "entropy": 0.46873993426561356, "epoch": 0.0003, "frac_reward_zero_std": 0.0, "grad_norm": 2.0255329608917236, "kl": 0.0014358056214405224, "learning_rate": 4.000000000000001e-06, "loss": -0.0246, "num_tokens": 576514.0, "reward": -10.058404922485352, "reward_std": 10.943680763244629, "rewards/rollout_reward_func/mean": -10.058404922485352, "rewards/rollout_reward_func/std": 13.969420433044434, "sampling/importance_sampling_ratio/max": 1.829142689704895, "sampling/importance_sampling_ratio/mean": 0.9690600633621216, "sampling/importance_sampling_ratio/min": 0.6791275143623352, "sampling/sampling_logp_difference/max": 0.32487010955810547, "sampling/sampling_logp_difference/mean": 0.019666891545057297, "step": 15, "step_time": 34.85173578099875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.4683184288442135, "epoch": 0.00032, "grad_norm": 1.9144654273986816, "kl": 0.0020070531027158722, "learning_rate": 4.2857142857142855e-06, "loss": -0.0245, "step": 16, "step_time": 6.776438935000442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1802.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 1555.25, "completions/mean_terminated_length": 1555.25, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "entropy": 0.45006250962615013, "epoch": 0.00034, "frac_reward_zero_std": 0.0, "grad_norm": 1.7223249673843384, "kl": 0.0019661694386741146, "learning_rate": 4.571428571428572e-06, "loss": -0.0264, "num_tokens": 647530.0, "reward": -6.622595310211182, "reward_std": 12.143867492675781, "rewards/rollout_reward_func/mean": -6.622595310211182, "rewards/rollout_reward_func/std": 26.77586555480957, "sampling/importance_sampling_ratio/max": 1.2898485660552979, "sampling/importance_sampling_ratio/mean": 1.0187060832977295, "sampling/importance_sampling_ratio/min": 0.7567406892776489, "sampling/sampling_logp_difference/max": 0.2456502914428711, "sampling/sampling_logp_difference/mean": 0.017298312857747078, "step": 17, "step_time": 31.772850325000036 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.4529193826019764, "epoch": 0.00036, "grad_norm": 1.9187003374099731, "kl": 0.002475597968441434, "learning_rate": 4.857142857142858e-06, "loss": -0.0276, "step": 18, "step_time": 5.816502887000752 }, { "clip_ratio/high_max": 0.00728462846018374, "clip_ratio/high_mean": 0.00364231423009187, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00364231423009187, "completions/clipped_ratio": 0.0, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 1580.625, "completions/mean_terminated_length": 1580.625, "completions/min_length": 1142.0, "completions/min_terminated_length": 1142.0, "entropy": 0.5051322989165783, "epoch": 0.00038, "frac_reward_zero_std": 0.0, "grad_norm": 1.7061576843261719, "kl": 0.0045548786874860525, "learning_rate": 5.142857142857142e-06, "loss": -0.0058, "num_tokens": 718703.0, "reward": -9.462235450744629, "reward_std": 12.071723937988281, "rewards/rollout_reward_func/mean": -9.462235450744629, "rewards/rollout_reward_func/std": 19.616609573364258, "sampling/importance_sampling_ratio/max": 1.4264099597930908, "sampling/importance_sampling_ratio/mean": 0.9698678255081177, "sampling/importance_sampling_ratio/min": 0.5692899823188782, "sampling/sampling_logp_difference/max": 0.41555118560791016, "sampling/sampling_logp_difference/mean": 0.025009114295244217, "step": 19, "step_time": 33.3764044440004 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0016891892300918698, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00364231423009187, "entropy": 0.5052920691668987, "epoch": 0.0004, "grad_norm": 1.6149944067001343, "kl": 0.00593935526558198, "learning_rate": 5.428571428571429e-06, "loss": -0.009, "step": 20, "step_time": 5.7515413030014315 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.007694128900766373, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011600378900766373, "completions/clipped_ratio": 0.03125, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 1632.125, "completions/mean_terminated_length": 1655.54833984375, "completions/min_length": 906.0, "completions/min_terminated_length": 1245.0, "entropy": 0.4712696149945259, "epoch": 0.00042, "frac_reward_zero_std": 0.0, "grad_norm": 1.766438364982605, "kl": 0.007341491058468819, "learning_rate": 5.7142857142857145e-06, "loss": -0.0201, "num_tokens": 791658.0, "reward": -9.433378219604492, "reward_std": 12.105504989624023, "rewards/rollout_reward_func/mean": -9.433378219604492, "rewards/rollout_reward_func/std": 15.258283615112305, "sampling/importance_sampling_ratio/max": 1.574469804763794, "sampling/importance_sampling_ratio/mean": 0.9757359027862549, "sampling/importance_sampling_ratio/min": 0.6087821125984192, "sampling/sampling_logp_difference/max": 0.3755350112915039, "sampling/sampling_logp_difference/mean": 0.028891967609524727, "step": 21, "step_time": 33.09052270600023 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.46676792576909065, "epoch": 0.00044, "grad_norm": 1.957079529762268, "kl": 0.01086019104695879, "learning_rate": 6e-06, "loss": -0.0213, "step": 22, "step_time": 6.599295061999328 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005800189450383186, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005800189450383186, "completions/clipped_ratio": 0.0, "completions/max_length": 1817.0, "completions/max_terminated_length": 1817.0, "completions/mean_length": 1548.6875, "completions/mean_terminated_length": 1548.6875, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "entropy": 0.449709665030241, "epoch": 0.00046, "frac_reward_zero_std": 0.0, "grad_norm": 1.5633138418197632, "kl": 0.018061322276480496, "learning_rate": 6.285714285714286e-06, "loss": -0.0678, "num_tokens": 861907.0, "reward": -14.31330394744873, "reward_std": 9.498095512390137, "rewards/rollout_reward_func/mean": -14.31330394744873, "rewards/rollout_reward_func/std": 13.741255760192871, "sampling/importance_sampling_ratio/max": 1.6751961708068848, "sampling/importance_sampling_ratio/mean": 0.8833787441253662, "sampling/importance_sampling_ratio/min": 0.37046000361442566, "sampling/sampling_logp_difference/max": 0.5361905097961426, "sampling/sampling_logp_difference/mean": 0.039727941155433655, "step": 23, "step_time": 33.344133182000405 }, { "clip_ratio/high_max": 0.019294507801532745, "clip_ratio/high_mean": 0.009647253900766373, "clip_ratio/low_mean": 0.013395675574429333, "clip_ratio/low_min": 0.0037878789007663727, "clip_ratio/region_mean": 0.023042929475195706, "entropy": 0.4419500008225441, "epoch": 0.00048, "grad_norm": 1.234191656112671, "kl": 0.03053808701224625, "learning_rate": 6.571428571428572e-06, "loss": -0.0702, "step": 24, "step_time": 5.787397329001578 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 1765.0, "completions/max_terminated_length": 1765.0, "completions/mean_length": 1629.6875, "completions/mean_terminated_length": 1629.6875, "completions/min_length": 1226.0, "completions/min_terminated_length": 1226.0, "entropy": 0.42522644996643066, "epoch": 0.0005, "frac_reward_zero_std": 0.0, "grad_norm": 2.869774103164673, "kl": 0.04610300064086914, "learning_rate": 6.857142857142858e-06, "loss": -0.0692, "num_tokens": 934819.0, "reward": -13.978726387023926, "reward_std": 13.509387969970703, "rewards/rollout_reward_func/mean": -13.978726387023926, "rewards/rollout_reward_func/std": 15.972912788391113, "sampling/importance_sampling_ratio/max": 2.516436815261841, "sampling/importance_sampling_ratio/mean": 1.0305383205413818, "sampling/importance_sampling_ratio/min": 0.15315498411655426, "sampling/sampling_logp_difference/max": 0.7299051284790039, "sampling/sampling_logp_difference/mean": 0.05616045743227005, "step": 25, "step_time": 34.39816641099969 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.019055451033636928, "clip_ratio/low_min": 0.0035714285913854837, "clip_ratio/region_mean": 0.021008576033636928, "entropy": 0.41626644134521484, "epoch": 0.00052, "grad_norm": 2.5028293132781982, "kl": 0.07302278326824307, "learning_rate": 7.1428571428571436e-06, "loss": -0.0701, "step": 26, "step_time": 5.674365414000931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 1672.75, "completions/mean_terminated_length": 1672.75, "completions/min_length": 1139.0, "completions/min_terminated_length": 1139.0, "entropy": 0.45271916687488556, "epoch": 0.00054, "frac_reward_zero_std": 0.0, "grad_norm": 2.5938966274261475, "kl": 0.08275566063821316, "learning_rate": 7.428571428571429e-06, "loss": -0.1045, "num_tokens": 1009027.0, "reward": -8.39571762084961, "reward_std": 8.244482040405273, "rewards/rollout_reward_func/mean": -8.39571762084961, "rewards/rollout_reward_func/std": 10.250362396240234, "sampling/importance_sampling_ratio/max": 2.743147850036621, "sampling/importance_sampling_ratio/mean": 1.0750949382781982, "sampling/importance_sampling_ratio/min": 0.17412874102592468, "sampling/sampling_logp_difference/max": 1.0401973724365234, "sampling/sampling_logp_difference/mean": 0.06947841495275497, "step": 27, "step_time": 37.13791847399898 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.009706439450383186, "clip_ratio/low_min": 0.0037878789007663727, "clip_ratio/region_mean": 0.015565814450383186, "entropy": 0.4357459023594856, "epoch": 0.00056, "grad_norm": 2.490011692047119, "kl": 0.1187180420383811, "learning_rate": 7.714285714285716e-06, "loss": -0.1102, "step": 28, "step_time": 5.809195054999691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1781.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 1632.46875, "completions/mean_terminated_length": 1632.46875, "completions/min_length": 1152.0, "completions/min_terminated_length": 1152.0, "entropy": 0.4116561934351921, "epoch": 0.00058, "frac_reward_zero_std": 0.0, "grad_norm": 1.761763572692871, "kl": 0.1548005947843194, "learning_rate": 8.000000000000001e-06, "loss": -0.2984, "num_tokens": 1081858.0, "reward": -3.9409875869750977, "reward_std": 13.231146812438965, "rewards/rollout_reward_func/mean": -3.9409875869750977, "rewards/rollout_reward_func/std": 15.818596839904785, "sampling/importance_sampling_ratio/max": 2.6883790493011475, "sampling/importance_sampling_ratio/mean": 0.9734947681427002, "sampling/importance_sampling_ratio/min": 0.0414450503885746, "sampling/sampling_logp_difference/max": 1.37030029296875, "sampling/sampling_logp_difference/mean": 0.07883325219154358, "step": 29, "step_time": 35.647616072001256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.021484375, "clip_ratio/low_min": 0.0078125, "clip_ratio/region_mean": 0.021484375, "entropy": 0.39363332837820053, "epoch": 0.0006, "grad_norm": 1.4616507291793823, "kl": 0.23487216513603926, "learning_rate": 8.285714285714287e-06, "loss": -0.3039, "step": 30, "step_time": 5.713043954000568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1727.0, "completions/max_terminated_length": 1727.0, "completions/mean_length": 1557.90625, "completions/mean_terminated_length": 1557.90625, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "entropy": 0.35244373232126236, "epoch": 0.00062, "frac_reward_zero_std": 0.0, "grad_norm": 1.890699863433838, "kl": 0.21074518747627735, "learning_rate": 8.571428571428571e-06, "loss": -0.29, "num_tokens": 1152094.0, "reward": -6.871125221252441, "reward_std": 8.818297386169434, "rewards/rollout_reward_func/mean": -6.871125221252441, "rewards/rollout_reward_func/std": 15.35950756072998, "sampling/importance_sampling_ratio/max": 2.7834908962249756, "sampling/importance_sampling_ratio/mean": 1.0891664028167725, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.534125804901123, "sampling/sampling_logp_difference/mean": 0.08870169520378113, "step": 31, "step_time": 32.18954384399967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.023319128900766373, "clip_ratio/low_min": 0.011482007801532745, "clip_ratio/region_mean": 0.023319128900766373, "entropy": 0.332146555185318, "epoch": 0.00064, "grad_norm": 1.486317753791809, "kl": 0.3121166592463851, "learning_rate": 8.857142857142858e-06, "loss": -0.2913, "step": 32, "step_time": 5.56816236999839 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 1660.71875, "completions/mean_terminated_length": 1660.71875, "completions/min_length": 1569.0, "completions/min_terminated_length": 1569.0, "entropy": 0.30892339907586575, "epoch": 0.00066, "frac_reward_zero_std": 0.0, "grad_norm": 1.552164912223816, "kl": 0.49738539289683104, "learning_rate": 9.142857142857144e-06, "loss": 0.0404, "num_tokens": 1225849.0, "reward": -2.268610954284668, "reward_std": 7.639632225036621, "rewards/rollout_reward_func/mean": -2.268610954284668, "rewards/rollout_reward_func/std": 9.778660774230957, "sampling/importance_sampling_ratio/max": 2.200782299041748, "sampling/importance_sampling_ratio/mean": 1.0492533445358276, "sampling/importance_sampling_ratio/min": 0.0332188606262207, "sampling/sampling_logp_difference/max": 1.9084991216659546, "sampling/sampling_logp_difference/mean": 0.10417380183935165, "step": 33, "step_time": 38.71719070200106 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.013671875, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.017578125, "entropy": 0.2939576134085655, "epoch": 0.00068, "grad_norm": 1.684899091720581, "kl": 0.6880170339718461, "learning_rate": 9.42857142857143e-06, "loss": 0.0402, "step": 34, "step_time": 5.760281337999004 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 1822.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 1693.6875, "completions/mean_terminated_length": 1693.6875, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "entropy": 0.29154328256845474, "epoch": 0.0007, "frac_reward_zero_std": 0.0, "grad_norm": 1.4360551834106445, "kl": 0.4088546773418784, "learning_rate": 9.714285714285715e-06, "loss": -0.0455, "num_tokens": 1301276.0, "reward": 0.6782079935073853, "reward_std": 8.422740936279297, "rewards/rollout_reward_func/mean": 0.6782079935073853, "rewards/rollout_reward_func/std": 11.004219055175781, "sampling/importance_sampling_ratio/max": 2.5485658645629883, "sampling/importance_sampling_ratio/mean": 0.9325703382492065, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.17055344581604, "sampling/sampling_logp_difference/mean": 0.10468995571136475, "step": 35, "step_time": 35.92719602200032 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.2877119109034538, "epoch": 0.00072, "grad_norm": 1.655564308166504, "kl": 0.46621189545840025, "learning_rate": 1e-05, "loss": -0.047, "step": 36, "step_time": 5.835250215999622 }, { "clip_ratio/high_max": 0.009588068351149559, "clip_ratio/high_mean": 0.0047940341755747795, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0067471591755747795, "completions/clipped_ratio": 0.0, "completions/max_length": 1799.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 1550.90625, "completions/mean_terminated_length": 1550.90625, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.3196103312075138, "epoch": 0.00074, "frac_reward_zero_std": 0.0, "grad_norm": 2.215740919113159, "kl": 0.681450292468071, "learning_rate": 9.999999998148153e-06, "loss": -0.1484, "num_tokens": 1371674.0, "reward": -7.619454383850098, "reward_std": 9.219554901123047, "rewards/rollout_reward_func/mean": -7.619454383850098, "rewards/rollout_reward_func/std": 13.010725021362305, "sampling/importance_sampling_ratio/max": 2.8137710094451904, "sampling/importance_sampling_ratio/mean": 0.994865894317627, "sampling/importance_sampling_ratio/min": 0.05689575895667076, "sampling/sampling_logp_difference/max": 2.270341157913208, "sampling/sampling_logp_difference/mean": 0.10556286573410034, "step": 37, "step_time": 31.889447493997977 }, { "clip_ratio/high_max": 0.0234375, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.01065340917557478, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.02237215917557478, "entropy": 0.32365087792277336, "epoch": 0.00076, "grad_norm": 1.419662594795227, "kl": 0.6783300125971437, "learning_rate": 9.999999992592613e-06, "loss": -0.1485, "step": 38, "step_time": 5.828878851998525 }, { "clip_ratio/high_max": 0.0036764706019312143, "clip_ratio/high_mean": 0.0018382353009656072, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018382353009656072, "completions/clipped_ratio": 0.0, "completions/max_length": 1706.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 1627.1875, "completions/mean_terminated_length": 1627.1875, "completions/min_length": 1383.0, "completions/min_terminated_length": 1383.0, "entropy": 0.27882106602191925, "epoch": 0.00078, "frac_reward_zero_std": 0.0, "grad_norm": 1.6235533952713013, "kl": 0.6874659867025912, "learning_rate": 9.999999983333379e-06, "loss": -0.185, "num_tokens": 1444324.0, "reward": -7.30010461807251, "reward_std": 5.225525856018066, "rewards/rollout_reward_func/mean": -7.30010461807251, "rewards/rollout_reward_func/std": 7.217170715332031, "sampling/importance_sampling_ratio/max": 1.8792587518692017, "sampling/importance_sampling_ratio/mean": 0.7841943502426147, "sampling/importance_sampling_ratio/min": 1.4742858626612398e-12, "sampling/sampling_logp_difference/max": 26.153600692749023, "sampling/sampling_logp_difference/mean": 0.14350782334804535, "step": 39, "step_time": 37.57111164300022 }, { "clip_ratio/high_max": 0.015395220601931214, "clip_ratio/high_mean": 0.007697610300965607, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011603860300965607, "entropy": 0.28622257336974144, "epoch": 0.0008, "grad_norm": 1.2302234172821045, "kl": 0.5287733990699053, "learning_rate": 9.999999970370451e-06, "loss": -0.1871, "step": 40, "step_time": 5.553051099000186 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 1600.71875, "completions/mean_terminated_length": 1600.71875, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "entropy": 0.29325923696160316, "epoch": 0.00082, "frac_reward_zero_std": 0.0, "grad_norm": 1.7616039514541626, "kl": 1.0703641921281815, "learning_rate": 9.99999995370383e-06, "loss": -0.0902, "num_tokens": 1516459.0, "reward": -2.2486090660095215, "reward_std": 6.6893110275268555, "rewards/rollout_reward_func/mean": -2.2486090660095215, "rewards/rollout_reward_func/std": 8.419123649597168, "sampling/importance_sampling_ratio/max": 1.7888535261154175, "sampling/importance_sampling_ratio/mean": 0.7111120223999023, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 24.990407943725586, "sampling/sampling_logp_difference/mean": 0.1629885733127594, "step": 41, "step_time": 36.283448277998104 }, { "clip_ratio/high_max": 0.02734375, "clip_ratio/high_mean": 0.013671875, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.3051527179777622, "epoch": 0.00084, "grad_norm": 1.2423319816589355, "kl": 0.7511667739599943, "learning_rate": 9.999999933333514e-06, "loss": -0.0931, "step": 42, "step_time": 5.804295024000567 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 1654.71875, "completions/mean_terminated_length": 1654.71875, "completions/min_length": 1497.0, "completions/min_terminated_length": 1497.0, "entropy": 0.30402176454663277, "epoch": 0.00086, "frac_reward_zero_std": 0.0, "grad_norm": 1.1876243352890015, "kl": 0.3072218671441078, "learning_rate": 9.999999909259504e-06, "loss": -0.2329, "num_tokens": 1590037.0, "reward": -7.089628219604492, "reward_std": 6.80203914642334, "rewards/rollout_reward_func/mean": -7.089628219604492, "rewards/rollout_reward_func/std": 8.887528419494629, "sampling/importance_sampling_ratio/max": 2.1073219776153564, "sampling/importance_sampling_ratio/mean": 0.829495906829834, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.700775384902954, "sampling/sampling_logp_difference/mean": 0.09777739644050598, "step": 43, "step_time": 36.3438146339995 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.31153809279203415, "epoch": 0.00088, "grad_norm": 1.2713712453842163, "kl": 0.26692129112780094, "learning_rate": 9.9999998814818e-06, "loss": -0.2308, "step": 44, "step_time": 6.615750631999617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0018939394503831863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018939394503831863, "completions/clipped_ratio": 0.0, "completions/max_length": 1815.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 1707.53125, "completions/mean_terminated_length": 1707.53125, "completions/min_length": 1563.0, "completions/min_terminated_length": 1563.0, "entropy": 0.3288617916405201, "epoch": 0.0009, "frac_reward_zero_std": 0.0, "grad_norm": 1.71139395236969, "kl": 0.36906831432133913, "learning_rate": 9.999999850000403e-06, "loss": -0.3049, "num_tokens": 1664951.0, "reward": -7.296619415283203, "reward_std": 7.16457462310791, "rewards/rollout_reward_func/mean": -7.296619415283203, "rewards/rollout_reward_func/std": 10.008492469787598, "sampling/importance_sampling_ratio/max": 2.4993622303009033, "sampling/importance_sampling_ratio/mean": 0.9107609987258911, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4973220825195312, "sampling/sampling_logp_difference/mean": 0.08788459748029709, "step": 45, "step_time": 37.92444931400041 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0018939394503831863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007753314450383186, "entropy": 0.33048854768276215, "epoch": 0.00092, "grad_norm": 1.5902395248413086, "kl": 0.3576007531955838, "learning_rate": 9.999999814815314e-06, "loss": -0.3085, "step": 46, "step_time": 5.829015459999937 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 1676.21875, "completions/mean_terminated_length": 1676.21875, "completions/min_length": 1445.0, "completions/min_terminated_length": 1445.0, "entropy": 0.3326357714831829, "epoch": 0.00094, "frac_reward_zero_std": 0.0, "grad_norm": 1.6773115396499634, "kl": 0.5565784685313702, "learning_rate": 9.99999977592653e-06, "loss": -0.007, "num_tokens": 1739609.0, "reward": 0.43848538398742676, "reward_std": 6.706617832183838, "rewards/rollout_reward_func/mean": 0.43848538398742676, "rewards/rollout_reward_func/std": 11.718527793884277, "sampling/importance_sampling_ratio/max": 2.1868457794189453, "sampling/importance_sampling_ratio/mean": 0.8691270351409912, "sampling/importance_sampling_ratio/min": 0.01293564960360527, "sampling/sampling_logp_difference/max": 1.9674878120422363, "sampling/sampling_logp_difference/mean": 0.11179050803184509, "step": 47, "step_time": 38.77335685399976 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.32995380833745, "epoch": 0.00096, "grad_norm": 1.376336932182312, "kl": 0.6505217012017965, "learning_rate": 9.999999733334051e-06, "loss": -0.0105, "step": 48, "step_time": 5.757157200998336 }, { "clip_ratio/high_max": 0.006623641354963183, "clip_ratio/high_mean": 0.0033118206774815917, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005264945677481592, "completions/clipped_ratio": 0.0, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 1583.28125, "completions/mean_terminated_length": 1583.28125, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "entropy": 0.32827378809452057, "epoch": 0.00098, "frac_reward_zero_std": 0.0, "grad_norm": 1.5635427236557007, "kl": 0.2907373011112213, "learning_rate": 9.99999968703788e-06, "loss": -0.4158, "num_tokens": 1810562.0, "reward": -6.829834938049316, "reward_std": 7.807450294494629, "rewards/rollout_reward_func/mean": -6.829834938049316, "rewards/rollout_reward_func/std": 9.211454391479492, "sampling/importance_sampling_ratio/max": 2.658860683441162, "sampling/importance_sampling_ratio/mean": 1.1884284019470215, "sampling/importance_sampling_ratio/min": 8.98076324150387e-20, "sampling/sampling_logp_difference/max": 23.28862953186035, "sampling/sampling_logp_difference/mean": 0.18271556496620178, "step": 49, "step_time": 34.62787952399958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.009765625, "entropy": 0.3216196559369564, "epoch": 0.001, "grad_norm": 1.6985197067260742, "kl": 0.3250427544116974, "learning_rate": 9.999999637038016e-06, "loss": -0.4232, "step": 50, "step_time": 6.581373721998716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1817.0, "completions/max_terminated_length": 1817.0, "completions/mean_length": 1713.4375, "completions/mean_terminated_length": 1713.4375, "completions/min_length": 1563.0, "completions/min_terminated_length": 1563.0, "entropy": 0.3401281237602234, "epoch": 0.00102, "frac_reward_zero_std": 0.0, "grad_norm": 1.996994972229004, "kl": 0.4171795817092061, "learning_rate": 9.999999583334458e-06, "loss": -0.1942, "num_tokens": 1885989.0, "reward": -5.845695495605469, "reward_std": 7.324997901916504, "rewards/rollout_reward_func/mean": -5.845695495605469, "rewards/rollout_reward_func/std": 8.569113731384277, "sampling/importance_sampling_ratio/max": 2.724292039871216, "sampling/importance_sampling_ratio/mean": 0.9547368884086609, "sampling/importance_sampling_ratio/min": 0.08236531913280487, "sampling/sampling_logp_difference/max": 1.727935791015625, "sampling/sampling_logp_difference/mean": 0.10494339466094971, "step": 51, "step_time": 35.600800350999634 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.334337517619133, "epoch": 0.00104, "grad_norm": 1.823840618133545, "kl": 0.4406869113445282, "learning_rate": 9.999999525927207e-06, "loss": -0.199, "step": 52, "step_time": 5.828666499000974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1768.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 1652.09375, "completions/mean_terminated_length": 1652.09375, "completions/min_length": 1469.0, "completions/min_terminated_length": 1469.0, "entropy": 0.30093443021178246, "epoch": 0.00106, "frac_reward_zero_std": 0.0, "grad_norm": 1.3367406129837036, "kl": 0.2392230462282896, "learning_rate": 9.999999464816262e-06, "loss": 0.0859, "num_tokens": 1959377.0, "reward": -2.500540256500244, "reward_std": 6.36287260055542, "rewards/rollout_reward_func/mean": -2.500540256500244, "rewards/rollout_reward_func/std": 8.245169639587402, "sampling/importance_sampling_ratio/max": 2.5041768550872803, "sampling/importance_sampling_ratio/mean": 0.9154846668243408, "sampling/importance_sampling_ratio/min": 3.6384679991119384e-12, "sampling/sampling_logp_difference/max": 26.868139266967773, "sampling/sampling_logp_difference/mean": 0.12778040766716003, "step": 53, "step_time": 37.75833414000044 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.30127133056521416, "epoch": 0.00108, "grad_norm": 1.2612088918685913, "kl": 0.2430503461509943, "learning_rate": 9.999999400001624e-06, "loss": 0.0846, "step": 54, "step_time": 5.662651007998647 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 1671.09375, "completions/mean_terminated_length": 1671.09375, "completions/min_length": 1438.0, "completions/min_terminated_length": 1438.0, "entropy": 0.30898030288517475, "epoch": 0.0011, "frac_reward_zero_std": 0.0, "grad_norm": 1.7188196182250977, "kl": 0.3995477119460702, "learning_rate": 9.999999331483293e-06, "loss": -0.1787, "num_tokens": 2033283.0, "reward": -1.8547379970550537, "reward_std": 12.750988960266113, "rewards/rollout_reward_func/mean": -1.8547379970550537, "rewards/rollout_reward_func/std": 15.986498832702637, "sampling/importance_sampling_ratio/max": 2.544159173965454, "sampling/importance_sampling_ratio/mean": 0.9402295351028442, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.6443579196929932, "sampling/sampling_logp_difference/mean": 0.084171824157238, "step": 55, "step_time": 36.70284881099997 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.31088376231491566, "epoch": 0.00112, "grad_norm": 1.6016024351119995, "kl": 0.4059265488758683, "learning_rate": 9.999999259261269e-06, "loss": -0.1811, "step": 56, "step_time": 6.270625210000617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1824.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 1709.875, "completions/mean_terminated_length": 1709.875, "completions/min_length": 1545.0, "completions/min_terminated_length": 1545.0, "entropy": 0.29036473482847214, "epoch": 0.00114, "frac_reward_zero_std": 0.0, "grad_norm": 1.5101275444030762, "kl": 0.5090412199497223, "learning_rate": 9.999999183335551e-06, "loss": 0.0419, "num_tokens": 2108465.0, "reward": -3.56026291847229, "reward_std": 6.288623809814453, "rewards/rollout_reward_func/mean": -3.56026291847229, "rewards/rollout_reward_func/std": 8.626129150390625, "sampling/importance_sampling_ratio/max": 2.409752607345581, "sampling/importance_sampling_ratio/mean": 1.0026273727416992, "sampling/importance_sampling_ratio/min": 0.07137506455183029, "sampling/sampling_logp_difference/max": 1.469163417816162, "sampling/sampling_logp_difference/mean": 0.07790054380893707, "step": 57, "step_time": 37.90581317999931 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.2957710847258568, "epoch": 0.00116, "grad_norm": 1.3159449100494385, "kl": 0.4575840122997761, "learning_rate": 9.999999103706142e-06, "loss": 0.0392, "step": 58, "step_time": 5.813098757998887 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1777.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 1694.0, "completions/mean_terminated_length": 1694.0, "completions/min_length": 1574.0, "completions/min_terminated_length": 1574.0, "entropy": 0.31563786044716835, "epoch": 0.00118, "frac_reward_zero_std": 0.0, "grad_norm": 1.3663612604141235, "kl": 0.29621826112270355, "learning_rate": 9.999999020373038e-06, "loss": -0.1621, "num_tokens": 2183167.0, "reward": -2.3904330730438232, "reward_std": 9.909059524536133, "rewards/rollout_reward_func/mean": -2.3904330730438232, "rewards/rollout_reward_func/std": 13.05620002746582, "sampling/importance_sampling_ratio/max": 2.209512233734131, "sampling/importance_sampling_ratio/mean": 1.010425329208374, "sampling/importance_sampling_ratio/min": 0.09116992354393005, "sampling/sampling_logp_difference/max": 1.1912827491760254, "sampling/sampling_logp_difference/mean": 0.07018784433603287, "step": 59, "step_time": 35.51438805900034 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.3229809142649174, "epoch": 0.0012, "grad_norm": 1.4618602991104126, "kl": 0.2840815596282482, "learning_rate": 9.999998933336242e-06, "loss": -0.1622, "step": 60, "step_time": 5.714706689000195 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.003791360300965607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005744485300965607, "completions/clipped_ratio": 0.0, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 1640.28125, "completions/mean_terminated_length": 1640.28125, "completions/min_length": 1306.0, "completions/min_terminated_length": 1306.0, "entropy": 0.3373373970389366, "epoch": 0.00122, "frac_reward_zero_std": 0.0, "grad_norm": 1.2567992210388184, "kl": 0.4037298448383808, "learning_rate": 9.999998842595754e-06, "loss": -0.2237, "num_tokens": 2256023.0, "reward": -5.301498889923096, "reward_std": 9.393596649169922, "rewards/rollout_reward_func/mean": -5.301498889923096, "rewards/rollout_reward_func/std": 11.857550621032715, "sampling/importance_sampling_ratio/max": 1.5339993238449097, "sampling/importance_sampling_ratio/mean": 0.7339890599250793, "sampling/importance_sampling_ratio/min": 4.623656180147151e-10, "sampling/sampling_logp_difference/max": 21.543071746826172, "sampling/sampling_logp_difference/mean": 0.1303299069404602, "step": 61, "step_time": 37.366424696999275 }, { "clip_ratio/high_max": 0.02748579578474164, "clip_ratio/high_mean": 0.01374289789237082, "clip_ratio/low_mean": 0.003791360300965607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017534258076921105, "entropy": 0.3488129526376724, "epoch": 0.00124, "grad_norm": 1.0096025466918945, "kl": 0.36894314270466566, "learning_rate": 9.999998748151573e-06, "loss": -0.2269, "step": 62, "step_time": 5.755052162999164 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 1623.4375, "completions/mean_terminated_length": 1623.4375, "completions/min_length": 1016.0, "completions/min_terminated_length": 1016.0, "entropy": 0.33689238503575325, "epoch": 0.00126, "frac_reward_zero_std": 0.0, "grad_norm": 1.6014989614486694, "kl": 0.25019469298422337, "learning_rate": 9.999998650003697e-06, "loss": -0.2651, "num_tokens": 2328798.0, "reward": -1.0410969257354736, "reward_std": 7.0697126388549805, "rewards/rollout_reward_func/mean": -1.0410969257354736, "rewards/rollout_reward_func/std": 8.429243087768555, "sampling/importance_sampling_ratio/max": 2.9675095081329346, "sampling/importance_sampling_ratio/mean": 1.0478936433792114, "sampling/importance_sampling_ratio/min": 0.13199584186077118, "sampling/sampling_logp_difference/max": 1.1867618560791016, "sampling/sampling_logp_difference/mean": 0.0737098827958107, "step": 63, "step_time": 34.2187673239996 }, { "clip_ratio/high_max": 0.0037878789007663727, "clip_ratio/high_mean": 0.0018939394503831863, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038470644503831863, "entropy": 0.33753813430666924, "epoch": 0.00128, "grad_norm": 1.4934430122375488, "kl": 0.25292993150651455, "learning_rate": 9.999998548152132e-06, "loss": -0.269, "step": 64, "step_time": 5.7678524439998 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1667.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 1601.15625, "completions/mean_terminated_length": 1601.15625, "completions/min_length": 1455.0, "completions/min_terminated_length": 1455.0, "entropy": 0.32645104452967644, "epoch": 0.0013, "frac_reward_zero_std": 0.0, "grad_norm": 1.3427740335464478, "kl": 0.5479664504528046, "learning_rate": 9.999998442596872e-06, "loss": -0.0545, "num_tokens": 2400705.0, "reward": -2.5378642082214355, "reward_std": 5.739175319671631, "rewards/rollout_reward_func/mean": -2.5378642082214355, "rewards/rollout_reward_func/std": 7.828306198120117, "sampling/importance_sampling_ratio/max": 2.44716215133667, "sampling/importance_sampling_ratio/mean": 0.7654808163642883, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2951812744140625, "sampling/sampling_logp_difference/mean": 0.11022517830133438, "step": 65, "step_time": 37.02498609800023 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.015625, "entropy": 0.3282633610069752, "epoch": 0.00132, "grad_norm": 1.3897420167922974, "kl": 0.5626763068139553, "learning_rate": 9.999998333337923e-06, "loss": -0.0562, "step": 66, "step_time": 5.503358288000527 }, { "clip_ratio/high_max": 0.027107007801532745, "clip_ratio/high_mean": 0.013553503900766373, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017459753900766373, "completions/clipped_ratio": 0.0, "completions/max_length": 1817.0, "completions/max_terminated_length": 1817.0, "completions/mean_length": 1672.75, "completions/mean_terminated_length": 1672.75, "completions/min_length": 1138.0, "completions/min_terminated_length": 1138.0, "entropy": 0.37730714678764343, "epoch": 0.00134, "frac_reward_zero_std": 0.0, "grad_norm": 2.0150251388549805, "kl": 0.2086246032267809, "learning_rate": 9.99999822037528e-06, "loss": -0.2037, "num_tokens": 2474830.0, "reward": -2.223475933074951, "reward_std": 10.239798545837402, "rewards/rollout_reward_func/mean": -2.223475933074951, "rewards/rollout_reward_func/std": 10.8533353805542, "sampling/importance_sampling_ratio/max": 2.6771914958953857, "sampling/importance_sampling_ratio/mean": 0.894666850566864, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1186803579330444, "sampling/sampling_logp_difference/mean": 0.0862615779042244, "step": 67, "step_time": 36.272244204000344 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.007753314450383186, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009706439450383186, "entropy": 0.3758174180984497, "epoch": 0.00136, "grad_norm": 1.9776581525802612, "kl": 0.22687916830182076, "learning_rate": 9.999998103708944e-06, "loss": -0.2033, "step": 68, "step_time": 5.841565231000459 }, { "clip_ratio/high_max": 0.007694128900766373, "clip_ratio/high_mean": 0.0038470644503831863, "clip_ratio/low_mean": 0.006138392956927419, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009985457174479961, "completions/clipped_ratio": 0.0, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 1643.09375, "completions/mean_terminated_length": 1643.09375, "completions/min_length": 983.0, "completions/min_terminated_length": 983.0, "entropy": 0.36833325773477554, "epoch": 0.00138, "frac_reward_zero_std": 0.0, "grad_norm": 1.960590124130249, "kl": 0.34138039220124483, "learning_rate": 9.999997983338918e-06, "loss": -0.0379, "num_tokens": 2548037.0, "reward": -7.881344318389893, "reward_std": 6.760621547698975, "rewards/rollout_reward_func/mean": -7.881344318389893, "rewards/rollout_reward_func/std": 10.78054428100586, "sampling/importance_sampling_ratio/max": 2.7962646484375, "sampling/importance_sampling_ratio/mean": 0.860181450843811, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.209362506866455, "sampling/sampling_logp_difference/mean": 0.09240779280662537, "step": 69, "step_time": 36.16751210700022 }, { "clip_ratio/high_max": 0.007694128900766373, "clip_ratio/high_mean": 0.0038470644503831863, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007753314450383186, "entropy": 0.36754436418414116, "epoch": 0.0014, "grad_norm": 2.076843500137329, "kl": 0.3671616306528449, "learning_rate": 9.999997859265198e-06, "loss": -0.0415, "step": 70, "step_time": 5.7710249699994165 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 1649.71875, "completions/mean_terminated_length": 1649.71875, "completions/min_length": 1263.0, "completions/min_terminated_length": 1263.0, "entropy": 0.3730818182229996, "epoch": 0.00142, "frac_reward_zero_std": 0.0, "grad_norm": 1.2103312015533447, "kl": 0.3976009897887707, "learning_rate": 9.999997731487788e-06, "loss": -0.1988, "num_tokens": 2621492.0, "reward": 1.924525260925293, "reward_std": 9.762751579284668, "rewards/rollout_reward_func/mean": 1.924525260925293, "rewards/rollout_reward_func/std": 11.85600757598877, "sampling/importance_sampling_ratio/max": 2.882676362991333, "sampling/importance_sampling_ratio/mean": 0.8822349905967712, "sampling/importance_sampling_ratio/min": 0.08272430300712585, "sampling/sampling_logp_difference/max": 1.3770942687988281, "sampling/sampling_logp_difference/mean": 0.09666060656309128, "step": 71, "step_time": 35.46656954899936 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.3668806441128254, "epoch": 0.00144, "grad_norm": 1.325650930404663, "kl": 0.4230933412909508, "learning_rate": 9.999997600006685e-06, "loss": -0.2015, "step": 72, "step_time": 6.277837039999213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 1667.5625, "completions/mean_terminated_length": 1667.5625, "completions/min_length": 1473.0, "completions/min_terminated_length": 1473.0, "entropy": 0.3248750977218151, "epoch": 0.00146, "frac_reward_zero_std": 0.0, "grad_norm": 1.4092869758605957, "kl": 0.33990394696593285, "learning_rate": 9.999997464821892e-06, "loss": -0.0273, "num_tokens": 2695474.0, "reward": -1.5238330364227295, "reward_std": 6.099469184875488, "rewards/rollout_reward_func/mean": -1.5238330364227295, "rewards/rollout_reward_func/std": 7.693445682525635, "sampling/importance_sampling_ratio/max": 2.1886417865753174, "sampling/importance_sampling_ratio/mean": 0.9180707931518555, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.373499870300293, "sampling/sampling_logp_difference/mean": 0.07824774086475372, "step": 73, "step_time": 35.77179241700014 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.3200640454888344, "epoch": 0.00148, "grad_norm": 1.2980810403823853, "kl": 0.37775165028870106, "learning_rate": 9.999997325933409e-06, "loss": -0.0305, "step": 74, "step_time": 5.783254465000027 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1758.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 1648.6875, "completions/mean_terminated_length": 1648.6875, "completions/min_length": 1500.0, "completions/min_terminated_length": 1500.0, "entropy": 0.3307141959667206, "epoch": 0.0015, "frac_reward_zero_std": 0.0, "grad_norm": 1.529542088508606, "kl": 0.3330858051776886, "learning_rate": 9.999997183341233e-06, "loss": -0.1579, "num_tokens": 2768769.0, "reward": -1.9445281028747559, "reward_std": 6.960838317871094, "rewards/rollout_reward_func/mean": -1.9445281028747559, "rewards/rollout_reward_func/std": 8.304977416992188, "sampling/importance_sampling_ratio/max": 2.4997639656066895, "sampling/importance_sampling_ratio/mean": 0.8088055849075317, "sampling/importance_sampling_ratio/min": 0.09736470133066177, "sampling/sampling_logp_difference/max": 1.7568840980529785, "sampling/sampling_logp_difference/mean": 0.09207496047019958, "step": 75, "step_time": 34.512230323999574 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.005800189450383186, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007753314450383186, "entropy": 0.322955708950758, "epoch": 0.00152, "grad_norm": 1.5048940181732178, "kl": 0.37345754727721214, "learning_rate": 9.999997037045365e-06, "loss": -0.1606, "step": 76, "step_time": 5.6651066240001455 }, { "clip_ratio/high_max": 0.011600378900766373, "clip_ratio/high_mean": 0.005800189450383186, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007753314450383186, "completions/clipped_ratio": 0.0, "completions/max_length": 1744.0, "completions/max_terminated_length": 1744.0, "completions/mean_length": 1672.03125, "completions/mean_terminated_length": 1672.03125, "completions/min_length": 1536.0, "completions/min_terminated_length": 1536.0, "entropy": 0.3117912784218788, "epoch": 0.00154, "frac_reward_zero_std": 0.0, "grad_norm": 1.2413570880889893, "kl": 0.7537785340100527, "learning_rate": 9.999996887045808e-06, "loss": -0.0368, "num_tokens": 2842874.0, "reward": -1.6352441310882568, "reward_std": 6.7997283935546875, "rewards/rollout_reward_func/mean": -1.6352441310882568, "rewards/rollout_reward_func/std": 9.460980415344238, "sampling/importance_sampling_ratio/max": 2.215763568878174, "sampling/importance_sampling_ratio/mean": 0.7862486839294434, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.4788150787353516, "sampling/sampling_logp_difference/mean": 0.10331455618143082, "step": 77, "step_time": 37.13179366900022 }, { "clip_ratio/high_max": 0.011600378900766373, "clip_ratio/high_mean": 0.005800189450383186, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.013612689450383186, "entropy": 0.30787648260593414, "epoch": 0.00156, "grad_norm": 1.3049235343933105, "kl": 0.7575539667159319, "learning_rate": 9.99999673334256e-06, "loss": -0.0388, "step": 78, "step_time": 6.551664077999703 }, { "clip_ratio/high_max": 0.014436141354963183, "clip_ratio/high_mean": 0.007218070677481592, "clip_ratio/low_mean": 0.0034564394736662507, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010674510151147842, "completions/clipped_ratio": 0.03125, "completions/max_length": 1718.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 1622.15625, "completions/mean_terminated_length": 1619.806396484375, "completions/min_length": 1097.0, "completions/min_terminated_length": 1097.0, "entropy": 0.28395895659923553, "epoch": 0.00158, "frac_reward_zero_std": 0.0, "grad_norm": 1.774448275566101, "kl": 0.48293524608016014, "learning_rate": 9.99999657593562e-06, "loss": -0.108, "num_tokens": 2915438.0, "reward": -4.349352836608887, "reward_std": 5.503687381744385, "rewards/rollout_reward_func/mean": -4.349352836608887, "rewards/rollout_reward_func/std": 6.947903156280518, "sampling/importance_sampling_ratio/max": 2.6842427253723145, "sampling/importance_sampling_ratio/mean": 0.6991802453994751, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.6641254425048828, "sampling/sampling_logp_difference/mean": 0.10404618084430695, "step": 79, "step_time": 37.2892129060001 }, { "clip_ratio/high_max": 0.021467391401529312, "clip_ratio/high_mean": 0.010733695700764656, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012686820700764656, "entropy": 0.28391823545098305, "epoch": 0.0016, "grad_norm": 1.2243019342422485, "kl": 0.5016566403210163, "learning_rate": 9.99999641482499e-06, "loss": -0.1094, "step": 80, "step_time": 5.5879295219992855 }, { "clip_ratio/high_max": 0.008370535913854837, "clip_ratio/high_mean": 0.006138392724096775, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008091517724096775, "completions/clipped_ratio": 0.0, "completions/max_length": 1776.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 1655.34375, "completions/mean_terminated_length": 1655.34375, "completions/min_length": 1172.0, "completions/min_terminated_length": 1172.0, "entropy": 0.26277439296245575, "epoch": 0.00162, "frac_reward_zero_std": 0.0, "grad_norm": 1.137077808380127, "kl": 0.6223671287298203, "learning_rate": 9.999996250010671e-06, "loss": -0.1437, "num_tokens": 2989329.0, "reward": -2.7409677505493164, "reward_std": 7.541594505310059, "rewards/rollout_reward_func/mean": -2.7409677505493164, "rewards/rollout_reward_func/std": 10.378182411193848, "sampling/importance_sampling_ratio/max": 2.4534168243408203, "sampling/importance_sampling_ratio/mean": 0.8305980563163757, "sampling/importance_sampling_ratio/min": 0.05540309473872185, "sampling/sampling_logp_difference/max": 1.5272252559661865, "sampling/sampling_logp_difference/mean": 0.08948823064565659, "step": 81, "step_time": 34.51007757499883 }, { "clip_ratio/high_max": 0.012276785913854837, "clip_ratio/high_mean": 0.006138392956927419, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010044642956927419, "entropy": 0.2634422518312931, "epoch": 0.00164, "grad_norm": 1.2092783451080322, "kl": 0.5849091820418835, "learning_rate": 9.999996081492662e-06, "loss": -0.1446, "step": 82, "step_time": 5.717573774000812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1830.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 1691.46875, "completions/mean_terminated_length": 1691.46875, "completions/min_length": 1465.0, "completions/min_terminated_length": 1465.0, "entropy": 0.274138493463397, "epoch": 0.00166, "frac_reward_zero_std": 0.0, "grad_norm": 2.1113321781158447, "kl": 0.42387660406529903, "learning_rate": 9.999995909270962e-06, "loss": -0.2307, "num_tokens": 3063737.0, "reward": -5.677342414855957, "reward_std": 5.941880226135254, "rewards/rollout_reward_func/mean": -5.677342414855957, "rewards/rollout_reward_func/std": 10.720724105834961, "sampling/importance_sampling_ratio/max": 2.54227614402771, "sampling/importance_sampling_ratio/mean": 1.0267385244369507, "sampling/importance_sampling_ratio/min": 0.09123067557811737, "sampling/sampling_logp_difference/max": 1.443819522857666, "sampling/sampling_logp_difference/mean": 0.08112908899784088, "step": 83, "step_time": 35.11318050199952 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.013671875, "entropy": 0.2726801373064518, "epoch": 0.00168, "grad_norm": 1.3651243448257446, "kl": 0.3971481677144766, "learning_rate": 9.999995733345573e-06, "loss": -0.2362, "step": 84, "step_time": 6.308203391001371 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1837.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 1705.21875, "completions/mean_terminated_length": 1705.21875, "completions/min_length": 1491.0, "completions/min_terminated_length": 1491.0, "entropy": 0.297397093847394, "epoch": 0.0017, "frac_reward_zero_std": 0.0, "grad_norm": 1.3406461477279663, "kl": 0.37086668983101845, "learning_rate": 9.999995553716494e-06, "loss": -0.1103, "num_tokens": 3139446.0, "reward": -6.638528823852539, "reward_std": 7.859951972961426, "rewards/rollout_reward_func/mean": -6.638528823852539, "rewards/rollout_reward_func/std": 12.807344436645508, "sampling/importance_sampling_ratio/max": 2.7764086723327637, "sampling/importance_sampling_ratio/mean": 0.7711232900619507, "sampling/importance_sampling_ratio/min": 0.055727217346429825, "sampling/sampling_logp_difference/max": 1.3549007177352905, "sampling/sampling_logp_difference/mean": 0.09796138107776642, "step": 85, "step_time": 35.18805706900184 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.29359815642237663, "epoch": 0.00172, "grad_norm": 1.2749756574630737, "kl": 0.37067493610084057, "learning_rate": 9.999995370383725e-06, "loss": -0.1135, "step": 86, "step_time": 5.857377255998472 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1799.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 1642.34375, "completions/mean_terminated_length": 1642.34375, "completions/min_length": 1546.0, "completions/min_terminated_length": 1546.0, "entropy": 0.25620525516569614, "epoch": 0.00174, "frac_reward_zero_std": 0.0, "grad_norm": 1.2850360870361328, "kl": 0.5967638976871967, "learning_rate": 9.999995183347268e-06, "loss": -0.2511, "num_tokens": 3212569.0, "reward": -4.25941276550293, "reward_std": 4.8053388595581055, "rewards/rollout_reward_func/mean": -4.25941276550293, "rewards/rollout_reward_func/std": 7.727044105529785, "sampling/importance_sampling_ratio/max": 2.633681058883667, "sampling/importance_sampling_ratio/mean": 0.743991494178772, "sampling/importance_sampling_ratio/min": 0.027961313724517822, "sampling/sampling_logp_difference/max": 2.183626651763916, "sampling/sampling_logp_difference/mean": 0.09684586524963379, "step": 87, "step_time": 35.86174799499986 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2555703092366457, "epoch": 0.00176, "grad_norm": 1.0292936563491821, "kl": 0.5942294523119926, "learning_rate": 9.999994992607122e-06, "loss": -0.2543, "step": 88, "step_time": 5.7727871480010435 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 1587.25, "completions/mean_terminated_length": 1587.25, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "entropy": 0.2820359170436859, "epoch": 0.00178, "frac_reward_zero_std": 0.0, "grad_norm": 1.688633680343628, "kl": 0.7878309283405542, "learning_rate": 9.999994798163286e-06, "loss": -0.088, "num_tokens": 3283670.0, "reward": -1.0320963859558105, "reward_std": 7.258734703063965, "rewards/rollout_reward_func/mean": -1.0320963859558105, "rewards/rollout_reward_func/std": 11.793685913085938, "sampling/importance_sampling_ratio/max": 2.2084357738494873, "sampling/importance_sampling_ratio/mean": 0.8563207387924194, "sampling/importance_sampling_ratio/min": 0.050470076501369476, "sampling/sampling_logp_difference/max": 1.6832528114318848, "sampling/sampling_logp_difference/mean": 0.09054332971572876, "step": 89, "step_time": 35.323508203998244 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.28230165503919125, "epoch": 0.0018, "grad_norm": 1.442618727684021, "kl": 0.6910470742732286, "learning_rate": 9.999994600015764e-06, "loss": -0.0916, "step": 90, "step_time": 5.74529949799944 }, { "clip_ratio/high_max": 0.0032051282469183207, "clip_ratio/high_mean": 0.0016025641234591603, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035556891234591603, "completions/clipped_ratio": 0.0, "completions/max_length": 1806.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 1682.875, "completions/mean_terminated_length": 1682.875, "completions/min_length": 1197.0, "completions/min_terminated_length": 1197.0, "entropy": 0.31537965685129166, "epoch": 0.00182, "frac_reward_zero_std": 0.0, "grad_norm": 1.3677870035171509, "kl": 0.8561782389879227, "learning_rate": 9.99999439816455e-06, "loss": -0.0483, "num_tokens": 3358360.0, "reward": -6.243229866027832, "reward_std": 5.828756809234619, "rewards/rollout_reward_func/mean": -6.243229866027832, "rewards/rollout_reward_func/std": 7.688446998596191, "sampling/importance_sampling_ratio/max": 1.9665902853012085, "sampling/importance_sampling_ratio/mean": 0.5829079151153564, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.1947691440582275, "sampling/sampling_logp_difference/mean": 0.10844805836677551, "step": 91, "step_time": 36.43737021200013 }, { "clip_ratio/high_max": 0.01101762824691832, "clip_ratio/high_mean": 0.00550881412345916, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00746193912345916, "entropy": 0.31900631822645664, "epoch": 0.00184, "grad_norm": 0.9948733448982239, "kl": 0.6338084079325199, "learning_rate": 9.999994192609649e-06, "loss": -0.0534, "step": 92, "step_time": 5.8420921550005005 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 1658.375, "completions/mean_terminated_length": 1658.375, "completions/min_length": 1488.0, "completions/min_terminated_length": 1488.0, "entropy": 0.24276937916874886, "epoch": 0.00186, "frac_reward_zero_std": 0.0, "grad_norm": 0.994575023651123, "kl": 0.6424030996859074, "learning_rate": 9.99999398335106e-06, "loss": -0.1681, "num_tokens": 3432383.0, "reward": -2.1294469833374023, "reward_std": 8.107059478759766, "rewards/rollout_reward_func/mean": -2.1294469833374023, "rewards/rollout_reward_func/std": 9.736873626708984, "sampling/importance_sampling_ratio/max": 1.9103875160217285, "sampling/importance_sampling_ratio/mean": 0.5858784914016724, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.698678970336914, "sampling/sampling_logp_difference/mean": 0.10006298124790192, "step": 93, "step_time": 35.31451669100079 }, { "clip_ratio/high_max": 0.02734375, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017578125, "entropy": 0.24839669093489647, "epoch": 0.00188, "grad_norm": 0.7679377198219299, "kl": 0.5870474837720394, "learning_rate": 9.999993770388785e-06, "loss": -0.1693, "step": 94, "step_time": 5.76490683999873 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1810.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 1640.625, "completions/mean_terminated_length": 1640.625, "completions/min_length": 1128.0, "completions/min_terminated_length": 1128.0, "entropy": 0.2957016546279192, "epoch": 0.0019, "frac_reward_zero_std": 0.0, "grad_norm": 2.351029634475708, "kl": 0.21496129594743252, "learning_rate": 9.99999355372282e-06, "loss": -0.1061, "num_tokens": 3505361.0, "reward": -2.3848154544830322, "reward_std": 4.5673828125, "rewards/rollout_reward_func/mean": -2.3848154544830322, "rewards/rollout_reward_func/std": 7.392127513885498, "sampling/importance_sampling_ratio/max": 2.9048802852630615, "sampling/importance_sampling_ratio/mean": 1.1882892847061157, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2800261974334717, "sampling/sampling_logp_difference/mean": 0.07288186252117157, "step": 95, "step_time": 35.78977110999949 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.30211447179317474, "epoch": 0.00192, "grad_norm": 1.8887192010879517, "kl": 0.20806447509676218, "learning_rate": 9.999993333353169e-06, "loss": -0.1059, "step": 96, "step_time": 5.819023845999254 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 1695.625, "completions/mean_terminated_length": 1695.625, "completions/min_length": 1572.0, "completions/min_terminated_length": 1572.0, "entropy": 0.29871419444680214, "epoch": 0.00194, "frac_reward_zero_std": 0.0, "grad_norm": 1.2661700248718262, "kl": 0.31952402740716934, "learning_rate": 9.999993109279829e-06, "loss": -0.2492, "num_tokens": 3580589.0, "reward": -1.2452682256698608, "reward_std": 4.780298233032227, "rewards/rollout_reward_func/mean": -1.2452682256698608, "rewards/rollout_reward_func/std": 7.40675163269043, "sampling/importance_sampling_ratio/max": 2.997992515563965, "sampling/importance_sampling_ratio/mean": 0.8266191482543945, "sampling/importance_sampling_ratio/min": 0.1390458345413208, "sampling/sampling_logp_difference/max": 1.113917589187622, "sampling/sampling_logp_difference/mean": 0.07985492050647736, "step": 97, "step_time": 35.633980886000245 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.30503564700484276, "epoch": 0.00196, "grad_norm": 1.3084163665771484, "kl": 0.3136520925909281, "learning_rate": 9.999992881502803e-06, "loss": -0.2512, "step": 98, "step_time": 5.782925431000876 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1825.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 1708.65625, "completions/mean_terminated_length": 1708.65625, "completions/min_length": 1498.0, "completions/min_terminated_length": 1498.0, "entropy": 0.28120192512869835, "epoch": 0.00198, "frac_reward_zero_std": 0.125, "grad_norm": 1.1371867656707764, "kl": 0.30992733500897884, "learning_rate": 9.999992650022092e-06, "loss": -0.0307, "num_tokens": 3655808.0, "reward": -2.348900556564331, "reward_std": 5.844359397888184, "rewards/rollout_reward_func/mean": -2.348900556564331, "rewards/rollout_reward_func/std": 7.3201165199279785, "sampling/importance_sampling_ratio/max": 2.0892417430877686, "sampling/importance_sampling_ratio/mean": 0.8098887205123901, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.9662381410598755, "sampling/sampling_logp_difference/mean": 0.0835987776517868, "step": 99, "step_time": 38.374850201998925 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.28472086787223816, "epoch": 0.002, "grad_norm": 1.0553685426712036, "kl": 0.32899605855345726, "learning_rate": 9.999992414837692e-06, "loss": -0.0328, "step": 100, "step_time": 6.762840421000874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 1659.1875, "completions/mean_terminated_length": 1659.1875, "completions/min_length": 1242.0, "completions/min_terminated_length": 1242.0, "entropy": 0.3777337931096554, "epoch": 0.00202, "frac_reward_zero_std": 0.0, "grad_norm": 1.684167504310608, "kl": 0.24033249728381634, "learning_rate": 9.999992175949606e-06, "loss": -0.1423, "num_tokens": 3729226.0, "reward": -0.08680570125579834, "reward_std": 6.447270393371582, "rewards/rollout_reward_func/mean": -0.08680570125579834, "rewards/rollout_reward_func/std": 11.136384963989258, "sampling/importance_sampling_ratio/max": 2.5989115238189697, "sampling/importance_sampling_ratio/mean": 0.9449913501739502, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2526357173919678, "sampling/sampling_logp_difference/mean": 0.08848999440670013, "step": 101, "step_time": 35.97663474399906 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.0038470644503831863, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011659564450383186, "entropy": 0.3802182339131832, "epoch": 0.00204, "grad_norm": 1.5747473239898682, "kl": 0.22995636146515608, "learning_rate": 9.999991933357835e-06, "loss": -0.1454, "step": 102, "step_time": 5.79218666900033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1761.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 1652.96875, "completions/mean_terminated_length": 1652.96875, "completions/min_length": 1516.0, "completions/min_terminated_length": 1516.0, "entropy": 0.33094101771712303, "epoch": 0.00206, "frac_reward_zero_std": 0.0, "grad_norm": 1.6915507316589355, "kl": 0.277794674038887, "learning_rate": 9.999991687062379e-06, "loss": -0.2347, "num_tokens": 3802729.0, "reward": 1.5054678916931152, "reward_std": 6.697179317474365, "rewards/rollout_reward_func/mean": 1.5054678916931152, "rewards/rollout_reward_func/std": 12.069811820983887, "sampling/importance_sampling_ratio/max": 2.974453926086426, "sampling/importance_sampling_ratio/mean": 0.9312765598297119, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4296720027923584, "sampling/sampling_logp_difference/mean": 0.08249014616012573, "step": 103, "step_time": 36.59170679699946 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013671875, "entropy": 0.33079541847109795, "epoch": 0.00208, "grad_norm": 1.049177885055542, "kl": 0.2827172800898552, "learning_rate": 9.999991437063234e-06, "loss": -0.2385, "step": 104, "step_time": 5.693732602999262 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "completions/clipped_ratio": 0.0, "completions/max_length": 1820.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 1666.9375, "completions/mean_terminated_length": 1666.9375, "completions/min_length": 1540.0, "completions/min_terminated_length": 1540.0, "entropy": 0.3682529255747795, "epoch": 0.0021, "frac_reward_zero_std": 0.0, "grad_norm": 1.7118496894836426, "kl": 0.2884139958769083, "learning_rate": 9.999991183360406e-06, "loss": -0.1423, "num_tokens": 3876320.0, "reward": -0.8790185451507568, "reward_std": 5.496585845947266, "rewards/rollout_reward_func/mean": -0.8790185451507568, "rewards/rollout_reward_func/std": 7.660371780395508, "sampling/importance_sampling_ratio/max": 2.7382900714874268, "sampling/importance_sampling_ratio/mean": 1.0317010879516602, "sampling/importance_sampling_ratio/min": 0.19462421536445618, "sampling/sampling_logp_difference/max": 1.0869190692901611, "sampling/sampling_logp_difference/mean": 0.07802367210388184, "step": 105, "step_time": 36.844014158999016 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.36161230877041817, "epoch": 0.00212, "grad_norm": 1.3527196645736694, "kl": 0.32736348174512386, "learning_rate": 9.999990925953894e-06, "loss": -0.1463, "step": 106, "step_time": 6.756272589999753 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005800189450383186, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.011659564450383186, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 1727.4375, "completions/mean_terminated_length": 1727.4375, "completions/min_length": 1536.0, "completions/min_terminated_length": 1536.0, "entropy": 0.3097492754459381, "epoch": 0.00214, "frac_reward_zero_std": 0.0, "grad_norm": 1.5999138355255127, "kl": 0.5242087468504906, "learning_rate": 9.999990664843696e-06, "loss": -0.3461, "num_tokens": 3952089.0, "reward": -2.7279787063598633, "reward_std": 7.685024738311768, "rewards/rollout_reward_func/mean": -2.7279787063598633, "rewards/rollout_reward_func/std": 8.918709754943848, "sampling/importance_sampling_ratio/max": 2.823850631713867, "sampling/importance_sampling_ratio/mean": 1.0143675804138184, "sampling/importance_sampling_ratio/min": 0.10803008079528809, "sampling/sampling_logp_difference/max": 1.6307916641235352, "sampling/sampling_logp_difference/mean": 0.08010027557611465, "step": 107, "step_time": 34.80215419899923 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005800189450383186, "clip_ratio/low_mean": 0.017518939450383186, "clip_ratio/low_min": 0.0078125, "clip_ratio/region_mean": 0.023319128900766373, "entropy": 0.2990362048149109, "epoch": 0.00216, "grad_norm": 1.7662874460220337, "kl": 0.5438167825341225, "learning_rate": 9.999990400029814e-06, "loss": -0.3506, "step": 108, "step_time": 5.828784976000861 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 1701.5, "completions/mean_terminated_length": 1701.5, "completions/min_length": 1555.0, "completions/min_terminated_length": 1555.0, "entropy": 0.2941362299025059, "epoch": 0.00218, "frac_reward_zero_std": 0.0, "grad_norm": 2.4735493659973145, "kl": 0.46664091385900974, "learning_rate": 9.999990131512245e-06, "loss": -0.1545, "num_tokens": 4027063.0, "reward": -1.1875361204147339, "reward_std": 6.082664489746094, "rewards/rollout_reward_func/mean": -1.1875361204147339, "rewards/rollout_reward_func/std": 7.877804279327393, "sampling/importance_sampling_ratio/max": 2.3916215896606445, "sampling/importance_sampling_ratio/mean": 0.7803431153297424, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4629955291748047, "sampling/sampling_logp_difference/mean": 0.08069309592247009, "step": 109, "step_time": 35.122385199001656 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.28531504422426224, "epoch": 0.0022, "grad_norm": 1.2467232942581177, "kl": 0.5820192918181419, "learning_rate": 9.999989859290995e-06, "loss": -0.1614, "step": 110, "step_time": 5.834867768998265 }, { "clip_ratio/high_max": 0.011600378900766373, "clip_ratio/high_mean": 0.005800189450383186, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007753314450383186, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 1649.5625, "completions/mean_terminated_length": 1649.5625, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "entropy": 0.29295698180794716, "epoch": 0.00222, "frac_reward_zero_std": 0.0, "grad_norm": 1.934397578239441, "kl": 0.9904868267476559, "learning_rate": 9.99998958336606e-06, "loss": -0.2592, "num_tokens": 4100465.0, "reward": -1.0518858432769775, "reward_std": 7.8057475090026855, "rewards/rollout_reward_func/mean": -1.0518858432769775, "rewards/rollout_reward_func/std": 12.11864185333252, "sampling/importance_sampling_ratio/max": 2.4532089233398438, "sampling/importance_sampling_ratio/mean": 0.7939997315406799, "sampling/importance_sampling_ratio/min": 0.03784231096506119, "sampling/sampling_logp_difference/max": 2.585331439971924, "sampling/sampling_logp_difference/mean": 0.11609256267547607, "step": 111, "step_time": 35.7164067620015 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.017578125, "clip_ratio/low_min": 0.0078125, "clip_ratio/region_mean": 0.021484375, "entropy": 0.28337718918919563, "epoch": 0.00224, "grad_norm": 1.8777400255203247, "kl": 1.192313952371478, "learning_rate": 9.999989303737442e-06, "loss": -0.2583, "step": 112, "step_time": 6.234686438999233 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1700.0, "completions/max_terminated_length": 1700.0, "completions/mean_length": 1627.25, "completions/mean_terminated_length": 1627.25, "completions/min_length": 1213.0, "completions/min_terminated_length": 1213.0, "entropy": 0.23321667686104774, "epoch": 0.00226, "frac_reward_zero_std": 0.0, "grad_norm": 2.0336928367614746, "kl": 1.1097694747149944, "learning_rate": 9.999989020405141e-06, "loss": -0.136, "num_tokens": 4173152.0, "reward": -6.243222713470459, "reward_std": 10.733819961547852, "rewards/rollout_reward_func/mean": -6.243222713470459, "rewards/rollout_reward_func/std": 13.576552391052246, "sampling/importance_sampling_ratio/max": 2.0692925453186035, "sampling/importance_sampling_ratio/mean": 0.7719696760177612, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.4085850715637207, "sampling/sampling_logp_difference/mean": 0.11740700155496597, "step": 113, "step_time": 37.108936664000794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013055098708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013055098708719015, "entropy": 0.2277730144560337, "epoch": 0.00228, "grad_norm": 1.9860761165618896, "kl": 1.227958582341671, "learning_rate": 9.999988733369157e-06, "loss": -0.1385, "step": 114, "step_time": 5.572819186999368 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0018939394503831863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007753314450383186, "completions/clipped_ratio": 0.0, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 1693.75, "completions/mean_terminated_length": 1693.75, "completions/min_length": 1321.0, "completions/min_terminated_length": 1321.0, "entropy": 0.23424900509417057, "epoch": 0.0023, "frac_reward_zero_std": 0.0, "grad_norm": 1.6176177263259888, "kl": 1.1609091851860285, "learning_rate": 9.999988442629489e-06, "loss": -0.2144, "num_tokens": 4248083.0, "reward": -5.069212913513184, "reward_std": 5.557045936584473, "rewards/rollout_reward_func/mean": -5.069212913513184, "rewards/rollout_reward_func/std": 12.084410667419434, "sampling/importance_sampling_ratio/max": 2.459650754928589, "sampling/importance_sampling_ratio/mean": 0.6402795910835266, "sampling/importance_sampling_ratio/min": 5.712434665512732e-15, "sampling/sampling_logp_difference/max": 30.60143280029297, "sampling/sampling_logp_difference/mean": 0.17031052708625793, "step": 115, "step_time": 35.503748573997655 }, { "clip_ratio/high_max": 0.0234375, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.005800189450383186, "clip_ratio/low_min": 0.0037878789007663727, "clip_ratio/region_mean": 0.017518939450383186, "entropy": 0.23416488245129585, "epoch": 0.00232, "grad_norm": 1.2904891967773438, "kl": 1.3006360940635204, "learning_rate": 9.99998814818614e-06, "loss": -0.217, "step": 116, "step_time": 5.804856802999893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1801.0, "completions/max_terminated_length": 1801.0, "completions/mean_length": 1710.40625, "completions/mean_terminated_length": 1710.40625, "completions/min_length": 1604.0, "completions/min_terminated_length": 1604.0, "entropy": 0.23163120076060295, "epoch": 0.00234, "frac_reward_zero_std": 0.0, "grad_norm": 1.6878459453582764, "kl": 0.438737278804183, "learning_rate": 9.999987850039108e-06, "loss": -0.0609, "num_tokens": 4323796.0, "reward": -2.6192214488983154, "reward_std": 8.0145263671875, "rewards/rollout_reward_func/mean": -2.6192214488983154, "rewards/rollout_reward_func/std": 10.107186317443848, "sampling/importance_sampling_ratio/max": 1.9165576696395874, "sampling/importance_sampling_ratio/mean": 0.9111311435699463, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.697357416152954, "sampling/sampling_logp_difference/mean": 0.08581140637397766, "step": 117, "step_time": 38.28407665499981 }, { "clip_ratio/high_max": 0.019412878900766373, "clip_ratio/high_mean": 0.009706439450383186, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009706439450383186, "entropy": 0.23309355787932873, "epoch": 0.00236, "grad_norm": 1.4197455644607544, "kl": 0.39980714581906796, "learning_rate": 9.999987548188395e-06, "loss": -0.0665, "step": 118, "step_time": 5.822731712999484 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.008984374813735485, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008984374813735485, "completions/clipped_ratio": 0.0, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 1468.46875, "completions/mean_terminated_length": 1468.46875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.24193942546844482, "epoch": 0.00238, "frac_reward_zero_std": 0.0, "grad_norm": 1.7685405015945435, "kl": 0.39049934409558773, "learning_rate": 9.999987242634e-06, "loss": -0.0657, "num_tokens": 4391500.0, "reward": 1.9110275506973267, "reward_std": 6.384462356567383, "rewards/rollout_reward_func/mean": 1.9110275506973267, "rewards/rollout_reward_func/std": 14.606425285339355, "sampling/importance_sampling_ratio/max": 2.8693697452545166, "sampling/importance_sampling_ratio/mean": 1.2269116640090942, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.3209800720214844, "sampling/sampling_logp_difference/mean": 0.08356651663780212, "step": 119, "step_time": 31.813835332000053 }, { "clip_ratio/high_max": 0.03158482210710645, "clip_ratio/high_mean": 0.01774553582072258, "clip_ratio/low_mean": 0.009706439450383186, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027451975271105766, "entropy": 0.2410776149481535, "epoch": 0.0024, "grad_norm": 1.5952562093734741, "kl": 0.40999631211161613, "learning_rate": 9.999986933375924e-06, "loss": -0.0703, "step": 120, "step_time": 5.7420445510006175 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1751.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 1555.34375, "completions/mean_terminated_length": 1555.34375, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "entropy": 0.2007010132074356, "epoch": 0.00242, "frac_reward_zero_std": 0.0, "grad_norm": 0.9870114922523499, "kl": 0.5888753104954958, "learning_rate": 9.999986620414169e-06, "loss": -0.0606, "num_tokens": 4461583.0, "reward": -3.0732581615448, "reward_std": 2.935732364654541, "rewards/rollout_reward_func/mean": -3.0732581615448, "rewards/rollout_reward_func/std": 5.027767658233643, "sampling/importance_sampling_ratio/max": 2.2580418586730957, "sampling/importance_sampling_ratio/mean": 0.8960647583007812, "sampling/importance_sampling_ratio/min": 4.473894033019121e-10, "sampling/sampling_logp_difference/max": 16.850034713745117, "sampling/sampling_logp_difference/mean": 0.11269309371709824, "step": 121, "step_time": 35.07806598499883 }, { "clip_ratio/high_max": 0.016927083488553762, "clip_ratio/high_mean": 0.008463541744276881, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012369791744276881, "entropy": 0.2029730472713709, "epoch": 0.00244, "grad_norm": 0.7402886152267456, "kl": 0.4976696763187647, "learning_rate": 9.999986303748731e-06, "loss": -0.0621, "step": 122, "step_time": 5.627297465000083 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1806.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 1683.59375, "completions/mean_terminated_length": 1683.59375, "completions/min_length": 1471.0, "completions/min_terminated_length": 1471.0, "entropy": 0.21090154722332954, "epoch": 0.00246, "frac_reward_zero_std": 0.0, "grad_norm": 1.2972943782806396, "kl": 0.4575018994510174, "learning_rate": 9.999985983379614e-06, "loss": -0.2785, "num_tokens": 4535822.0, "reward": -3.0389723777770996, "reward_std": 4.446030616760254, "rewards/rollout_reward_func/mean": -3.0389723777770996, "rewards/rollout_reward_func/std": 6.439523220062256, "sampling/importance_sampling_ratio/max": 1.792663335800171, "sampling/importance_sampling_ratio/mean": 0.8646963834762573, "sampling/importance_sampling_ratio/min": 0.10849396139383316, "sampling/sampling_logp_difference/max": 2.1350996494293213, "sampling/sampling_logp_difference/mean": 0.07303433865308762, "step": 123, "step_time": 37.839723685000536 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.21065166406333447, "epoch": 0.00248, "grad_norm": 1.294693946838379, "kl": 0.4491172023117542, "learning_rate": 9.999985659306817e-06, "loss": -0.2798, "step": 124, "step_time": 5.816134531000898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1791.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 1671.8125, "completions/mean_terminated_length": 1671.8125, "completions/min_length": 1453.0, "completions/min_terminated_length": 1453.0, "entropy": 0.31402457505464554, "epoch": 0.0025, "frac_reward_zero_std": 0.0, "grad_norm": 1.7626806497573853, "kl": 0.3699891809374094, "learning_rate": 9.999985331530339e-06, "loss": -0.123, "num_tokens": 4609806.0, "reward": -1.0957450866699219, "reward_std": 8.374095916748047, "rewards/rollout_reward_func/mean": -1.0957450866699219, "rewards/rollout_reward_func/std": 10.550527572631836, "sampling/importance_sampling_ratio/max": 2.8980863094329834, "sampling/importance_sampling_ratio/mean": 1.0585088729858398, "sampling/importance_sampling_ratio/min": 1.4836078201818364e-17, "sampling/sampling_logp_difference/max": 20.844825744628906, "sampling/sampling_logp_difference/mean": 0.13481880724430084, "step": 125, "step_time": 36.72139834099926 }, { "clip_ratio/high_max": 0.006623641354963183, "clip_ratio/high_mean": 0.0033118206774815917, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007218070677481592, "entropy": 0.31360605917871, "epoch": 0.00252, "grad_norm": 1.7030891180038452, "kl": 0.3880602568387985, "learning_rate": 9.999985000050181e-06, "loss": -0.1268, "step": 126, "step_time": 5.752750929000285 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0018939394503831863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005800189450383186, "completions/clipped_ratio": 0.0, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 1624.90625, "completions/mean_terminated_length": 1624.90625, "completions/min_length": 1254.0, "completions/min_terminated_length": 1254.0, "entropy": 0.23735753446817398, "epoch": 0.00254, "frac_reward_zero_std": 0.0, "grad_norm": 2.49963116645813, "kl": 1.4479252882301807, "learning_rate": 9.999984664866347e-06, "loss": -0.1985, "num_tokens": 4682513.0, "reward": 0.2574213147163391, "reward_std": 9.919325828552246, "rewards/rollout_reward_func/mean": 0.2574213147163391, "rewards/rollout_reward_func/std": 12.862212181091309, "sampling/importance_sampling_ratio/max": 2.5531129837036133, "sampling/importance_sampling_ratio/mean": 0.8520662784576416, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.3626585006713867, "sampling/sampling_logp_difference/mean": 0.09023018181324005, "step": 127, "step_time": 36.4248705359978 }, { "clip_ratio/high_max": 0.011979166883975267, "clip_ratio/high_mean": 0.005989583441987634, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011848958441987634, "entropy": 0.2332585696130991, "epoch": 0.00256, "grad_norm": 1.9195269346237183, "kl": 1.4073903393000364, "learning_rate": 9.999984325978833e-06, "loss": -0.2016, "step": 128, "step_time": 6.607886812998004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1825.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 1695.625, "completions/mean_terminated_length": 1695.625, "completions/min_length": 1334.0, "completions/min_terminated_length": 1334.0, "entropy": 0.2307057324796915, "epoch": 0.00258, "frac_reward_zero_std": 0.0, "grad_norm": 6.967021465301514, "kl": 2.5539040379226208, "learning_rate": 9.99998398338764e-06, "loss": -0.1924, "num_tokens": 4757093.0, "reward": -2.7488653659820557, "reward_std": 5.045413017272949, "rewards/rollout_reward_func/mean": -2.7488653659820557, "rewards/rollout_reward_func/std": 10.707550048828125, "sampling/importance_sampling_ratio/max": 2.2723803520202637, "sampling/importance_sampling_ratio/mean": 0.799378514289856, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 21.946773529052734, "sampling/sampling_logp_difference/mean": 0.14229635894298553, "step": 129, "step_time": 36.29410157899838 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.22786042280495167, "epoch": 0.0026, "grad_norm": 2.270787477493286, "kl": 1.118930522352457, "learning_rate": 9.99998363709277e-06, "loss": -0.2081, "step": 130, "step_time": 5.818163544999152 }, { "clip_ratio/high_max": 0.0036764706019312143, "clip_ratio/high_mean": 0.0018382353009656072, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003791360300965607, "completions/clipped_ratio": 0.0, "completions/max_length": 1784.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 1663.96875, "completions/mean_terminated_length": 1663.96875, "completions/min_length": 1170.0, "completions/min_terminated_length": 1170.0, "entropy": 0.23138621263206005, "epoch": 0.00262, "frac_reward_zero_std": 0.0, "grad_norm": 10.312149047851562, "kl": 4.53411222063005, "learning_rate": 9.999983287094222e-06, "loss": 0.0936, "num_tokens": 4830857.0, "reward": -1.0345556735992432, "reward_std": 4.783634185791016, "rewards/rollout_reward_func/mean": -1.0345556735992432, "rewards/rollout_reward_func/std": 6.499684810638428, "sampling/importance_sampling_ratio/max": 2.2273147106170654, "sampling/importance_sampling_ratio/mean": 0.8017725944519043, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 22.242347717285156, "sampling/sampling_logp_difference/mean": 0.13735142350196838, "step": 131, "step_time": 37.860813379999854 }, { "clip_ratio/high_max": 0.026424632407724857, "clip_ratio/high_mean": 0.013212316203862429, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015165441203862429, "entropy": 0.23595001175999641, "epoch": 0.00264, "grad_norm": 6.295961380004883, "kl": 2.904046291485429, "learning_rate": 9.999982933391998e-06, "loss": 0.0759, "step": 132, "step_time": 5.760328608001146 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.004185267956927419, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008091517956927419, "completions/clipped_ratio": 0.0, "completions/max_length": 1723.0, "completions/max_terminated_length": 1723.0, "completions/mean_length": 1599.625, "completions/mean_terminated_length": 1599.625, "completions/min_length": 1124.0, "completions/min_terminated_length": 1124.0, "entropy": 0.1828257255256176, "epoch": 0.00266, "frac_reward_zero_std": 0.125, "grad_norm": 1.1880617141723633, "kl": 0.6531639210879803, "learning_rate": 9.999982575986095e-06, "loss": -0.173, "num_tokens": 4902653.0, "reward": -1.3747859001159668, "reward_std": 5.477799415588379, "rewards/rollout_reward_func/mean": -1.3747859001159668, "rewards/rollout_reward_func/std": 7.5533223152160645, "sampling/importance_sampling_ratio/max": 2.5466907024383545, "sampling/importance_sampling_ratio/mean": 0.995758056640625, "sampling/importance_sampling_ratio/min": 0.007883523590862751, "sampling/sampling_logp_difference/max": 2.0956740379333496, "sampling/sampling_logp_difference/mean": 0.07060299813747406, "step": 133, "step_time": 34.61869661600031 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.19106297940015793, "epoch": 0.00268, "grad_norm": 1.2263319492340088, "kl": 0.6410297751426697, "learning_rate": 9.999982214876516e-06, "loss": -0.1744, "step": 134, "step_time": 6.524274291999973 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1685.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 1521.625, "completions/mean_terminated_length": 1521.625, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "entropy": 0.23908153176307678, "epoch": 0.0027, "frac_reward_zero_std": 0.0, "grad_norm": 2.1356704235076904, "kl": 0.3956664204597473, "learning_rate": 9.999981850063262e-06, "loss": -0.2331, "num_tokens": 4971778.0, "reward": 1.5177106857299805, "reward_std": 4.678309440612793, "rewards/rollout_reward_func/mean": 1.5177106857299805, "rewards/rollout_reward_func/std": 16.861400604248047, "sampling/importance_sampling_ratio/max": 2.699097156524658, "sampling/importance_sampling_ratio/mean": 0.861909806728363, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 19.98867416381836, "sampling/sampling_logp_difference/mean": 0.13872608542442322, "step": 135, "step_time": 32.25473025500105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.2471965067088604, "epoch": 0.00272, "grad_norm": 2.370650053024292, "kl": 0.35189586132764816, "learning_rate": 9.99998148154633e-06, "loss": -0.2343, "step": 136, "step_time": 5.5205114909995245 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 1687.96875, "completions/mean_terminated_length": 1687.96875, "completions/min_length": 1462.0, "completions/min_terminated_length": 1462.0, "entropy": 0.25870174542069435, "epoch": 0.00274, "frac_reward_zero_std": 0.0, "grad_norm": 1.2739732265472412, "kl": 0.3620435334742069, "learning_rate": 9.999981109325725e-06, "loss": 0.0377, "num_tokens": 5046449.0, "reward": -4.809016227722168, "reward_std": 5.6776933670043945, "rewards/rollout_reward_func/mean": -4.809016227722168, "rewards/rollout_reward_func/std": 15.731524467468262, "sampling/importance_sampling_ratio/max": 2.0674188137054443, "sampling/importance_sampling_ratio/mean": 0.8131706714630127, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 24.600109100341797, "sampling/sampling_logp_difference/mean": 0.11974793672561646, "step": 137, "step_time": 36.03785672200047 }, { "clip_ratio/high_max": 0.02734375, "clip_ratio/high_mean": 0.017578125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021484375, "entropy": 0.2636088263243437, "epoch": 0.00276, "grad_norm": 0.9780673384666443, "kl": 0.32067642733454704, "learning_rate": 9.999980733401442e-06, "loss": 0.0335, "step": 138, "step_time": 5.760654310999598 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1777.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 1652.3125, "completions/mean_terminated_length": 1652.3125, "completions/min_length": 1389.0, "completions/min_terminated_length": 1389.0, "entropy": 0.21654790081083775, "epoch": 0.00278, "frac_reward_zero_std": 0.0, "grad_norm": 1.9906519651412964, "kl": 0.22392724081873894, "learning_rate": 9.999980353773486e-06, "loss": -0.1274, "num_tokens": 5119986.0, "reward": -3.179938316345215, "reward_std": 5.390859603881836, "rewards/rollout_reward_func/mean": -3.179938316345215, "rewards/rollout_reward_func/std": 7.339827537536621, "sampling/importance_sampling_ratio/max": 2.915480613708496, "sampling/importance_sampling_ratio/mean": 1.0259276628494263, "sampling/importance_sampling_ratio/min": 3.232071999360181e-13, "sampling/sampling_logp_difference/max": 29.06264877319336, "sampling/sampling_logp_difference/mean": 0.10719652473926544, "step": 139, "step_time": 35.030210969999644 }, { "clip_ratio/high_max": 0.015395220601931214, "clip_ratio/high_mean": 0.009650735300965607, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011603860184550285, "entropy": 0.22184443473815918, "epoch": 0.0028, "grad_norm": 1.378363013267517, "kl": 0.2069135345518589, "learning_rate": 9.999979970441856e-06, "loss": -0.1325, "step": 140, "step_time": 6.193294476998744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0036764706019312143, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "completions/clipped_ratio": 0.0, "completions/max_length": 1768.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 1623.75, "completions/mean_terminated_length": 1623.75, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "entropy": 0.29241427406668663, "epoch": 0.00282, "frac_reward_zero_std": 0.0, "grad_norm": 1.9409116506576538, "kl": 0.32110051065683365, "learning_rate": 9.999979583406551e-06, "loss": -0.1415, "num_tokens": 5192337.0, "reward": -3.2484803199768066, "reward_std": 5.998169422149658, "rewards/rollout_reward_func/mean": -3.2484803199768066, "rewards/rollout_reward_func/std": 7.8606648445129395, "sampling/importance_sampling_ratio/max": 2.497420072555542, "sampling/importance_sampling_ratio/mean": 1.0857844352722168, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 26.159547805786133, "sampling/sampling_logp_difference/mean": 0.12232924997806549, "step": 141, "step_time": 36.19595617200048 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.009420956019312143, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.015280331019312143, "entropy": 0.2928194999694824, "epoch": 0.00284, "grad_norm": 1.5054811239242554, "kl": 0.3162485882639885, "learning_rate": 9.999979192667574e-06, "loss": -0.1445, "step": 142, "step_time": 5.68182536599943 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.007634943351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011541193351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 1645.90625, "completions/mean_terminated_length": 1645.90625, "completions/min_length": 1448.0, "completions/min_terminated_length": 1448.0, "entropy": 0.3010127767920494, "epoch": 0.00286, "frac_reward_zero_std": 0.0, "grad_norm": 1.9966871738433838, "kl": 0.22623557038605213, "learning_rate": 9.999978798224922e-06, "loss": 0.094, "num_tokens": 5265626.0, "reward": -1.248368740081787, "reward_std": 8.209827423095703, "rewards/rollout_reward_func/mean": -1.248368740081787, "rewards/rollout_reward_func/std": 10.719414710998535, "sampling/importance_sampling_ratio/max": 2.8611063957214355, "sampling/importance_sampling_ratio/mean": 1.1009293794631958, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 20.22269058227539, "sampling/sampling_logp_difference/mean": 0.10865116864442825, "step": 143, "step_time": 34.8812253310025 }, { "clip_ratio/high_max": 0.02688419120386243, "clip_ratio/high_mean": 0.013442095601931214, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013442095601931214, "entropy": 0.3076608795672655, "epoch": 0.00288, "grad_norm": 2.357238531112671, "kl": 0.22019376046955585, "learning_rate": 9.999978400078598e-06, "loss": 0.0904, "step": 144, "step_time": 5.731513302997882 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 1654.75, "completions/mean_terminated_length": 1654.75, "completions/min_length": 1508.0, "completions/min_terminated_length": 1508.0, "entropy": 0.2691913191229105, "epoch": 0.0029, "frac_reward_zero_std": 0.0, "grad_norm": 1.4389311075210571, "kl": 0.30777561012655497, "learning_rate": 9.9999779982286e-06, "loss": -0.0461, "num_tokens": 5339115.0, "reward": 4.553075790405273, "reward_std": 7.375555992126465, "rewards/rollout_reward_func/mean": 4.553075790405273, "rewards/rollout_reward_func/std": 9.625307083129883, "sampling/importance_sampling_ratio/max": 2.137481451034546, "sampling/importance_sampling_ratio/mean": 1.0123145580291748, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0253870487213135, "sampling/sampling_logp_difference/mean": 0.053506266325712204, "step": 145, "step_time": 37.3788223370002 }, { "clip_ratio/high_max": 0.0234375, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.0038470644503831863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015565814450383186, "entropy": 0.27271147817373276, "epoch": 0.00292, "grad_norm": 1.379260540008545, "kl": 0.27049021050333977, "learning_rate": 9.999977592674933e-06, "loss": -0.0501, "step": 146, "step_time": 5.7508253040005 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1769.0, "completions/max_terminated_length": 1769.0, "completions/mean_length": 1629.46875, "completions/mean_terminated_length": 1629.46875, "completions/min_length": 1117.0, "completions/min_terminated_length": 1117.0, "entropy": 0.273640938103199, "epoch": 0.00294, "frac_reward_zero_std": 0.0, "grad_norm": 1.9744895696640015, "kl": 0.21257582679390907, "learning_rate": 9.999977183417593e-06, "loss": -0.0741, "num_tokens": 5412122.0, "reward": -1.7804348468780518, "reward_std": 5.91984224319458, "rewards/rollout_reward_func/mean": -1.7804348468780518, "rewards/rollout_reward_func/std": 9.313576698303223, "sampling/importance_sampling_ratio/max": 2.493762493133545, "sampling/importance_sampling_ratio/mean": 1.099168300628662, "sampling/importance_sampling_ratio/min": 0.22300267219543457, "sampling/sampling_logp_difference/max": 1.1365747451782227, "sampling/sampling_logp_difference/mean": 0.05694221705198288, "step": 147, "step_time": 36.152454600000965 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.2737910356372595, "epoch": 0.00296, "grad_norm": 1.6924679279327393, "kl": 0.2047729678452015, "learning_rate": 9.999976770456581e-06, "loss": -0.0774, "step": 148, "step_time": 5.677298628001154 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 1715.875, "completions/mean_terminated_length": 1715.875, "completions/min_length": 1601.0, "completions/min_terminated_length": 1601.0, "entropy": 0.27875737100839615, "epoch": 0.00298, "frac_reward_zero_std": 0.0, "grad_norm": 2.0113556385040283, "kl": 0.2866439474746585, "learning_rate": 9.999976353791898e-06, "loss": -0.05, "num_tokens": 5487662.0, "reward": -3.409506320953369, "reward_std": 4.39523983001709, "rewards/rollout_reward_func/mean": -3.409506320953369, "rewards/rollout_reward_func/std": 9.453926086425781, "sampling/importance_sampling_ratio/max": 2.8223652839660645, "sampling/importance_sampling_ratio/mean": 0.9090695977210999, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.958104133605957, "sampling/sampling_logp_difference/mean": 0.0680600255727768, "step": 149, "step_time": 36.694275042998925 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2778263594955206, "epoch": 0.003, "grad_norm": 1.462227463722229, "kl": 0.30287417955696583, "learning_rate": 9.999975933423546e-06, "loss": -0.055, "step": 150, "step_time": 6.261132174002341 }, { "clip_ratio/high_max": 0.011979166883975267, "clip_ratio/high_mean": 0.005989583441987634, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005989583441987634, "completions/clipped_ratio": 0.0, "completions/max_length": 1789.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 1670.53125, "completions/mean_terminated_length": 1670.53125, "completions/min_length": 1212.0, "completions/min_terminated_length": 1212.0, "entropy": 0.2821527421474457, "epoch": 0.00302, "frac_reward_zero_std": 0.0, "grad_norm": 1.420735478401184, "kl": 0.2623867988586426, "learning_rate": 9.999975509351522e-06, "loss": -0.1198, "num_tokens": 5561719.0, "reward": 1.7372636795043945, "reward_std": 7.599005222320557, "rewards/rollout_reward_func/mean": 1.7372636795043945, "rewards/rollout_reward_func/std": 15.485095977783203, "sampling/importance_sampling_ratio/max": 2.6034064292907715, "sampling/importance_sampling_ratio/mean": 1.0206873416900635, "sampling/importance_sampling_ratio/min": 0.175743967294693, "sampling/sampling_logp_difference/max": 1.0962951183319092, "sampling/sampling_logp_difference/mean": 0.05628112703561783, "step": 151, "step_time": 34.65810895300274 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.011979166883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013932291883975267, "entropy": 0.27934372052550316, "epoch": 0.00304, "grad_norm": 1.3764787912368774, "kl": 0.30363442841917276, "learning_rate": 9.99997508157583e-06, "loss": -0.1206, "step": 152, "step_time": 5.7744073849989945 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "completions/clipped_ratio": 0.0, "completions/max_length": 1781.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 1703.21875, "completions/mean_terminated_length": 1703.21875, "completions/min_length": 1567.0, "completions/min_terminated_length": 1567.0, "entropy": 0.2447413094341755, "epoch": 0.00306, "frac_reward_zero_std": 0.125, "grad_norm": 1.6576489210128784, "kl": 0.4590331744402647, "learning_rate": 9.999974650096467e-06, "loss": -0.1218, "num_tokens": 5636992.0, "reward": -0.15775525569915771, "reward_std": 9.043015480041504, "rewards/rollout_reward_func/mean": -0.15775525569915771, "rewards/rollout_reward_func/std": 10.28443431854248, "sampling/importance_sampling_ratio/max": 2.2586140632629395, "sampling/importance_sampling_ratio/mean": 0.8791904449462891, "sampling/importance_sampling_ratio/min": 0.13126111030578613, "sampling/sampling_logp_difference/max": 1.3272500038146973, "sampling/sampling_logp_difference/mean": 0.06481630355119705, "step": 153, "step_time": 35.60768943099811 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2421154472976923, "epoch": 0.00308, "grad_norm": 1.613027572631836, "kl": 0.47781567834317684, "learning_rate": 9.999974214913438e-06, "loss": -0.1224, "step": 154, "step_time": 5.782275483999001 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1742.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 1647.8125, "completions/mean_terminated_length": 1647.8125, "completions/min_length": 1372.0, "completions/min_terminated_length": 1372.0, "entropy": 0.309613186866045, "epoch": 0.0031, "frac_reward_zero_std": 0.0, "grad_norm": 1.3915244340896606, "kl": 0.34489365108311176, "learning_rate": 9.99997377602674e-06, "loss": -0.2332, "num_tokens": 5710235.0, "reward": -5.272271156311035, "reward_std": 4.266254901885986, "rewards/rollout_reward_func/mean": -5.272271156311035, "rewards/rollout_reward_func/std": 11.99683666229248, "sampling/importance_sampling_ratio/max": 2.386578321456909, "sampling/importance_sampling_ratio/mean": 0.8981663584709167, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 28.397716522216797, "sampling/sampling_logp_difference/mean": 0.16181710362434387, "step": 155, "step_time": 36.79902890199992 }, { "clip_ratio/high_max": 0.01907169120386243, "clip_ratio/high_mean": 0.01148897036910057, "clip_ratio/low_mean": 0.009142287075519562, "clip_ratio/low_min": 0.002659574383869767, "clip_ratio/region_mean": 0.020631257444620132, "entropy": 0.3100100867450237, "epoch": 0.00312, "grad_norm": 0.9337635636329651, "kl": 0.36718725040555, "learning_rate": 9.999973333436373e-06, "loss": -0.2342, "step": 156, "step_time": 6.479778260998501 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 1645.84375, "completions/mean_terminated_length": 1645.84375, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "entropy": 0.23720368929207325, "epoch": 0.00314, "frac_reward_zero_std": 0.0, "grad_norm": 1.3682451248168945, "kl": 0.3653513854369521, "learning_rate": 9.999972887142338e-06, "loss": -0.1482, "num_tokens": 5783053.0, "reward": -5.2926249504089355, "reward_std": 6.853837013244629, "rewards/rollout_reward_func/mean": -5.2926249504089355, "rewards/rollout_reward_func/std": 9.61239242553711, "sampling/importance_sampling_ratio/max": 1.6078870296478271, "sampling/importance_sampling_ratio/mean": 0.9465827345848083, "sampling/importance_sampling_ratio/min": 0.14573633670806885, "sampling/sampling_logp_difference/max": 1.2857561111450195, "sampling/sampling_logp_difference/mean": 0.05283693969249725, "step": 157, "step_time": 35.09920485400198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.23178580962121487, "epoch": 0.00316, "grad_norm": 1.3167202472686768, "kl": 0.4019450955092907, "learning_rate": 9.999972437144638e-06, "loss": -0.1491, "step": 158, "step_time": 5.754554526000902 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1746.0, "completions/max_terminated_length": 1746.0, "completions/mean_length": 1676.34375, "completions/mean_terminated_length": 1676.34375, "completions/min_length": 1622.0, "completions/min_terminated_length": 1622.0, "entropy": 0.25272640213370323, "epoch": 0.00318, "frac_reward_zero_std": 0.0, "grad_norm": 1.2554067373275757, "kl": 0.2935179714113474, "learning_rate": 9.99997198344327e-06, "loss": -0.1578, "num_tokens": 5857549.0, "reward": 1.5661380290985107, "reward_std": 8.48967170715332, "rewards/rollout_reward_func/mean": 1.5661380290985107, "rewards/rollout_reward_func/std": 10.192009925842285, "sampling/importance_sampling_ratio/max": 1.9953430891036987, "sampling/importance_sampling_ratio/mean": 0.9570077061653137, "sampling/importance_sampling_ratio/min": 0.08553915470838547, "sampling/sampling_logp_difference/max": 1.1288342475891113, "sampling/sampling_logp_difference/mean": 0.05689948797225952, "step": 159, "step_time": 36.0974576239978 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.24649916216731071, "epoch": 0.0032, "grad_norm": 1.2487872838974, "kl": 0.3261691927909851, "learning_rate": 9.999971526038236e-06, "loss": -0.1592, "step": 160, "step_time": 5.650420889000998 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0038470644503831863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005800189450383186, "completions/clipped_ratio": 0.0, "completions/max_length": 1807.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 1604.75, "completions/mean_terminated_length": 1604.75, "completions/min_length": 1071.0, "completions/min_terminated_length": 1071.0, "entropy": 0.23597443290054798, "epoch": 0.00322, "frac_reward_zero_std": 0.0, "grad_norm": 1.1652355194091797, "kl": 0.5538041703402996, "learning_rate": 9.999971064929537e-06, "loss": -0.1124, "num_tokens": 5929687.0, "reward": 5.239955425262451, "reward_std": 8.498127937316895, "rewards/rollout_reward_func/mean": 5.239955425262451, "rewards/rollout_reward_func/std": 10.677428245544434, "sampling/importance_sampling_ratio/max": 2.1468427181243896, "sampling/importance_sampling_ratio/mean": 0.8562913537025452, "sampling/importance_sampling_ratio/min": 0.11029359698295593, "sampling/sampling_logp_difference/max": 1.4075746536254883, "sampling/sampling_logp_difference/mean": 0.05289062112569809, "step": 161, "step_time": 35.0522555460002 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.009836647659540176, "clip_ratio/low_min": 0.0078125, "clip_ratio/region_mean": 0.013742897659540176, "entropy": 0.23265731893479824, "epoch": 0.00324, "grad_norm": 1.0499224662780762, "kl": 0.5876908944919705, "learning_rate": 9.999970600117172e-06, "loss": -0.1161, "step": 162, "step_time": 6.270556325998768 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1808.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 1688.3125, "completions/mean_terminated_length": 1688.3125, "completions/min_length": 1426.0, "completions/min_terminated_length": 1426.0, "entropy": 0.24026566371321678, "epoch": 0.00326, "frac_reward_zero_std": 0.0, "grad_norm": 1.526390552520752, "kl": 0.6829012483358383, "learning_rate": 9.999970131601143e-06, "loss": -0.1204, "num_tokens": 6004625.0, "reward": -2.4879937171936035, "reward_std": 6.033343315124512, "rewards/rollout_reward_func/mean": -2.4879937171936035, "rewards/rollout_reward_func/std": 9.8729829788208, "sampling/importance_sampling_ratio/max": 2.08561372756958, "sampling/importance_sampling_ratio/mean": 0.930208683013916, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 21.826404571533203, "sampling/sampling_logp_difference/mean": 0.10662569105625153, "step": 163, "step_time": 37.25797335199968 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.23994088359177113, "epoch": 0.00328, "grad_norm": 0.99588543176651, "kl": 0.7377689871937037, "learning_rate": 9.99996965938145e-06, "loss": -0.123, "step": 164, "step_time": 5.81927943699975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1800.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 1682.96875, "completions/mean_terminated_length": 1682.96875, "completions/min_length": 1526.0, "completions/min_terminated_length": 1526.0, "entropy": 0.20219792239367962, "epoch": 0.0033, "frac_reward_zero_std": 0.0, "grad_norm": 1.7853528261184692, "kl": 0.6294286772608757, "learning_rate": 9.999969183458093e-06, "loss": -0.1554, "num_tokens": 6078930.0, "reward": -0.3497920036315918, "reward_std": 7.241074085235596, "rewards/rollout_reward_func/mean": -0.3497920036315918, "rewards/rollout_reward_func/std": 11.06623363494873, "sampling/importance_sampling_ratio/max": 2.898749828338623, "sampling/importance_sampling_ratio/mean": 1.0241472721099854, "sampling/importance_sampling_ratio/min": 0.15246078372001648, "sampling/sampling_logp_difference/max": 1.200087070465088, "sampling/sampling_logp_difference/mean": 0.052497610449790955, "step": 165, "step_time": 36.50886810300017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.19982031919062138, "epoch": 0.00332, "grad_norm": 1.6592210531234741, "kl": 0.6106511801481247, "learning_rate": 9.999968703831072e-06, "loss": -0.1618, "step": 166, "step_time": 5.802947281998968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003791360300965607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003791360300965607, "completions/clipped_ratio": 0.0, "completions/max_length": 1792.0, "completions/max_terminated_length": 1792.0, "completions/mean_length": 1664.03125, "completions/mean_terminated_length": 1664.03125, "completions/min_length": 1444.0, "completions/min_terminated_length": 1444.0, "entropy": 0.25958102196455, "epoch": 0.00334, "frac_reward_zero_std": 0.0, "grad_norm": 1.5413814783096313, "kl": 0.43835699930787086, "learning_rate": 9.999968220500388e-06, "loss": -0.0403, "num_tokens": 6152886.0, "reward": 2.1668155193328857, "reward_std": 6.447092056274414, "rewards/rollout_reward_func/mean": 2.1668155193328857, "rewards/rollout_reward_func/std": 8.427111625671387, "sampling/importance_sampling_ratio/max": 2.721165180206299, "sampling/importance_sampling_ratio/mean": 1.0039149522781372, "sampling/importance_sampling_ratio/min": 1.5273299985851807e-11, "sampling/sampling_logp_difference/max": 25.23809814453125, "sampling/sampling_logp_difference/mean": 0.11750782281160355, "step": 167, "step_time": 37.20988352099903 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2585952654480934, "epoch": 0.00336, "grad_norm": 1.6427268981933594, "kl": 0.41400426626205444, "learning_rate": 9.99996773346604e-06, "loss": -0.044, "step": 168, "step_time": 6.244365029000619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1772.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 1701.78125, "completions/mean_terminated_length": 1701.78125, "completions/min_length": 1452.0, "completions/min_terminated_length": 1452.0, "entropy": 0.2237578984349966, "epoch": 0.00338, "frac_reward_zero_std": 0.0, "grad_norm": 1.704355239868164, "kl": 0.408824922516942, "learning_rate": 9.999967242728034e-06, "loss": 0.018, "num_tokens": 6227758.0, "reward": 1.3272455930709839, "reward_std": 8.560393333435059, "rewards/rollout_reward_func/mean": 1.3272455930709839, "rewards/rollout_reward_func/std": 9.793672561645508, "sampling/importance_sampling_ratio/max": 2.430826187133789, "sampling/importance_sampling_ratio/mean": 0.9597625136375427, "sampling/importance_sampling_ratio/min": 2.0419729275522602e-12, "sampling/sampling_logp_difference/max": 25.280433654785156, "sampling/sampling_logp_difference/mean": 0.11089442670345306, "step": 169, "step_time": 37.15113865700005 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.013671875, "entropy": 0.22276777774095535, "epoch": 0.0034, "grad_norm": 1.461653709411621, "kl": 0.3720815582200885, "learning_rate": 9.999966748286364e-06, "loss": 0.0138, "step": 170, "step_time": 5.70916496099926 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1812.0, "completions/max_terminated_length": 1812.0, "completions/mean_length": 1649.53125, "completions/mean_terminated_length": 1649.53125, "completions/min_length": 1475.0, "completions/min_terminated_length": 1475.0, "entropy": 0.24526489153504372, "epoch": 0.00342, "frac_reward_zero_std": 0.0, "grad_norm": 1.3005672693252563, "kl": 0.2525577172636986, "learning_rate": 9.999966250141033e-06, "loss": -0.1557, "num_tokens": 6300886.0, "reward": -5.017649173736572, "reward_std": 4.73761510848999, "rewards/rollout_reward_func/mean": -5.017649173736572, "rewards/rollout_reward_func/std": 11.356165885925293, "sampling/importance_sampling_ratio/max": 2.0284202098846436, "sampling/importance_sampling_ratio/mean": 0.9477963447570801, "sampling/importance_sampling_ratio/min": 0.1324770450592041, "sampling/sampling_logp_difference/max": 0.9475624561309814, "sampling/sampling_logp_difference/mean": 0.05894453451037407, "step": 171, "step_time": 36.065134260999 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.24479248374700546, "epoch": 0.00344, "grad_norm": 1.2820854187011719, "kl": 0.25359731540083885, "learning_rate": 9.999965748292042e-06, "loss": -0.1577, "step": 172, "step_time": 5.793963940999674 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 1802.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 1693.0625, "completions/mean_terminated_length": 1693.0625, "completions/min_length": 907.0, "completions/min_terminated_length": 907.0, "entropy": 0.22402186505496502, "epoch": 0.00346, "frac_reward_zero_std": 0.0, "grad_norm": 1.0302447080612183, "kl": 0.6031353138387203, "learning_rate": 9.999965242739394e-06, "loss": -0.1091, "num_tokens": 6375440.0, "reward": -1.3529514074325562, "reward_std": 5.233081340789795, "rewards/rollout_reward_func/mean": -1.3529514074325562, "rewards/rollout_reward_func/std": 8.062092781066895, "sampling/importance_sampling_ratio/max": 2.0994455814361572, "sampling/importance_sampling_ratio/mean": 0.7845278978347778, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.5858683586120605, "sampling/sampling_logp_difference/mean": 0.07677553594112396, "step": 173, "step_time": 36.896430036998936 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.22430452704429626, "epoch": 0.00348, "grad_norm": 0.8624597191810608, "kl": 0.5716133154928684, "learning_rate": 9.999964733483082e-06, "loss": -0.1117, "step": 174, "step_time": 5.81097331699857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 1665.46875, "completions/mean_terminated_length": 1665.46875, "completions/min_length": 1372.0, "completions/min_terminated_length": 1372.0, "entropy": 0.22247346490621567, "epoch": 0.0035, "frac_reward_zero_std": 0.0, "grad_norm": 1.365124225616455, "kl": 0.869195181876421, "learning_rate": 9.999964220523113e-06, "loss": -0.1818, "num_tokens": 6449233.0, "reward": 0.06757661700248718, "reward_std": 4.011537551879883, "rewards/rollout_reward_func/mean": 0.06757661700248718, "rewards/rollout_reward_func/std": 8.588602066040039, "sampling/importance_sampling_ratio/max": 1.9519811868667603, "sampling/importance_sampling_ratio/mean": 0.8214517831802368, "sampling/importance_sampling_ratio/min": 8.936198135633955e-13, "sampling/sampling_logp_difference/max": 24.413681030273438, "sampling/sampling_logp_difference/mean": 0.12217515707015991, "step": 175, "step_time": 35.031672232000346 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.22109077498316765, "epoch": 0.00352, "grad_norm": 1.0243955850601196, "kl": 0.8096068780869246, "learning_rate": 9.999963703859486e-06, "loss": -0.1845, "step": 176, "step_time": 5.7405531860003975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 1651.4375, "completions/mean_terminated_length": 1651.4375, "completions/min_length": 1455.0, "completions/min_terminated_length": 1455.0, "entropy": 0.2104360293596983, "epoch": 0.00354, "frac_reward_zero_std": 0.125, "grad_norm": 1.1368544101715088, "kl": 0.4305075518786907, "learning_rate": 9.999963183492201e-06, "loss": -0.1685, "num_tokens": 6522462.0, "reward": -1.636040449142456, "reward_std": 4.548352241516113, "rewards/rollout_reward_func/mean": -1.636040449142456, "rewards/rollout_reward_func/std": 9.328537940979004, "sampling/importance_sampling_ratio/max": 2.3601131439208984, "sampling/importance_sampling_ratio/mean": 1.1350016593933105, "sampling/importance_sampling_ratio/min": 9.077130865529914e-13, "sampling/sampling_logp_difference/max": 25.7470760345459, "sampling/sampling_logp_difference/mean": 0.10910745710134506, "step": 177, "step_time": 35.85149571300008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.20941109023988247, "epoch": 0.00356, "grad_norm": 1.1521154642105103, "kl": 0.41772544756531715, "learning_rate": 9.999962659421257e-06, "loss": -0.1698, "step": 178, "step_time": 6.28170923900052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 1698.21875, "completions/mean_terminated_length": 1698.21875, "completions/min_length": 1600.0, "completions/min_terminated_length": 1600.0, "entropy": 0.24070261418819427, "epoch": 0.00358, "frac_reward_zero_std": 0.0, "grad_norm": 1.4422119855880737, "kl": 0.45182526856660843, "learning_rate": 9.999962131646657e-06, "loss": -0.2072, "num_tokens": 6597489.0, "reward": 3.5879530906677246, "reward_std": 5.621713638305664, "rewards/rollout_reward_func/mean": 3.5879530906677246, "rewards/rollout_reward_func/std": 9.696173667907715, "sampling/importance_sampling_ratio/max": 2.181058406829834, "sampling/importance_sampling_ratio/mean": 0.8959834575653076, "sampling/importance_sampling_ratio/min": 0.14532947540283203, "sampling/sampling_logp_difference/max": 1.420717716217041, "sampling/sampling_logp_difference/mean": 0.06300906836986542, "step": 179, "step_time": 36.23942648999855 }, { "clip_ratio/high_max": 0.006756756920367479, "clip_ratio/high_mean": 0.0033783784601837397, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00728462846018374, "entropy": 0.23607005551457405, "epoch": 0.0036, "grad_norm": 1.3579888343811035, "kl": 0.5251178797334433, "learning_rate": 9.999961600168402e-06, "loss": -0.2103, "step": 180, "step_time": 5.82093875299779 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1816.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 1669.5, "completions/mean_terminated_length": 1669.5, "completions/min_length": 1570.0, "completions/min_terminated_length": 1570.0, "entropy": 0.18744167499244213, "epoch": 0.00362, "frac_reward_zero_std": 0.0, "grad_norm": 1.0111624002456665, "kl": 0.4290591198951006, "learning_rate": 9.99996106498649e-06, "loss": -0.1616, "num_tokens": 6671509.0, "reward": 2.143825054168701, "reward_std": 4.177690029144287, "rewards/rollout_reward_func/mean": 2.143825054168701, "rewards/rollout_reward_func/std": 8.129756927490234, "sampling/importance_sampling_ratio/max": 2.1915409564971924, "sampling/importance_sampling_ratio/mean": 0.9412689805030823, "sampling/importance_sampling_ratio/min": 0.09555165469646454, "sampling/sampling_logp_difference/max": 1.42826509475708, "sampling/sampling_logp_difference/mean": 0.05709037184715271, "step": 181, "step_time": 35.89454559800015 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.18788514845073223, "epoch": 0.00364, "grad_norm": 0.9392257332801819, "kl": 0.4581598751246929, "learning_rate": 9.999960526100922e-06, "loss": -0.163, "step": 182, "step_time": 5.8134966330007956 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1730.0, "completions/max_terminated_length": 1730.0, "completions/mean_length": 1637.65625, "completions/mean_terminated_length": 1637.65625, "completions/min_length": 1298.0, "completions/min_terminated_length": 1298.0, "entropy": 0.22764514200389385, "epoch": 0.00366, "frac_reward_zero_std": 0.0, "grad_norm": 1.381371259689331, "kl": 0.5989289321005344, "learning_rate": 9.9999599835117e-06, "loss": -0.0257, "num_tokens": 6744431.0, "reward": -4.3541483879089355, "reward_std": 6.479227066040039, "rewards/rollout_reward_func/mean": -4.3541483879089355, "rewards/rollout_reward_func/std": 11.574047088623047, "sampling/importance_sampling_ratio/max": 2.5868866443634033, "sampling/importance_sampling_ratio/mean": 0.8098483085632324, "sampling/importance_sampling_ratio/min": 1.047447632102072e-11, "sampling/sampling_logp_difference/max": 24.79946517944336, "sampling/sampling_logp_difference/mean": 0.10594826936721802, "step": 183, "step_time": 37.10624599499897 }, { "clip_ratio/high_max": 0.015395220601931214, "clip_ratio/high_mean": 0.007697610300965607, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007697610300965607, "entropy": 0.22827458009123802, "epoch": 0.00368, "grad_norm": 0.8857015371322632, "kl": 0.5525546111166477, "learning_rate": 9.999959437218823e-06, "loss": -0.0309, "step": 184, "step_time": 6.532324665999113 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1829.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 1658.28125, "completions/mean_terminated_length": 1658.28125, "completions/min_length": 1517.0, "completions/min_terminated_length": 1517.0, "entropy": 0.21369964070618153, "epoch": 0.0037, "frac_reward_zero_std": 0.0, "grad_norm": 1.3246303796768188, "kl": 0.47560007125139236, "learning_rate": 9.999958887222293e-06, "loss": -0.2377, "num_tokens": 6817978.0, "reward": 2.9859910011291504, "reward_std": 4.5678887367248535, "rewards/rollout_reward_func/mean": 2.9859910011291504, "rewards/rollout_reward_func/std": 8.68184757232666, "sampling/importance_sampling_ratio/max": 2.1020846366882324, "sampling/importance_sampling_ratio/mean": 1.0168509483337402, "sampling/importance_sampling_ratio/min": 0.16904951632022858, "sampling/sampling_logp_difference/max": 1.180922269821167, "sampling/sampling_logp_difference/mean": 0.05478543043136597, "step": 185, "step_time": 36.360830822000025 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013671875, "entropy": 0.2150357235223055, "epoch": 0.00372, "grad_norm": 1.0169010162353516, "kl": 0.4974839948117733, "learning_rate": 9.999958333522109e-06, "loss": -0.2427, "step": 186, "step_time": 5.833081005999702 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 1707.5625, "completions/mean_terminated_length": 1707.5625, "completions/min_length": 1588.0, "completions/min_terminated_length": 1588.0, "entropy": 0.22898070700466633, "epoch": 0.00374, "frac_reward_zero_std": 0.0, "grad_norm": 1.298535943031311, "kl": 0.5328315943479538, "learning_rate": 9.999957776118273e-06, "loss": -0.087, "num_tokens": 6893533.0, "reward": -0.973763108253479, "reward_std": 5.657554626464844, "rewards/rollout_reward_func/mean": -0.973763108253479, "rewards/rollout_reward_func/std": 13.231979370117188, "sampling/importance_sampling_ratio/max": 2.4051334857940674, "sampling/importance_sampling_ratio/mean": 1.0956934690475464, "sampling/importance_sampling_ratio/min": 0.1452791541814804, "sampling/sampling_logp_difference/max": 1.2502107620239258, "sampling/sampling_logp_difference/mean": 0.05563567206263542, "step": 187, "step_time": 36.085149069999716 }, { "clip_ratio/high_max": 0.0234375, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01953125, "entropy": 0.23145854473114014, "epoch": 0.00376, "grad_norm": 1.0598479509353638, "kl": 0.5177410617470741, "learning_rate": 9.999957215010786e-06, "loss": -0.0919, "step": 188, "step_time": 5.831472408998707 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1764.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 1684.125, "completions/mean_terminated_length": 1684.125, "completions/min_length": 1553.0, "completions/min_terminated_length": 1553.0, "entropy": 0.20638220198452473, "epoch": 0.00378, "frac_reward_zero_std": 0.0, "grad_norm": 1.619344711303711, "kl": 0.3770788684487343, "learning_rate": 9.999956650199647e-06, "loss": -0.1053, "num_tokens": 6968120.0, "reward": 4.376737594604492, "reward_std": 4.202037811279297, "rewards/rollout_reward_func/mean": 4.376737594604492, "rewards/rollout_reward_func/std": 7.641676425933838, "sampling/importance_sampling_ratio/max": 2.5485401153564453, "sampling/importance_sampling_ratio/mean": 1.0775609016418457, "sampling/importance_sampling_ratio/min": 0.35982266068458557, "sampling/sampling_logp_difference/max": 0.8528366088867188, "sampling/sampling_logp_difference/mean": 0.04990018904209137, "step": 189, "step_time": 37.8296041270014 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013671875, "entropy": 0.20882763899862766, "epoch": 0.0038, "grad_norm": 1.2702538967132568, "kl": 0.39946580305695534, "learning_rate": 9.999956081684854e-06, "loss": -0.1089, "step": 190, "step_time": 6.217620228997475 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004036458441987634, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 1665.28125, "completions/mean_terminated_length": 1665.28125, "completions/min_length": 1322.0, "completions/min_terminated_length": 1322.0, "entropy": 0.22902198880910873, "epoch": 0.00382, "frac_reward_zero_std": 0.0, "grad_norm": 1.5835474729537964, "kl": 0.5039874613285065, "learning_rate": 9.999955509466414e-06, "loss": -0.1075, "num_tokens": 7042262.0, "reward": 2.3839476108551025, "reward_std": 4.715667724609375, "rewards/rollout_reward_func/mean": 2.3839476108551025, "rewards/rollout_reward_func/std": 9.231707572937012, "sampling/importance_sampling_ratio/max": 2.368562936782837, "sampling/importance_sampling_ratio/mean": 0.8959805965423584, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2122876644134521, "sampling/sampling_logp_difference/mean": 0.06825359165668488, "step": 191, "step_time": 36.59035418899839 }, { "clip_ratio/high_max": 0.020052083767950535, "clip_ratio/high_mean": 0.010026041883975267, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017838541883975267, "entropy": 0.22917617112398148, "epoch": 0.00384, "grad_norm": 1.1909282207489014, "kl": 0.5397434663027525, "learning_rate": 9.999954933544324e-06, "loss": -0.1104, "step": 192, "step_time": 5.772955709000598 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1807.0, "completions/max_terminated_length": 1807.0, "completions/mean_length": 1626.8125, "completions/mean_terminated_length": 1626.8125, "completions/min_length": 841.0, "completions/min_terminated_length": 841.0, "entropy": 0.22650794871151447, "epoch": 0.00386, "frac_reward_zero_std": 0.0, "grad_norm": 1.1048918962478638, "kl": 0.7586736418306828, "learning_rate": 9.999954353918583e-06, "loss": -0.16, "num_tokens": 7115196.0, "reward": 6.401093482971191, "reward_std": 8.368595123291016, "rewards/rollout_reward_func/mean": 6.401093482971191, "rewards/rollout_reward_func/std": 10.889983177185059, "sampling/importance_sampling_ratio/max": 2.1145029067993164, "sampling/importance_sampling_ratio/mean": 0.8738132119178772, "sampling/importance_sampling_ratio/min": 0.07303983718156815, "sampling/sampling_logp_difference/max": 1.4383344650268555, "sampling/sampling_logp_difference/mean": 0.06884820759296417, "step": 193, "step_time": 35.971025829999235 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.01953125, "entropy": 0.22370813973248005, "epoch": 0.00388, "grad_norm": 0.8128796219825745, "kl": 0.7968976274132729, "learning_rate": 9.999953770589195e-06, "loss": -0.1637, "step": 194, "step_time": 5.7590284980014985 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1784.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 1585.0625, "completions/mean_terminated_length": 1585.0625, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "entropy": 0.1900235153734684, "epoch": 0.0039, "frac_reward_zero_std": 0.125, "grad_norm": 0.860226571559906, "kl": 0.3734680116176605, "learning_rate": 9.999953183556157e-06, "loss": -0.0902, "num_tokens": 7186577.0, "reward": 3.927793264389038, "reward_std": 5.897523880004883, "rewards/rollout_reward_func/mean": 3.927793264389038, "rewards/rollout_reward_func/std": 13.196045875549316, "sampling/importance_sampling_ratio/max": 2.547750949859619, "sampling/importance_sampling_ratio/mean": 0.9823294878005981, "sampling/importance_sampling_ratio/min": 0.09533966332674026, "sampling/sampling_logp_difference/max": 1.204984426498413, "sampling/sampling_logp_difference/mean": 0.047755300998687744, "step": 195, "step_time": 35.797688914998616 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.18859419785439968, "epoch": 0.00392, "grad_norm": 0.834666907787323, "kl": 0.3848831467330456, "learning_rate": 9.999952592819472e-06, "loss": -0.0905, "step": 196, "step_time": 5.723597453000366 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1802.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 1687.1875, "completions/mean_terminated_length": 1687.1875, "completions/min_length": 1557.0, "completions/min_terminated_length": 1557.0, "entropy": 0.217379130423069, "epoch": 0.00394, "frac_reward_zero_std": 0.0, "grad_norm": 1.430611252784729, "kl": 0.6379734985530376, "learning_rate": 9.999951998379141e-06, "loss": -0.123, "num_tokens": 7260812.0, "reward": 2.1545767784118652, "reward_std": 5.987642765045166, "rewards/rollout_reward_func/mean": 2.1545767784118652, "rewards/rollout_reward_func/std": 7.767053127288818, "sampling/importance_sampling_ratio/max": 2.526907444000244, "sampling/importance_sampling_ratio/mean": 0.8589324951171875, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.872528076171875, "sampling/sampling_logp_difference/mean": 0.06381456553936005, "step": 197, "step_time": 35.551387553001405 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.21903221122920513, "epoch": 0.00396, "grad_norm": 1.5187474489212036, "kl": 0.6173972077667713, "learning_rate": 9.999951400235163e-06, "loss": -0.1276, "step": 198, "step_time": 5.812083100000564 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1799.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 1667.71875, "completions/mean_terminated_length": 1667.71875, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "entropy": 0.20807519741356373, "epoch": 0.00398, "frac_reward_zero_std": 0.0, "grad_norm": 1.3347526788711548, "kl": 0.47471469454467297, "learning_rate": 9.999950798387541e-06, "loss": 0.0517, "num_tokens": 7334918.0, "reward": -3.7062830924987793, "reward_std": 9.045034408569336, "rewards/rollout_reward_func/mean": -3.7062830924987793, "rewards/rollout_reward_func/std": 11.927645683288574, "sampling/importance_sampling_ratio/max": 1.9491627216339111, "sampling/importance_sampling_ratio/mean": 0.930275559425354, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.7468228340148926, "sampling/sampling_logp_difference/mean": 0.055586282163858414, "step": 199, "step_time": 35.96847454199906 }, { "clip_ratio/high_max": 0.0234375, "clip_ratio/high_mean": 0.013671875, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.2134502585977316, "epoch": 0.004, "grad_norm": 1.0929890871047974, "kl": 0.43012629821896553, "learning_rate": 9.999950192836272e-06, "loss": 0.0473, "step": 200, "step_time": 5.775383055001839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1768.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 1598.46875, "completions/mean_terminated_length": 1598.46875, "completions/min_length": 1034.0, "completions/min_terminated_length": 1034.0, "entropy": 0.22695614211261272, "epoch": 0.00402, "frac_reward_zero_std": 0.0, "grad_norm": 1.23384690284729, "kl": 0.5254122829064727, "learning_rate": 9.999949583581358e-06, "loss": -0.0825, "num_tokens": 7406851.0, "reward": 7.607372283935547, "reward_std": 9.699461936950684, "rewards/rollout_reward_func/mean": 7.607372283935547, "rewards/rollout_reward_func/std": 17.210861206054688, "sampling/importance_sampling_ratio/max": 1.7273722887039185, "sampling/importance_sampling_ratio/mean": 0.9000965356826782, "sampling/importance_sampling_ratio/min": 0.3363904654979706, "sampling/sampling_logp_difference/max": 1.1262538433074951, "sampling/sampling_logp_difference/mean": 0.05383963882923126, "step": 201, "step_time": 35.05857214800017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.23254447057843208, "epoch": 0.00404, "grad_norm": 1.067671298980713, "kl": 0.5273672332987189, "learning_rate": 9.999948970622801e-06, "loss": -0.0877, "step": 202, "step_time": 5.642065598000045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1830.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 1694.34375, "completions/mean_terminated_length": 1694.34375, "completions/min_length": 1444.0, "completions/min_terminated_length": 1444.0, "entropy": 0.2840390596538782, "epoch": 0.00406, "frac_reward_zero_std": 0.0, "grad_norm": 1.5355794429779053, "kl": 0.39024360105395317, "learning_rate": 9.9999483539606e-06, "loss": -0.0927, "num_tokens": 7481438.0, "reward": 0.7141335010528564, "reward_std": 3.9084863662719727, "rewards/rollout_reward_func/mean": 0.7141335010528564, "rewards/rollout_reward_func/std": 9.917291641235352, "sampling/importance_sampling_ratio/max": 2.4801061153411865, "sampling/importance_sampling_ratio/mean": 1.0379252433776855, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 28.232906341552734, "sampling/sampling_logp_difference/mean": 0.10223247855901718, "step": 203, "step_time": 35.488045403002616 }, { "clip_ratio/high_max": 0.006623641354963183, "clip_ratio/high_mean": 0.0033118206774815917, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005264945677481592, "entropy": 0.28290311247110367, "epoch": 0.00408, "grad_norm": 1.391993522644043, "kl": 0.37290242314338684, "learning_rate": 9.999947733594757e-06, "loss": -0.0955, "step": 204, "step_time": 5.823711978000574 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1780.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 1677.25, "completions/mean_terminated_length": 1677.25, "completions/min_length": 1565.0, "completions/min_terminated_length": 1565.0, "entropy": 0.25766652449965477, "epoch": 0.0041, "frac_reward_zero_std": 0.0, "grad_norm": 1.6373906135559082, "kl": 0.7895541861653328, "learning_rate": 9.99994710952527e-06, "loss": -0.2043, "num_tokens": 7555620.0, "reward": 0.5474326610565186, "reward_std": 5.27195930480957, "rewards/rollout_reward_func/mean": 0.5474326610565186, "rewards/rollout_reward_func/std": 9.263967514038086, "sampling/importance_sampling_ratio/max": 2.6494393348693848, "sampling/importance_sampling_ratio/mean": 0.9114236831665039, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2698535919189453, "sampling/sampling_logp_difference/mean": 0.07712831348180771, "step": 205, "step_time": 37.12327094200191 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01953125, "entropy": 0.2550558466464281, "epoch": 0.00412, "grad_norm": 1.3040579557418823, "kl": 0.8384794592857361, "learning_rate": 9.999946481752143e-06, "loss": -0.208, "step": 206, "step_time": 6.206274641001073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 1649.0625, "completions/mean_terminated_length": 1649.0625, "completions/min_length": 1216.0, "completions/min_terminated_length": 1216.0, "entropy": 0.2342894095927477, "epoch": 0.00414, "frac_reward_zero_std": 0.0, "grad_norm": 0.9534094929695129, "kl": 0.5438143275678158, "learning_rate": 9.999945850275376e-06, "loss": -0.1419, "num_tokens": 7628698.0, "reward": 4.569943428039551, "reward_std": 5.696385383605957, "rewards/rollout_reward_func/mean": 4.569943428039551, "rewards/rollout_reward_func/std": 11.701920509338379, "sampling/importance_sampling_ratio/max": 2.2198283672332764, "sampling/importance_sampling_ratio/mean": 0.913837194442749, "sampling/importance_sampling_ratio/min": 0.21426241099834442, "sampling/sampling_logp_difference/max": 0.973109245300293, "sampling/sampling_logp_difference/mean": 0.05454317480325699, "step": 207, "step_time": 34.45527249300176 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.2328789085149765, "epoch": 0.00416, "grad_norm": 0.9537069797515869, "kl": 0.5390445850789547, "learning_rate": 9.999945215094968e-06, "loss": -0.1447, "step": 208, "step_time": 5.694117715996981 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1822.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 1689.59375, "completions/mean_terminated_length": 1689.59375, "completions/min_length": 1565.0, "completions/min_terminated_length": 1565.0, "entropy": 0.23806917294859886, "epoch": 0.00418, "frac_reward_zero_std": 0.0, "grad_norm": 1.0990620851516724, "kl": 0.441210076212883, "learning_rate": 9.99994457621092e-06, "loss": -0.1143, "num_tokens": 7703515.0, "reward": -0.2485913634300232, "reward_std": 7.676312446594238, "rewards/rollout_reward_func/mean": -0.2485913634300232, "rewards/rollout_reward_func/std": 10.19659423828125, "sampling/importance_sampling_ratio/max": 2.198415994644165, "sampling/importance_sampling_ratio/mean": 0.9838775396347046, "sampling/importance_sampling_ratio/min": 0.2651430070400238, "sampling/sampling_logp_difference/max": 1.1037625074386597, "sampling/sampling_logp_difference/mean": 0.05238356068730354, "step": 209, "step_time": 35.312499495998054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.00390625, "entropy": 0.23526065610349178, "epoch": 0.0042, "grad_norm": 1.1086832284927368, "kl": 0.4285181984305382, "learning_rate": 9.999943933623233e-06, "loss": -0.119, "step": 210, "step_time": 5.8305381829995895 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1811.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 1694.34375, "completions/mean_terminated_length": 1694.34375, "completions/min_length": 1365.0, "completions/min_terminated_length": 1365.0, "entropy": 0.2092603985220194, "epoch": 0.00422, "frac_reward_zero_std": 0.0, "grad_norm": 1.210951328277588, "kl": 0.7087297588586807, "learning_rate": 9.999943287331909e-06, "loss": -0.1151, "num_tokens": 7778540.0, "reward": -0.5601588487625122, "reward_std": 4.062717437744141, "rewards/rollout_reward_func/mean": -0.5601588487625122, "rewards/rollout_reward_func/std": 7.223782539367676, "sampling/importance_sampling_ratio/max": 2.789172410964966, "sampling/importance_sampling_ratio/mean": 0.9125500917434692, "sampling/importance_sampling_ratio/min": 1.555470856284824e-11, "sampling/sampling_logp_difference/max": 23.064640045166016, "sampling/sampling_logp_difference/mean": 0.10738179087638855, "step": 211, "step_time": 36.472802145999594 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0036764706019312143, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009535845601931214, "entropy": 0.2104271575808525, "epoch": 0.00424, "grad_norm": 1.276593804359436, "kl": 0.6613003388047218, "learning_rate": 9.999942637336943e-06, "loss": -0.1181, "step": 212, "step_time": 6.7614001400033885 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 1787.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 1655.40625, "completions/mean_terminated_length": 1655.40625, "completions/min_length": 1447.0, "completions/min_terminated_length": 1447.0, "entropy": 0.255456043407321, "epoch": 0.00426, "frac_reward_zero_std": 0.0, "grad_norm": 2.1123297214508057, "kl": 0.6608983650803566, "learning_rate": 9.999941983638343e-06, "loss": 0.0397, "num_tokens": 7852267.0, "reward": -2.758002996444702, "reward_std": 6.682594299316406, "rewards/rollout_reward_func/mean": -2.758002996444702, "rewards/rollout_reward_func/std": 14.27979850769043, "sampling/importance_sampling_ratio/max": 2.8922348022460938, "sampling/importance_sampling_ratio/mean": 1.0855534076690674, "sampling/importance_sampling_ratio/min": 0.13145792484283447, "sampling/sampling_logp_difference/max": 1.5352246761322021, "sampling/sampling_logp_difference/mean": 0.06498946994543076, "step": 213, "step_time": 35.69279204999839 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.009706439450383186, "clip_ratio/low_min": 0.0037878789007663727, "clip_ratio/region_mean": 0.015565814450383186, "entropy": 0.2532905638217926, "epoch": 0.00428, "grad_norm": 1.6830919981002808, "kl": 0.6681476458907127, "learning_rate": 9.999941326236106e-06, "loss": 0.0344, "step": 214, "step_time": 5.748447021000175 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0038470644503831863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005800189450383186, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 1685.09375, "completions/mean_terminated_length": 1685.09375, "completions/min_length": 1064.0, "completions/min_terminated_length": 1064.0, "entropy": 0.304389376193285, "epoch": 0.0043, "frac_reward_zero_std": 0.0, "grad_norm": 1.0502004623413086, "kl": 0.5163263864815235, "learning_rate": 9.999940665130233e-06, "loss": -0.2013, "num_tokens": 7926984.0, "reward": 0.5382822751998901, "reward_std": 5.5314249992370605, "rewards/rollout_reward_func/mean": 0.5382822751998901, "rewards/rollout_reward_func/std": 8.343429565429688, "sampling/importance_sampling_ratio/max": 2.3640105724334717, "sampling/importance_sampling_ratio/mean": 1.0280786752700806, "sampling/importance_sampling_ratio/min": 2.17974208311227e-20, "sampling/sampling_logp_difference/max": 17.786712646484375, "sampling/sampling_logp_difference/mean": 0.15175247192382812, "step": 215, "step_time": 36.67021942499741 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.30115350522100925, "epoch": 0.00432, "grad_norm": 0.9675745964050293, "kl": 0.5201160311698914, "learning_rate": 9.999940000320726e-06, "loss": -0.2036, "step": 216, "step_time": 5.841338779997386 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 1673.5625, "completions/mean_terminated_length": 1673.5625, "completions/min_length": 1486.0, "completions/min_terminated_length": 1486.0, "entropy": 0.21431122720241547, "epoch": 0.00434, "frac_reward_zero_std": 0.0, "grad_norm": 1.535396933555603, "kl": 0.516041487455368, "learning_rate": 9.999939331807582e-06, "loss": -0.0649, "num_tokens": 8001770.0, "reward": 4.304098129272461, "reward_std": 6.8736419677734375, "rewards/rollout_reward_func/mean": 4.304098129272461, "rewards/rollout_reward_func/std": 15.587159156799316, "sampling/importance_sampling_ratio/max": 2.9547388553619385, "sampling/importance_sampling_ratio/mean": 1.0765466690063477, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.359184980392456, "sampling/sampling_logp_difference/mean": 0.05725931376218796, "step": 217, "step_time": 35.47372649599674 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.2103865798562765, "epoch": 0.00436, "grad_norm": 1.193834662437439, "kl": 0.5380803644657135, "learning_rate": 9.999938659590807e-06, "loss": -0.0676, "step": 218, "step_time": 6.670932665001601 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1786.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 1694.21875, "completions/mean_terminated_length": 1694.21875, "completions/min_length": 1544.0, "completions/min_terminated_length": 1544.0, "entropy": 0.1990877389907837, "epoch": 0.00438, "frac_reward_zero_std": 0.0, "grad_norm": 1.2838199138641357, "kl": 1.5804159492254257, "learning_rate": 9.999937983670399e-06, "loss": -0.1898, "num_tokens": 8076281.0, "reward": 4.216753005981445, "reward_std": 4.654492378234863, "rewards/rollout_reward_func/mean": 4.216753005981445, "rewards/rollout_reward_func/std": 7.823268413543701, "sampling/importance_sampling_ratio/max": 2.515824556350708, "sampling/importance_sampling_ratio/mean": 0.7889621257781982, "sampling/importance_sampling_ratio/min": 0.07681597769260406, "sampling/sampling_logp_difference/max": 2.2358238697052, "sampling/sampling_logp_difference/mean": 0.08003745228052139, "step": 219, "step_time": 37.58560269500049 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.19579317048192024, "epoch": 0.0044, "grad_norm": 0.9996125102043152, "kl": 1.5866832248866558, "learning_rate": 9.999937304046356e-06, "loss": -0.1923, "step": 220, "step_time": 5.752707515996008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 1667.9375, "completions/mean_terminated_length": 1667.9375, "completions/min_length": 1480.0, "completions/min_terminated_length": 1480.0, "entropy": 0.22602487355470657, "epoch": 0.00442, "frac_reward_zero_std": 0.0, "grad_norm": 0.9601463675498962, "kl": 0.6449089199304581, "learning_rate": 9.99993662071868e-06, "loss": -0.1863, "num_tokens": 8150161.0, "reward": -3.656409740447998, "reward_std": 5.3043951988220215, "rewards/rollout_reward_func/mean": -3.656409740447998, "rewards/rollout_reward_func/std": 11.171483993530273, "sampling/importance_sampling_ratio/max": 2.952061653137207, "sampling/importance_sampling_ratio/mean": 0.7157564163208008, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 25.000808715820312, "sampling/sampling_logp_difference/mean": 0.11645625531673431, "step": 221, "step_time": 37.025948885000616 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.021484375, "entropy": 0.22284850105643272, "epoch": 0.00444, "grad_norm": 0.906544029712677, "kl": 0.6455521509051323, "learning_rate": 9.999935933687375e-06, "loss": -0.1901, "step": 222, "step_time": 5.777160473999174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1813.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 1680.875, "completions/mean_terminated_length": 1680.875, "completions/min_length": 1399.0, "completions/min_terminated_length": 1399.0, "entropy": 0.20937101170420647, "epoch": 0.00446, "frac_reward_zero_std": 0.0, "grad_norm": 1.123390555381775, "kl": 0.7408150210976601, "learning_rate": 9.99993524295244e-06, "loss": -0.1761, "num_tokens": 8224415.0, "reward": -1.7068707942962646, "reward_std": 6.148789882659912, "rewards/rollout_reward_func/mean": -1.7068707942962646, "rewards/rollout_reward_func/std": 11.295839309692383, "sampling/importance_sampling_ratio/max": 2.4562361240386963, "sampling/importance_sampling_ratio/mean": 0.7759705781936646, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.6416139602661133, "sampling/sampling_logp_difference/mean": 0.07305672764778137, "step": 223, "step_time": 36.99016052300249 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.2060265801846981, "epoch": 0.00448, "grad_norm": 1.0533183813095093, "kl": 0.7319801151752472, "learning_rate": 9.999934548513875e-06, "loss": -0.1797, "step": 224, "step_time": 5.814677470998504 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004557291744276881, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 1656.375, "completions/mean_terminated_length": 1656.375, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "entropy": 0.18023328483104706, "epoch": 0.0045, "frac_reward_zero_std": 0.125, "grad_norm": 1.3207415342330933, "kl": 0.9109714720398188, "learning_rate": 9.999933850371681e-06, "loss": -0.2514, "num_tokens": 8297842.0, "reward": 0.6653532981872559, "reward_std": 4.2065510749816895, "rewards/rollout_reward_func/mean": 0.6653532981872559, "rewards/rollout_reward_func/std": 7.791869163513184, "sampling/importance_sampling_ratio/max": 2.713641405105591, "sampling/importance_sampling_ratio/mean": 0.8001816868782043, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.8015203475952148, "sampling/sampling_logp_difference/mean": 0.07049349695444107, "step": 225, "step_time": 35.520827905998885 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.004557291744276881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006510416744276881, "entropy": 0.1753668338060379, "epoch": 0.00452, "grad_norm": 1.195987343788147, "kl": 0.9277527779340744, "learning_rate": 9.999933148525858e-06, "loss": -0.2553, "step": 226, "step_time": 5.7582581370006665 }, { "clip_ratio/high_max": 0.0036764706019312143, "clip_ratio/high_mean": 0.0018382353009656072, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037913601845502853, "completions/clipped_ratio": 0.0, "completions/max_length": 1834.0, "completions/max_terminated_length": 1834.0, "completions/mean_length": 1666.375, "completions/mean_terminated_length": 1666.375, "completions/min_length": 1242.0, "completions/min_terminated_length": 1242.0, "entropy": 0.20567630976438522, "epoch": 0.00454, "frac_reward_zero_std": 0.0, "grad_norm": 2.6567699909210205, "kl": 0.8131851889193058, "learning_rate": 9.999932442976408e-06, "loss": 0.2988, "num_tokens": 8371968.0, "reward": 1.6306254863739014, "reward_std": 4.1761884689331055, "rewards/rollout_reward_func/mean": 1.6306254863739014, "rewards/rollout_reward_func/std": 13.871889114379883, "sampling/importance_sampling_ratio/max": 2.815732717514038, "sampling/importance_sampling_ratio/mean": 0.9413543939590454, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 26.72653579711914, "sampling/sampling_logp_difference/mean": 0.11560939252376556, "step": 227, "step_time": 35.89721404099873 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.20439518056809902, "epoch": 0.00456, "grad_norm": 1.5760877132415771, "kl": 0.8568692058324814, "learning_rate": 9.99993173372333e-06, "loss": 0.2944, "step": 228, "step_time": 5.827364698001475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1787.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 1671.65625, "completions/mean_terminated_length": 1671.65625, "completions/min_length": 1521.0, "completions/min_terminated_length": 1521.0, "entropy": 0.22973632253706455, "epoch": 0.00458, "frac_reward_zero_std": 0.0, "grad_norm": 1.1046069860458374, "kl": 1.1489849016070366, "learning_rate": 9.999931020766626e-06, "loss": -0.3328, "num_tokens": 8446037.0, "reward": 3.500030279159546, "reward_std": 7.045429229736328, "rewards/rollout_reward_func/mean": 3.500030279159546, "rewards/rollout_reward_func/std": 8.772632598876953, "sampling/importance_sampling_ratio/max": 2.272545099258423, "sampling/importance_sampling_ratio/mean": 0.9003783464431763, "sampling/importance_sampling_ratio/min": 7.296939094625365e-12, "sampling/sampling_logp_difference/max": 24.617910385131836, "sampling/sampling_logp_difference/mean": 0.12970557808876038, "step": 229, "step_time": 37.32937073400353 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.009650735184550285, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019416360184550285, "entropy": 0.22551355883479118, "epoch": 0.0046, "grad_norm": 1.1626572608947754, "kl": 1.2178469970822334, "learning_rate": 9.999930304106296e-06, "loss": -0.3359, "step": 230, "step_time": 5.758782369999608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1806.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 1643.5, "completions/mean_terminated_length": 1643.5, "completions/min_length": 1231.0, "completions/min_terminated_length": 1231.0, "entropy": 0.18366078101098537, "epoch": 0.00462, "frac_reward_zero_std": 0.0, "grad_norm": 1.5905568599700928, "kl": 0.48303202725946903, "learning_rate": 9.99992958374234e-06, "loss": -0.0905, "num_tokens": 8519605.0, "reward": 6.464164733886719, "reward_std": 3.211686372756958, "rewards/rollout_reward_func/mean": 6.464164733886719, "rewards/rollout_reward_func/std": 16.602075576782227, "sampling/importance_sampling_ratio/max": 2.2648463249206543, "sampling/importance_sampling_ratio/mean": 0.9129816293716431, "sampling/importance_sampling_ratio/min": 1.7741487651274626e-18, "sampling/sampling_logp_difference/max": 25.275915145874023, "sampling/sampling_logp_difference/mean": 0.17626813054084778, "step": 231, "step_time": 35.11706399999821 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.003791360300965607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007697610300965607, "entropy": 0.1828291267156601, "epoch": 0.00464, "grad_norm": 1.4188190698623657, "kl": 0.4656708240509033, "learning_rate": 9.999928859674762e-06, "loss": -0.0945, "step": 232, "step_time": 5.8302720290066645 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.0, "completions/max_length": 1836.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 1539.03125, "completions/mean_terminated_length": 1539.03125, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "entropy": 0.20367534644901752, "epoch": 0.00466, "frac_reward_zero_std": 0.0, "grad_norm": 3.0687413215637207, "kl": 1.4640729799866676, "learning_rate": 9.999928131903557e-06, "loss": 0.0491, "num_tokens": 8589842.0, "reward": 7.841023921966553, "reward_std": 7.656625747680664, "rewards/rollout_reward_func/mean": 7.841023921966553, "rewards/rollout_reward_func/std": 13.105290412902832, "sampling/importance_sampling_ratio/max": 2.7232446670532227, "sampling/importance_sampling_ratio/mean": 0.7804238796234131, "sampling/importance_sampling_ratio/min": 0.13633084297180176, "sampling/sampling_logp_difference/max": 2.269442558288574, "sampling/sampling_logp_difference/mean": 0.06575377285480499, "step": 233, "step_time": 33.579131972999676 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008463541744276881, "entropy": 0.20685471966862679, "epoch": 0.00468, "grad_norm": 2.082902193069458, "kl": 1.0169252753257751, "learning_rate": 9.999927400428733e-06, "loss": 0.0434, "step": 234, "step_time": 5.8269990400003735 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 1803.0, "completions/max_terminated_length": 1803.0, "completions/mean_length": 1706.3125, "completions/mean_terminated_length": 1706.3125, "completions/min_length": 1573.0, "completions/min_terminated_length": 1573.0, "entropy": 0.1788796652108431, "epoch": 0.0047, "frac_reward_zero_std": 0.0, "grad_norm": 1.1087870597839355, "kl": 0.5621245931833982, "learning_rate": 9.999926665250287e-06, "loss": -0.0253, "num_tokens": 8664926.0, "reward": 2.5220413208007812, "reward_std": 3.3794925212860107, "rewards/rollout_reward_func/mean": 2.5220413208007812, "rewards/rollout_reward_func/std": 5.743170261383057, "sampling/importance_sampling_ratio/max": 2.7984251976013184, "sampling/importance_sampling_ratio/mean": 0.8231557607650757, "sampling/importance_sampling_ratio/min": 0.2042286992073059, "sampling/sampling_logp_difference/max": 1.3143759965896606, "sampling/sampling_logp_difference/mean": 0.058986593037843704, "step": 235, "step_time": 35.771082822997414 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.18589365482330322, "epoch": 0.00472, "grad_norm": 1.253989338874817, "kl": 0.5263552982360125, "learning_rate": 9.999925926368217e-06, "loss": -0.028, "step": 236, "step_time": 5.811419122999723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 1694.59375, "completions/mean_terminated_length": 1694.59375, "completions/min_length": 1606.0, "completions/min_terminated_length": 1606.0, "entropy": 0.17474745213985443, "epoch": 0.00474, "frac_reward_zero_std": 0.0, "grad_norm": 1.3517323732376099, "kl": 0.8328725062310696, "learning_rate": 9.999925183782528e-06, "loss": -0.1822, "num_tokens": 8739844.0, "reward": 0.9312727451324463, "reward_std": 4.400314807891846, "rewards/rollout_reward_func/mean": 0.9312727451324463, "rewards/rollout_reward_func/std": 12.9136381149292, "sampling/importance_sampling_ratio/max": 1.6590906381607056, "sampling/importance_sampling_ratio/mean": 0.8407135605812073, "sampling/importance_sampling_ratio/min": 0.12429223209619522, "sampling/sampling_logp_difference/max": 1.6658029556274414, "sampling/sampling_logp_difference/mean": 0.05644724518060684, "step": 237, "step_time": 36.696694154996294 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.01171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.1794893555343151, "epoch": 0.00476, "grad_norm": 1.0692886114120483, "kl": 0.8188136555254459, "learning_rate": 9.99992443749322e-06, "loss": -0.184, "step": 238, "step_time": 5.769766430996242 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1819.0, "completions/max_terminated_length": 1819.0, "completions/mean_length": 1614.9375, "completions/mean_terminated_length": 1614.9375, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "entropy": 0.18347695656120777, "epoch": 0.00478, "frac_reward_zero_std": 0.125, "grad_norm": 2.0552101135253906, "kl": 0.3097874727100134, "learning_rate": 9.99992368750029e-06, "loss": 0.0694, "num_tokens": 8812269.0, "reward": 8.014856338500977, "reward_std": 4.783290863037109, "rewards/rollout_reward_func/mean": 8.014856338500977, "rewards/rollout_reward_func/std": 12.856328964233398, "sampling/importance_sampling_ratio/max": 2.936600923538208, "sampling/importance_sampling_ratio/mean": 1.0911171436309814, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1101388931274414, "sampling/sampling_logp_difference/mean": 0.040012530982494354, "step": 239, "step_time": 33.62449117500364 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.006138392956927419, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011997767956927419, "entropy": 0.1877030562609434, "epoch": 0.0048, "grad_norm": 1.3781535625457764, "kl": 0.3060177452862263, "learning_rate": 9.999922933803743e-06, "loss": 0.0649, "step": 240, "step_time": 6.31635257200287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 1668.1875, "completions/mean_terminated_length": 1668.1875, "completions/min_length": 1092.0, "completions/min_terminated_length": 1092.0, "entropy": 0.2379392758011818, "epoch": 0.00482, "frac_reward_zero_std": 0.0, "grad_norm": 1.326453685760498, "kl": 0.6201226636767387, "learning_rate": 9.999922176403579e-06, "loss": -0.1752, "num_tokens": 8886070.0, "reward": -0.9512331485748291, "reward_std": 5.470861911773682, "rewards/rollout_reward_func/mean": -0.9512331485748291, "rewards/rollout_reward_func/std": 10.099501609802246, "sampling/importance_sampling_ratio/max": 2.10463285446167, "sampling/importance_sampling_ratio/mean": 0.9869784116744995, "sampling/importance_sampling_ratio/min": 1.0831281184453534e-11, "sampling/sampling_logp_difference/max": 23.32485580444336, "sampling/sampling_logp_difference/mean": 0.11264218389987946, "step": 241, "step_time": 36.34408466700006 }, { "clip_ratio/high_max": 0.011488970601931214, "clip_ratio/high_mean": 0.007697610300965607, "clip_ratio/low_mean": 0.013020833488553762, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.02071844378951937, "entropy": 0.2382805310189724, "epoch": 0.00484, "grad_norm": 0.9912970662117004, "kl": 0.5726640149950981, "learning_rate": 9.999921415299796e-06, "loss": -0.1802, "step": 242, "step_time": 5.751053724003214 }, { "clip_ratio/high_max": 0.0234375, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.017578125, "completions/clipped_ratio": 0.0, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 1690.5, "completions/mean_terminated_length": 1690.5, "completions/min_length": 1430.0, "completions/min_terminated_length": 1430.0, "entropy": 0.19938462413847446, "epoch": 0.00486, "frac_reward_zero_std": 0.125, "grad_norm": 1.312639594078064, "kl": 0.6614471711218357, "learning_rate": 9.9999206504924e-06, "loss": -0.1751, "num_tokens": 8960640.0, "reward": 1.9128804206848145, "reward_std": 4.455126762390137, "rewards/rollout_reward_func/mean": 1.9128804206848145, "rewards/rollout_reward_func/std": 7.3313374519348145, "sampling/importance_sampling_ratio/max": 2.6208341121673584, "sampling/importance_sampling_ratio/mean": 1.0078068971633911, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.7909104824066162, "sampling/sampling_logp_difference/mean": 0.060140155255794525, "step": 243, "step_time": 36.51225224099835 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.01171875, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.017578125, "entropy": 0.19580959528684616, "epoch": 0.00488, "grad_norm": 1.1415836811065674, "kl": 0.6649853922426701, "learning_rate": 9.999919881981385e-06, "loss": -0.1771, "step": 244, "step_time": 5.780806564001978 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1801.0, "completions/max_terminated_length": 1801.0, "completions/mean_length": 1675.125, "completions/mean_terminated_length": 1675.125, "completions/min_length": 1362.0, "completions/min_terminated_length": 1362.0, "entropy": 0.24602361768484116, "epoch": 0.0049, "frac_reward_zero_std": 0.0, "grad_norm": 1.2397838830947876, "kl": 0.5857466273009777, "learning_rate": 9.99991910976676e-06, "loss": -0.041, "num_tokens": 9034635.0, "reward": 1.560486912727356, "reward_std": 6.24477481842041, "rewards/rollout_reward_func/mean": 1.560486912727356, "rewards/rollout_reward_func/std": 11.325467109680176, "sampling/importance_sampling_ratio/max": 1.8394286632537842, "sampling/importance_sampling_ratio/mean": 0.8830543756484985, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 20.549360275268555, "sampling/sampling_logp_difference/mean": 0.12986382842063904, "step": 245, "step_time": 36.87764433600023 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.24585570394992828, "epoch": 0.00492, "grad_norm": 1.0776090621948242, "kl": 0.6381884589791298, "learning_rate": 9.999918333848517e-06, "loss": -0.0432, "step": 246, "step_time": 6.26423046400123 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 1672.25, "completions/mean_terminated_length": 1672.25, "completions/min_length": 1498.0, "completions/min_terminated_length": 1498.0, "entropy": 0.1877462100237608, "epoch": 0.00494, "frac_reward_zero_std": 0.125, "grad_norm": 1.3833686113357544, "kl": 0.9569054283201694, "learning_rate": 9.999917554226663e-06, "loss": -0.0526, "num_tokens": 9108890.0, "reward": 4.796204090118408, "reward_std": 5.669317722320557, "rewards/rollout_reward_func/mean": 4.796204090118408, "rewards/rollout_reward_func/std": 10.93161678314209, "sampling/importance_sampling_ratio/max": 2.1446852684020996, "sampling/importance_sampling_ratio/mean": 0.9228631258010864, "sampling/importance_sampling_ratio/min": 0.04307766631245613, "sampling/sampling_logp_difference/max": 1.969184160232544, "sampling/sampling_logp_difference/mean": 0.06175312399864197, "step": 247, "step_time": 35.40395965700009 }, { "clip_ratio/high_max": 0.0234375, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.18796673975884914, "epoch": 0.00496, "grad_norm": 1.1867882013320923, "kl": 0.8879027515649796, "learning_rate": 9.999916770901197e-06, "loss": -0.0583, "step": 248, "step_time": 5.787131861001399 }, { "clip_ratio/high_max": 0.011488970601931214, "clip_ratio/high_mean": 0.005744485300965607, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005744485300965607, "completions/clipped_ratio": 0.0, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 1709.9375, "completions/mean_terminated_length": 1709.9375, "completions/min_length": 1346.0, "completions/min_terminated_length": 1346.0, "entropy": 0.18857184424996376, "epoch": 0.00498, "frac_reward_zero_std": 0.125, "grad_norm": 1.3715571165084839, "kl": 0.6793716847896576, "learning_rate": 9.999915983872118e-06, "loss": 0.0529, "num_tokens": 9184343.0, "reward": 8.366767883300781, "reward_std": 4.157901287078857, "rewards/rollout_reward_func/mean": 8.366767883300781, "rewards/rollout_reward_func/std": 10.230462074279785, "sampling/importance_sampling_ratio/max": 1.9824284315109253, "sampling/importance_sampling_ratio/mean": 0.9958308339118958, "sampling/importance_sampling_ratio/min": 9.667034967658639e-13, "sampling/sampling_logp_difference/max": 26.4067325592041, "sampling/sampling_logp_difference/mean": 0.10002212971448898, "step": 249, "step_time": 34.97354119299962 }, { "clip_ratio/high_max": 0.023207720601931214, "clip_ratio/high_mean": 0.013556985300965607, "clip_ratio/low_mean": 0.009650735300965607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023207720601931214, "entropy": 0.18937593139708042, "epoch": 0.005, "grad_norm": 0.9603456854820251, "kl": 0.6774471215903759, "learning_rate": 9.99991519313943e-06, "loss": 0.0489, "step": 250, "step_time": 5.79304310400039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1829.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 1708.8125, "completions/mean_terminated_length": 1708.8125, "completions/min_length": 1359.0, "completions/min_terminated_length": 1359.0, "entropy": 0.1984053999185562, "epoch": 0.00502, "frac_reward_zero_std": 0.0, "grad_norm": 2.047070026397705, "kl": 1.8021456748247147, "learning_rate": 9.999914398703129e-06, "loss": -0.038, "num_tokens": 9259484.0, "reward": 6.946834087371826, "reward_std": 5.76743221282959, "rewards/rollout_reward_func/mean": 6.946834087371826, "rewards/rollout_reward_func/std": 14.136740684509277, "sampling/importance_sampling_ratio/max": 2.717007875442505, "sampling/importance_sampling_ratio/mean": 1.0246163606643677, "sampling/importance_sampling_ratio/min": 6.007295355603404e-11, "sampling/sampling_logp_difference/max": 23.782129287719727, "sampling/sampling_logp_difference/mean": 0.11047248542308807, "step": 251, "step_time": 37.13627710699984 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.005744485300965607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015510110300965607, "entropy": 0.20306510664522648, "epoch": 0.00504, "grad_norm": 1.5997041463851929, "kl": 1.5759989619255066, "learning_rate": 9.99991360056322e-06, "loss": -0.0419, "step": 252, "step_time": 5.818557085000066 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013671875, "completions/clipped_ratio": 0.0, "completions/max_length": 1787.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 1716.0, "completions/mean_terminated_length": 1716.0, "completions/min_length": 1596.0, "completions/min_terminated_length": 1596.0, "entropy": 0.24246199801564217, "epoch": 0.00506, "frac_reward_zero_std": 0.0, "grad_norm": 1.3612204790115356, "kl": 0.5908343940973282, "learning_rate": 9.999912798719703e-06, "loss": -0.078, "num_tokens": 9334973.0, "reward": 6.846713542938232, "reward_std": 8.991424560546875, "rewards/rollout_reward_func/mean": 6.846713542938232, "rewards/rollout_reward_func/std": 9.810685157775879, "sampling/importance_sampling_ratio/max": 2.0176548957824707, "sampling/importance_sampling_ratio/mean": 0.9388402104377747, "sampling/importance_sampling_ratio/min": 0.17772360146045685, "sampling/sampling_logp_difference/max": 1.4709601402282715, "sampling/sampling_logp_difference/mean": 0.06256502121686935, "step": 253, "step_time": 36.63974985200002 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017578125, "entropy": 0.24471604451537132, "epoch": 0.00508, "grad_norm": 1.3345394134521484, "kl": 0.5763493180274963, "learning_rate": 9.999911993172577e-06, "loss": -0.081, "step": 254, "step_time": 5.774125941998136 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1767.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 1612.78125, "completions/mean_terminated_length": 1612.78125, "completions/min_length": 612.0, "completions/min_terminated_length": 612.0, "entropy": 0.19570082426071167, "epoch": 0.0051, "frac_reward_zero_std": 0.0, "grad_norm": 1.4826679229736328, "kl": 0.6293303966522217, "learning_rate": 9.999911183921846e-06, "loss": -0.1805, "num_tokens": 9407379.0, "reward": 3.33709716796875, "reward_std": 4.991109848022461, "rewards/rollout_reward_func/mean": 3.33709716796875, "rewards/rollout_reward_func/std": 10.105610847473145, "sampling/importance_sampling_ratio/max": 2.305222511291504, "sampling/importance_sampling_ratio/mean": 0.985419750213623, "sampling/importance_sampling_ratio/min": 0.12469647079706192, "sampling/sampling_logp_difference/max": 1.334002137184143, "sampling/sampling_logp_difference/mean": 0.039491329342126846, "step": 255, "step_time": 35.05917253399275 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "entropy": 0.19735204800963402, "epoch": 0.00512, "grad_norm": 1.344710111618042, "kl": 0.5909395255148411, "learning_rate": 9.999910370967508e-06, "loss": -0.184, "step": 256, "step_time": 5.756917624001289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1787.0, "completions/max_terminated_length": 1787.0, "completions/mean_length": 1546.15625, "completions/mean_terminated_length": 1546.15625, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "entropy": 0.1884195413440466, "epoch": 0.00514, "frac_reward_zero_std": 0.125, "grad_norm": 1.3824015855789185, "kl": 0.3445777054876089, "learning_rate": 9.999909554309565e-06, "loss": -0.0226, "num_tokens": 9477160.0, "reward": 1.7944698333740234, "reward_std": 7.204285621643066, "rewards/rollout_reward_func/mean": 1.7944698333740234, "rewards/rollout_reward_func/std": 15.566790580749512, "sampling/importance_sampling_ratio/max": 1.7864809036254883, "sampling/importance_sampling_ratio/mean": 0.9369933605194092, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.003820776939392, "sampling/sampling_logp_difference/mean": 0.04316379129886627, "step": 257, "step_time": 36.06274823200329 }, { "clip_ratio/high_max": 0.013020833488553762, "clip_ratio/high_mean": 0.008463541977107525, "clip_ratio/low_mean": 0.004557291744276881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013020833721384406, "entropy": 0.19156392104923725, "epoch": 0.00516, "grad_norm": 1.3535003662109375, "kl": 0.3384551331400871, "learning_rate": 9.999908733948019e-06, "loss": -0.0275, "step": 258, "step_time": 5.753975410003477 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 1711.4375, "completions/mean_terminated_length": 1711.4375, "completions/min_length": 1467.0, "completions/min_terminated_length": 1467.0, "entropy": 0.22483949549496174, "epoch": 0.00518, "frac_reward_zero_std": 0.0, "grad_norm": 1.9391635656356812, "kl": 0.404471006244421, "learning_rate": 9.999907909882866e-06, "loss": -0.0004, "num_tokens": 9552804.0, "reward": -1.872456431388855, "reward_std": 3.4258460998535156, "rewards/rollout_reward_func/mean": -1.872456431388855, "rewards/rollout_reward_func/std": 10.646695137023926, "sampling/importance_sampling_ratio/max": 2.742706537246704, "sampling/importance_sampling_ratio/mean": 0.9974928498268127, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1340551376342773, "sampling/sampling_logp_difference/mean": 0.04761877655982971, "step": 259, "step_time": 35.838881236002635 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.22717351838946342, "epoch": 0.0052, "grad_norm": 1.7140471935272217, "kl": 0.41914040222764015, "learning_rate": 9.999907082114113e-06, "loss": -0.0055, "step": 260, "step_time": 5.778732164999383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1773.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 1634.71875, "completions/mean_terminated_length": 1634.71875, "completions/min_length": 1312.0, "completions/min_terminated_length": 1312.0, "entropy": 0.22770886309444904, "epoch": 0.00522, "frac_reward_zero_std": 0.0, "grad_norm": 1.3558635711669922, "kl": 0.603220384567976, "learning_rate": 9.999906250641757e-06, "loss": -0.0501, "num_tokens": 9625522.0, "reward": 8.935173988342285, "reward_std": 8.57618522644043, "rewards/rollout_reward_func/mean": 8.935173988342285, "rewards/rollout_reward_func/std": 14.713119506835938, "sampling/importance_sampling_ratio/max": 1.785384178161621, "sampling/importance_sampling_ratio/mean": 0.9235571622848511, "sampling/importance_sampling_ratio/min": 4.5317635797921906e-17, "sampling/sampling_logp_difference/max": 21.840742111206055, "sampling/sampling_logp_difference/mean": 0.10905331373214722, "step": 261, "step_time": 34.28906107799594 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.009765625, "entropy": 0.2287269402295351, "epoch": 0.00524, "grad_norm": 1.2795958518981934, "kl": 0.6055284570902586, "learning_rate": 9.9999054154658e-06, "loss": -0.0522, "step": 262, "step_time": 5.695764250996945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1779.0, "completions/max_terminated_length": 1779.0, "completions/mean_length": 1626.8125, "completions/mean_terminated_length": 1626.8125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 0.25801716931164265, "epoch": 0.00526, "frac_reward_zero_std": 0.0, "grad_norm": 2.5012454986572266, "kl": 1.5967573653906584, "learning_rate": 9.999904576586242e-06, "loss": -0.1271, "num_tokens": 9698251.0, "reward": 6.984275817871094, "reward_std": 8.85349178314209, "rewards/rollout_reward_func/mean": 6.984275817871094, "rewards/rollout_reward_func/std": 9.481241226196289, "sampling/importance_sampling_ratio/max": 2.636712074279785, "sampling/importance_sampling_ratio/mean": 0.9890180826187134, "sampling/importance_sampling_ratio/min": 6.474818459167864e-10, "sampling/sampling_logp_difference/max": 21.65591049194336, "sampling/sampling_logp_difference/mean": 0.08674205839633942, "step": 263, "step_time": 37.276569184001346 }, { "clip_ratio/high_max": 0.011488970601931214, "clip_ratio/high_mean": 0.007697610300965607, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009650735300965607, "entropy": 0.26125726476311684, "epoch": 0.00528, "grad_norm": 2.118901491165161, "kl": 1.4512584786862135, "learning_rate": 9.999903734003084e-06, "loss": -0.1336, "step": 264, "step_time": 5.745041152002159 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1776.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 1624.78125, "completions/mean_terminated_length": 1624.78125, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "entropy": 0.22747422195971012, "epoch": 0.0053, "frac_reward_zero_std": 0.0, "grad_norm": 1.3240665197372437, "kl": 0.2730750413611531, "learning_rate": 9.999902887716329e-06, "loss": -0.2542, "num_tokens": 9771080.0, "reward": -2.6601641178131104, "reward_std": 5.340880870819092, "rewards/rollout_reward_func/mean": -2.6601641178131104, "rewards/rollout_reward_func/std": 8.568302154541016, "sampling/importance_sampling_ratio/max": 2.1538095474243164, "sampling/importance_sampling_ratio/mean": 1.0698328018188477, "sampling/importance_sampling_ratio/min": 2.7574214611965437e-18, "sampling/sampling_logp_difference/max": 25.709339141845703, "sampling/sampling_logp_difference/mean": 0.1604633778333664, "step": 265, "step_time": 35.148489481998695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004356971243396401, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004356971243396401, "entropy": 0.22533436864614487, "epoch": 0.00532, "grad_norm": 1.1646156311035156, "kl": 0.27127676364034414, "learning_rate": 9.999902037725978e-06, "loss": -0.2582, "step": 266, "step_time": 5.661973868998757 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "completions/clipped_ratio": 0.0, "completions/max_length": 1758.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 1673.6875, "completions/mean_terminated_length": 1673.6875, "completions/min_length": 1527.0, "completions/min_terminated_length": 1527.0, "entropy": 0.2219645120203495, "epoch": 0.00534, "frac_reward_zero_std": 0.0, "grad_norm": 1.302032709121704, "kl": 0.7969283424317837, "learning_rate": 9.999901184032026e-06, "loss": -0.0049, "num_tokens": 9845301.0, "reward": 4.670074939727783, "reward_std": 4.829067230224609, "rewards/rollout_reward_func/mean": 4.670074939727783, "rewards/rollout_reward_func/std": 12.953495025634766, "sampling/importance_sampling_ratio/max": 2.7272145748138428, "sampling/importance_sampling_ratio/mean": 0.9917882680892944, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0262956619262695, "sampling/sampling_logp_difference/mean": 0.052393145859241486, "step": 267, "step_time": 35.151963327001795 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017578125, "entropy": 0.21915441751480103, "epoch": 0.00536, "grad_norm": 1.4033211469650269, "kl": 0.8538418840616941, "learning_rate": 9.999900326634479e-06, "loss": -0.0088, "step": 268, "step_time": 6.602220959000988 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006119791883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1815.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 1656.53125, "completions/mean_terminated_length": 1656.53125, "completions/min_length": 1121.0, "completions/min_terminated_length": 1121.0, "entropy": 0.22007588855922222, "epoch": 0.00538, "frac_reward_zero_std": 0.0, "grad_norm": 1.6505411863327026, "kl": 0.35358257219195366, "learning_rate": 9.999899465533338e-06, "loss": -0.0248, "num_tokens": 9919144.0, "reward": -0.7222501039505005, "reward_std": 6.3980913162231445, "rewards/rollout_reward_func/mean": -0.7222501039505005, "rewards/rollout_reward_func/std": 10.620811462402344, "sampling/importance_sampling_ratio/max": 2.1663742065429688, "sampling/importance_sampling_ratio/mean": 0.9528242349624634, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.454941749572754, "sampling/sampling_logp_difference/mean": 0.04491892457008362, "step": 269, "step_time": 35.42834161200153 }, { "clip_ratio/high_max": 0.008370535913854837, "clip_ratio/high_mean": 0.006268601398915052, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006268601398915052, "entropy": 0.22122930735349655, "epoch": 0.0054, "grad_norm": 1.4210882186889648, "kl": 0.3376936595886946, "learning_rate": 9.999898600728599e-06, "loss": -0.0281, "step": 270, "step_time": 5.819409946001542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0018382353009656072, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018382353009656072, "completions/clipped_ratio": 0.0, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 1610.53125, "completions/mean_terminated_length": 1610.53125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "entropy": 0.2343030981719494, "epoch": 0.00542, "frac_reward_zero_std": 0.0, "grad_norm": 1.257897138595581, "kl": 0.5152953676879406, "learning_rate": 9.99989773222027e-06, "loss": -0.1168, "num_tokens": 9991321.0, "reward": 5.170373439788818, "reward_std": 8.166898727416992, "rewards/rollout_reward_func/mean": 5.170373439788818, "rewards/rollout_reward_func/std": 11.35286808013916, "sampling/importance_sampling_ratio/max": 2.139758825302124, "sampling/importance_sampling_ratio/mean": 0.9269818663597107, "sampling/importance_sampling_ratio/min": 4.740564885086229e-11, "sampling/sampling_logp_difference/max": 23.737991333007812, "sampling/sampling_logp_difference/mean": 0.08782394975423813, "step": 271, "step_time": 33.72024956799942 }, { "clip_ratio/high_max": 0.014062500093132257, "clip_ratio/high_mean": 0.008869485231116414, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008869485231116414, "entropy": 0.23815979063510895, "epoch": 0.00544, "grad_norm": 1.2076594829559326, "kl": 0.5196094233542681, "learning_rate": 9.999896860008346e-06, "loss": -0.1191, "step": 272, "step_time": 5.770830136003497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1821.0, "completions/max_terminated_length": 1821.0, "completions/mean_length": 1693.09375, "completions/mean_terminated_length": 1693.09375, "completions/min_length": 1545.0, "completions/min_terminated_length": 1545.0, "entropy": 0.24442918226122856, "epoch": 0.00546, "frac_reward_zero_std": 0.0, "grad_norm": 1.7162142992019653, "kl": 0.4226865954697132, "learning_rate": 9.999895984092831e-06, "loss": -0.0679, "num_tokens": 10066041.0, "reward": -1.6974170207977295, "reward_std": 6.5335187911987305, "rewards/rollout_reward_func/mean": -1.6974170207977295, "rewards/rollout_reward_func/std": 12.975322723388672, "sampling/importance_sampling_ratio/max": 2.2359983921051025, "sampling/importance_sampling_ratio/mean": 1.0108340978622437, "sampling/importance_sampling_ratio/min": 0.21830794215202332, "sampling/sampling_logp_difference/max": 0.9789800643920898, "sampling/sampling_logp_difference/mean": 0.05331780016422272, "step": 273, "step_time": 37.78565727399837 }, { "clip_ratio/high_max": 0.02734375, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.013671875, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.029296875, "entropy": 0.24502579309046268, "epoch": 0.00548, "grad_norm": 1.5775851011276245, "kl": 0.41350132040679455, "learning_rate": 9.999895104473725e-06, "loss": -0.0729, "step": 274, "step_time": 6.474478788997658 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1760.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 1534.1875, "completions/mean_terminated_length": 1534.1875, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "entropy": 0.22560410387814045, "epoch": 0.0055, "frac_reward_zero_std": 0.0, "grad_norm": 1.2990355491638184, "kl": 0.43298108130693436, "learning_rate": 9.99989422115103e-06, "loss": -0.004, "num_tokens": 10135257.0, "reward": -7.943665027618408, "reward_std": 6.444216251373291, "rewards/rollout_reward_func/mean": -7.943665027618408, "rewards/rollout_reward_func/std": 17.156274795532227, "sampling/importance_sampling_ratio/max": 1.7411776781082153, "sampling/importance_sampling_ratio/mean": 0.9610604643821716, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.30423903465271, "sampling/sampling_logp_difference/mean": 0.03806076943874359, "step": 275, "step_time": 31.760855431999516 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.006510416744276881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009114583488553762, "entropy": 0.2212864998728037, "epoch": 0.00552, "grad_norm": 1.0952664613723755, "kl": 0.43978017941117287, "learning_rate": 9.999893334124745e-06, "loss": -0.0056, "step": 276, "step_time": 5.71456800100168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 1633.125, "completions/mean_terminated_length": 1633.125, "completions/min_length": 641.0, "completions/min_terminated_length": 641.0, "entropy": 0.2532899435609579, "epoch": 0.00554, "frac_reward_zero_std": 0.0, "grad_norm": 1.429874062538147, "kl": 0.6599202789366245, "learning_rate": 9.99989244339487e-06, "loss": -0.2194, "num_tokens": 10208192.0, "reward": -1.6404476165771484, "reward_std": 7.98501443862915, "rewards/rollout_reward_func/mean": -1.6404476165771484, "rewards/rollout_reward_func/std": 10.619029998779297, "sampling/importance_sampling_ratio/max": 2.4701716899871826, "sampling/importance_sampling_ratio/mean": 0.91197669506073, "sampling/importance_sampling_ratio/min": 0.14245294034481049, "sampling/sampling_logp_difference/max": 1.5070490837097168, "sampling/sampling_logp_difference/mean": 0.05014316365122795, "step": 277, "step_time": 36.03027680699779 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.005744485300965607, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009650735300965607, "entropy": 0.24842147901654243, "epoch": 0.00556, "grad_norm": 1.1689871549606323, "kl": 0.6852821763604879, "learning_rate": 9.999891548961409e-06, "loss": -0.2256, "step": 278, "step_time": 5.809108069001013 }, { "clip_ratio/high_max": 0.010156250093132257, "clip_ratio/high_mean": 0.005078125046566129, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005078125046566129, "completions/clipped_ratio": 0.0, "completions/max_length": 1799.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 1578.4375, "completions/mean_terminated_length": 1578.4375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "entropy": 0.1794843440875411, "epoch": 0.00558, "frac_reward_zero_std": 0.0, "grad_norm": 1.132340908050537, "kl": 0.5266461782157421, "learning_rate": 9.999890650824362e-06, "loss": -0.032, "num_tokens": 10279208.0, "reward": 7.001322269439697, "reward_std": 10.167577743530273, "rewards/rollout_reward_func/mean": 7.001322269439697, "rewards/rollout_reward_func/std": 19.354318618774414, "sampling/importance_sampling_ratio/max": 1.6495786905288696, "sampling/importance_sampling_ratio/mean": 0.8468691110610962, "sampling/importance_sampling_ratio/min": 0.30830448865890503, "sampling/sampling_logp_difference/max": 1.2979681491851807, "sampling/sampling_logp_difference/mean": 0.04318168759346008, "step": 279, "step_time": 35.29856429800202 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.17729349061846733, "epoch": 0.0056, "grad_norm": 0.9538998007774353, "kl": 0.5198981948196888, "learning_rate": 9.999889748983727e-06, "loss": -0.034, "step": 280, "step_time": 6.418356822001442 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1810.0, "completions/max_terminated_length": 1810.0, "completions/mean_length": 1660.09375, "completions/mean_terminated_length": 1660.09375, "completions/min_length": 1468.0, "completions/min_terminated_length": 1468.0, "entropy": 0.22451071441173553, "epoch": 0.00562, "frac_reward_zero_std": 0.0, "grad_norm": 1.166965365409851, "kl": 0.46352874115109444, "learning_rate": 9.999888843439508e-06, "loss": -0.0105, "num_tokens": 10352876.0, "reward": 0.566585898399353, "reward_std": 3.541024684906006, "rewards/rollout_reward_func/mean": 0.566585898399353, "rewards/rollout_reward_func/std": 5.582573890686035, "sampling/importance_sampling_ratio/max": 2.2927284240722656, "sampling/importance_sampling_ratio/mean": 0.962428092956543, "sampling/importance_sampling_ratio/min": 0.35809534788131714, "sampling/sampling_logp_difference/max": 0.8381476402282715, "sampling/sampling_logp_difference/mean": 0.04447294771671295, "step": 281, "step_time": 38.91772174799917 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2204912230372429, "epoch": 0.00564, "grad_norm": 1.1434154510498047, "kl": 0.4616197645664215, "learning_rate": 9.999887934191706e-06, "loss": -0.0105, "step": 282, "step_time": 5.863146633000724 }, { "clip_ratio/high_max": 0.0036764706019312143, "clip_ratio/high_mean": 0.0018382353009656072, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018382353009656072, "completions/clipped_ratio": 0.0, "completions/max_length": 1792.0, "completions/max_terminated_length": 1792.0, "completions/mean_length": 1650.9375, "completions/mean_terminated_length": 1650.9375, "completions/min_length": 1334.0, "completions/min_terminated_length": 1334.0, "entropy": 0.1282420428469777, "epoch": 0.00566, "frac_reward_zero_std": 0.125, "grad_norm": 1.2290998697280884, "kl": 0.8328725211322308, "learning_rate": 9.99988702124032e-06, "loss": -0.0243, "num_tokens": 10425939.0, "reward": 3.5679147243499756, "reward_std": 4.225099086761475, "rewards/rollout_reward_func/mean": 3.5679147243499756, "rewards/rollout_reward_func/std": 6.293881893157959, "sampling/importance_sampling_ratio/max": 1.7022098302841187, "sampling/importance_sampling_ratio/mean": 0.8366431593894958, "sampling/importance_sampling_ratio/min": 3.1675653853341368e-12, "sampling/sampling_logp_difference/max": 26.352497100830078, "sampling/sampling_logp_difference/mean": 0.08530725538730621, "step": 283, "step_time": 36.011625641003775 }, { "clip_ratio/high_max": 0.007582720601931214, "clip_ratio/high_mean": 0.003791360300965607, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005744485300965607, "entropy": 0.12602030392736197, "epoch": 0.00568, "grad_norm": 1.02045476436615, "kl": 0.7334638945758343, "learning_rate": 9.99988610458535e-06, "loss": -0.0291, "step": 284, "step_time": 5.740358126000501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004126082407310605, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004126082407310605, "completions/clipped_ratio": 0.0, "completions/max_length": 1760.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 1634.625, "completions/mean_terminated_length": 1634.625, "completions/min_length": 1178.0, "completions/min_terminated_length": 1178.0, "entropy": 0.18211907986551523, "epoch": 0.0057, "frac_reward_zero_std": 0.0, "grad_norm": 2.0849609375, "kl": 0.623099785298109, "learning_rate": 9.999885184226803e-06, "loss": -0.0595, "num_tokens": 10498682.0, "reward": -1.8584654331207275, "reward_std": 3.9287822246551514, "rewards/rollout_reward_func/mean": -1.8584654331207275, "rewards/rollout_reward_func/std": 12.91176986694336, "sampling/importance_sampling_ratio/max": 2.443671464920044, "sampling/importance_sampling_ratio/mean": 1.0301685333251953, "sampling/importance_sampling_ratio/min": 0.1954619437456131, "sampling/sampling_logp_difference/max": 1.7249445915222168, "sampling/sampling_logp_difference/mean": 0.034441880881786346, "step": 285, "step_time": 35.48645443800342 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.1817738525569439, "epoch": 0.00572, "grad_norm": 1.5344396829605103, "kl": 0.6733249872922897, "learning_rate": 9.999884260164672e-06, "loss": -0.0645, "step": 286, "step_time": 5.6632803509983205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1776.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 1682.5, "completions/mean_terminated_length": 1682.5, "completions/min_length": 1124.0, "completions/min_terminated_length": 1124.0, "entropy": 0.16152677033096552, "epoch": 0.00574, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796907186508179, "kl": 0.7577519491314888, "learning_rate": 9.999883332398963e-06, "loss": -0.1018, "num_tokens": 10573032.0, "reward": 3.490713119506836, "reward_std": 6.548834800720215, "rewards/rollout_reward_func/mean": 3.490713119506836, "rewards/rollout_reward_func/std": 9.092996597290039, "sampling/importance_sampling_ratio/max": 1.64301335811615, "sampling/importance_sampling_ratio/mean": 0.9178614020347595, "sampling/importance_sampling_ratio/min": 0.23202794790267944, "sampling/sampling_logp_difference/max": 0.8612185120582581, "sampling/sampling_logp_difference/mean": 0.03624614328145981, "step": 287, "step_time": 35.95332784499624 }, { "clip_ratio/high_max": 0.020089285913854837, "clip_ratio/high_mean": 0.010044642956927419, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011997767956927419, "entropy": 0.16308189649134874, "epoch": 0.00576, "grad_norm": 0.9740341901779175, "kl": 0.7837924063205719, "learning_rate": 9.999882400929674e-06, "loss": -0.1047, "step": 288, "step_time": 5.759164831997623 }, { "clip_ratio/high_max": 0.013392857275903225, "clip_ratio/high_mean": 0.0066964286379516125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008649553637951612, "completions/clipped_ratio": 0.0, "completions/max_length": 1803.0, "completions/max_terminated_length": 1803.0, "completions/mean_length": 1571.25, "completions/mean_terminated_length": 1571.25, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.1844630278646946, "epoch": 0.00578, "frac_reward_zero_std": 0.125, "grad_norm": 1.0486916303634644, "kl": 0.8353649526834488, "learning_rate": 9.99988146575681e-06, "loss": -0.0224, "num_tokens": 10643454.0, "reward": 2.7757768630981445, "reward_std": 6.543875694274902, "rewards/rollout_reward_func/mean": 2.7757768630981445, "rewards/rollout_reward_func/std": 10.743654251098633, "sampling/importance_sampling_ratio/max": 2.0809102058410645, "sampling/importance_sampling_ratio/mean": 0.8418534994125366, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.082690954208374, "sampling/sampling_logp_difference/mean": 0.04390227794647217, "step": 289, "step_time": 34.91289532799965 }, { "clip_ratio/high_max": 0.017299107275903225, "clip_ratio/high_mean": 0.008649553637951612, "clip_ratio/low_mean": 0.0031250000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011774553684517741, "entropy": 0.18694760277867317, "epoch": 0.0058, "grad_norm": 1.0219008922576904, "kl": 0.8292638063430786, "learning_rate": 9.999880526880366e-06, "loss": -0.0233, "step": 290, "step_time": 5.847304276998329 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1817.0, "completions/max_terminated_length": 1817.0, "completions/mean_length": 1715.9375, "completions/mean_terminated_length": 1715.9375, "completions/min_length": 1599.0, "completions/min_terminated_length": 1599.0, "entropy": 0.21534875221550465, "epoch": 0.00582, "frac_reward_zero_std": 0.0, "grad_norm": 1.4715776443481445, "kl": 0.5219082608819008, "learning_rate": 9.999879584300349e-06, "loss": -0.1126, "num_tokens": 10719360.0, "reward": 4.551000118255615, "reward_std": 5.435481071472168, "rewards/rollout_reward_func/mean": 4.551000118255615, "rewards/rollout_reward_func/std": 8.276861190795898, "sampling/importance_sampling_ratio/max": 1.947494387626648, "sampling/importance_sampling_ratio/mean": 1.0701745748519897, "sampling/importance_sampling_ratio/min": 0.39692434668540955, "sampling/sampling_logp_difference/max": 0.9076583385467529, "sampling/sampling_logp_difference/mean": 0.038155265152454376, "step": 291, "step_time": 37.32454300199788 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.21467485465109348, "epoch": 0.00584, "grad_norm": 1.3881194591522217, "kl": 0.511255357414484, "learning_rate": 9.999878638016756e-06, "loss": -0.1134, "step": 292, "step_time": 5.892084002003685 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1781.0, "completions/max_terminated_length": 1781.0, "completions/mean_length": 1696.09375, "completions/mean_terminated_length": 1696.09375, "completions/min_length": 1594.0, "completions/min_terminated_length": 1594.0, "entropy": 0.14797488693147898, "epoch": 0.00586, "frac_reward_zero_std": 0.25, "grad_norm": 1.0587458610534668, "kl": 0.39671463891863823, "learning_rate": 9.99987768802959e-06, "loss": 0.025, "num_tokens": 10794350.0, "reward": 5.8876543045043945, "reward_std": 4.631452560424805, "rewards/rollout_reward_func/mean": 5.8876543045043945, "rewards/rollout_reward_func/std": 9.654105186462402, "sampling/importance_sampling_ratio/max": 1.8918991088867188, "sampling/importance_sampling_ratio/mean": 1.0400714874267578, "sampling/importance_sampling_ratio/min": 0.2940644919872284, "sampling/sampling_logp_difference/max": 0.8748996257781982, "sampling/sampling_logp_difference/mean": 0.02706913650035858, "step": 293, "step_time": 36.40785624799901 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.14734902698546648, "epoch": 0.00588, "grad_norm": 1.0053006410598755, "kl": 0.3986317291855812, "learning_rate": 9.99987673433885e-06, "loss": 0.0219, "step": 294, "step_time": 5.827138864997323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1826.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 1693.59375, "completions/mean_terminated_length": 1693.59375, "completions/min_length": 1599.0, "completions/min_terminated_length": 1599.0, "entropy": 0.19574224390089512, "epoch": 0.0059, "frac_reward_zero_std": 0.125, "grad_norm": 1.5421663522720337, "kl": 0.4815644100308418, "learning_rate": 9.999875776944539e-06, "loss": -0.16, "num_tokens": 10869166.0, "reward": 3.4602537155151367, "reward_std": 4.304999351501465, "rewards/rollout_reward_func/mean": 3.4602537155151367, "rewards/rollout_reward_func/std": 7.722873210906982, "sampling/importance_sampling_ratio/max": 1.93669593334198, "sampling/importance_sampling_ratio/mean": 1.0370523929595947, "sampling/importance_sampling_ratio/min": 0.21605992317199707, "sampling/sampling_logp_difference/max": 1.3471612930297852, "sampling/sampling_logp_difference/mean": 0.036538854241371155, "step": 295, "step_time": 36.86570157999995 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.01171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.19377287477254868, "epoch": 0.00592, "grad_norm": 1.3266241550445557, "kl": 0.5180523283779621, "learning_rate": 9.999874815846656e-06, "loss": -0.1637, "step": 296, "step_time": 6.44450496600075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 1699.59375, "completions/mean_terminated_length": 1699.59375, "completions/min_length": 1464.0, "completions/min_terminated_length": 1464.0, "entropy": 0.22864390537142754, "epoch": 0.00594, "frac_reward_zero_std": 0.0, "grad_norm": 1.4940208196640015, "kl": 0.3851666897535324, "learning_rate": 9.999873851045202e-06, "loss": -0.0427, "num_tokens": 10944209.0, "reward": 0.1467803716659546, "reward_std": 6.4113945960998535, "rewards/rollout_reward_func/mean": 0.1467803716659546, "rewards/rollout_reward_func/std": 11.033947944641113, "sampling/importance_sampling_ratio/max": 2.8933799266815186, "sampling/importance_sampling_ratio/mean": 1.0202676057815552, "sampling/importance_sampling_ratio/min": 1.6225042143158674e-11, "sampling/sampling_logp_difference/max": 24.00332260131836, "sampling/sampling_logp_difference/mean": 0.08574045449495316, "step": 297, "step_time": 36.83895620099793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.22310753911733627, "epoch": 0.00596, "grad_norm": 1.392610788345337, "kl": 0.3954205773770809, "learning_rate": 9.999872882540181e-06, "loss": -0.0431, "step": 298, "step_time": 5.832336111001496 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1768.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 1678.71875, "completions/mean_terminated_length": 1678.71875, "completions/min_length": 1547.0, "completions/min_terminated_length": 1547.0, "entropy": 0.18405075185000896, "epoch": 0.00598, "frac_reward_zero_std": 0.0, "grad_norm": 1.8699392080307007, "kl": 0.937393918633461, "learning_rate": 9.999871910331592e-06, "loss": 0.0681, "num_tokens": 11018366.0, "reward": 3.816401958465576, "reward_std": 3.8979978561401367, "rewards/rollout_reward_func/mean": 3.816401958465576, "rewards/rollout_reward_func/std": 7.810318946838379, "sampling/importance_sampling_ratio/max": 2.2833974361419678, "sampling/importance_sampling_ratio/mean": 1.0459489822387695, "sampling/importance_sampling_ratio/min": 0.1880369782447815, "sampling/sampling_logp_difference/max": 1.4234182834625244, "sampling/sampling_logp_difference/mean": 0.048677269369363785, "step": 299, "step_time": 36.90797988000122 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0038470644503831863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011659564450383186, "entropy": 0.18204295076429844, "epoch": 0.006, "grad_norm": 1.5673085451126099, "kl": 0.9782428927719593, "learning_rate": 9.999870934419434e-06, "loss": 0.0681, "step": 300, "step_time": 5.689609519002261 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009765625, "completions/clipped_ratio": 0.0, "completions/max_length": 1801.0, "completions/max_terminated_length": 1801.0, "completions/mean_length": 1530.09375, "completions/mean_terminated_length": 1530.09375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.18989801779389381, "epoch": 0.00602, "frac_reward_zero_std": 0.125, "grad_norm": 1.7532182931900024, "kl": 0.6885460149496794, "learning_rate": 9.999869954803708e-06, "loss": -0.0852, "num_tokens": 11088088.0, "reward": -2.989968776702881, "reward_std": 3.5043578147888184, "rewards/rollout_reward_func/mean": -2.989968776702881, "rewards/rollout_reward_func/std": 11.289237976074219, "sampling/importance_sampling_ratio/max": 2.31423020362854, "sampling/importance_sampling_ratio/mean": 0.9575661420822144, "sampling/importance_sampling_ratio/min": 0.04499343782663345, "sampling/sampling_logp_difference/max": 1.339418888092041, "sampling/sampling_logp_difference/mean": 0.05697993189096451, "step": 301, "step_time": 33.824989040998844 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.18932953476905823, "epoch": 0.00604, "grad_norm": 2.035459041595459, "kl": 0.6695586815476418, "learning_rate": 9.999868971484418e-06, "loss": -0.0882, "step": 302, "step_time": 6.704577034002796 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1802.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 1654.34375, "completions/mean_terminated_length": 1654.34375, "completions/min_length": 1392.0, "completions/min_terminated_length": 1392.0, "entropy": 0.16859493404626846, "epoch": 0.00606, "frac_reward_zero_std": 0.125, "grad_norm": 1.2433701753616333, "kl": 0.3330823089927435, "learning_rate": 9.999867984461565e-06, "loss": -0.1347, "num_tokens": 11161407.0, "reward": 13.324233055114746, "reward_std": 5.227249622344971, "rewards/rollout_reward_func/mean": 13.324233055114746, "rewards/rollout_reward_func/std": 18.111255645751953, "sampling/importance_sampling_ratio/max": 2.574556827545166, "sampling/importance_sampling_ratio/mean": 1.0179166793823242, "sampling/importance_sampling_ratio/min": 4.849272411824878e-19, "sampling/sampling_logp_difference/max": 21.414751052856445, "sampling/sampling_logp_difference/mean": 0.11061778664588928, "step": 303, "step_time": 35.15028436499961 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.16938963159918785, "epoch": 0.00608, "grad_norm": 1.2953479290008545, "kl": 0.32322092168033123, "learning_rate": 9.999866993735148e-06, "loss": -0.1376, "step": 304, "step_time": 5.814720251999461 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1763.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 1696.5, "completions/mean_terminated_length": 1696.5, "completions/min_length": 1519.0, "completions/min_terminated_length": 1519.0, "entropy": 0.17914600856602192, "epoch": 0.0061, "frac_reward_zero_std": 0.0, "grad_norm": 2.047422409057617, "kl": 0.9124260395765305, "learning_rate": 9.99986599930517e-06, "loss": 0.0776, "num_tokens": 11236260.0, "reward": -3.4361958503723145, "reward_std": 5.964578628540039, "rewards/rollout_reward_func/mean": -3.4361958503723145, "rewards/rollout_reward_func/std": 12.477890014648438, "sampling/importance_sampling_ratio/max": 1.9305469989776611, "sampling/importance_sampling_ratio/mean": 1.063968300819397, "sampling/importance_sampling_ratio/min": 0.14159531891345978, "sampling/sampling_logp_difference/max": 1.7825407981872559, "sampling/sampling_logp_difference/mean": 0.03782513365149498, "step": 305, "step_time": 36.04515673100104 }, { "clip_ratio/high_max": 0.01953125, "clip_ratio/high_mean": 0.009765625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.1830028723925352, "epoch": 0.00612, "grad_norm": 2.021735906600952, "kl": 0.8423058073967695, "learning_rate": 9.999865001171628e-06, "loss": 0.0726, "step": 306, "step_time": 5.666386218999833 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1779.0, "completions/max_terminated_length": 1779.0, "completions/mean_length": 1692.84375, "completions/mean_terminated_length": 1692.84375, "completions/min_length": 1583.0, "completions/min_terminated_length": 1583.0, "entropy": 0.26686032861471176, "epoch": 0.00614, "frac_reward_zero_std": 0.0, "grad_norm": 1.5039609670639038, "kl": 0.6252574324607849, "learning_rate": 9.999863999334527e-06, "loss": -0.043, "num_tokens": 11311146.0, "reward": 0.6570155620574951, "reward_std": 3.567687749862671, "rewards/rollout_reward_func/mean": 0.6570155620574951, "rewards/rollout_reward_func/std": 5.788002014160156, "sampling/importance_sampling_ratio/max": 1.9915469884872437, "sampling/importance_sampling_ratio/mean": 0.9949770569801331, "sampling/importance_sampling_ratio/min": 0.28174740076065063, "sampling/sampling_logp_difference/max": 1.0719170570373535, "sampling/sampling_logp_difference/mean": 0.04079074412584305, "step": 307, "step_time": 36.82269417800126 }, { "clip_ratio/high_max": 0.017153532709926367, "clip_ratio/high_mean": 0.010529891587793827, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010529891587793827, "entropy": 0.2710601706057787, "epoch": 0.00616, "grad_norm": 1.2901983261108398, "kl": 0.6167760603129864, "learning_rate": 9.999862993793865e-06, "loss": -0.0489, "step": 308, "step_time": 6.247776632999376 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1789.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 1684.09375, "completions/mean_terminated_length": 1684.09375, "completions/min_length": 1566.0, "completions/min_terminated_length": 1566.0, "entropy": 0.19205584935843945, "epoch": 0.00618, "frac_reward_zero_std": 0.25, "grad_norm": 0.8880062103271484, "kl": 0.5570718720555305, "learning_rate": 9.999861984549646e-06, "loss": 0.0028, "num_tokens": 11385685.0, "reward": 1.6213198900222778, "reward_std": 5.165748596191406, "rewards/rollout_reward_func/mean": 1.6213198900222778, "rewards/rollout_reward_func/std": 10.621575355529785, "sampling/importance_sampling_ratio/max": 1.731754183769226, "sampling/importance_sampling_ratio/mean": 0.9596688747406006, "sampling/importance_sampling_ratio/min": 0.27524271607398987, "sampling/sampling_logp_difference/max": 0.9177696704864502, "sampling/sampling_logp_difference/mean": 0.040488895028829575, "step": 309, "step_time": 36.28644880200045 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.005859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.19744597002863884, "epoch": 0.0062, "grad_norm": 0.904339075088501, "kl": 0.5681257173418999, "learning_rate": 9.99986097160187e-06, "loss": -0.0008, "step": 310, "step_time": 5.7859738810002455 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1817.0, "completions/max_terminated_length": 1817.0, "completions/mean_length": 1667.03125, "completions/mean_terminated_length": 1667.03125, "completions/min_length": 1510.0, "completions/min_terminated_length": 1510.0, "entropy": 0.19374503754079342, "epoch": 0.00622, "frac_reward_zero_std": 0.0, "grad_norm": 1.271896243095398, "kl": 0.5406992174685001, "learning_rate": 9.999859954950535e-06, "loss": 0.0642, "num_tokens": 11459322.0, "reward": 4.310371398925781, "reward_std": 5.449921607971191, "rewards/rollout_reward_func/mean": 4.310371398925781, "rewards/rollout_reward_func/std": 10.012842178344727, "sampling/importance_sampling_ratio/max": 2.3058712482452393, "sampling/importance_sampling_ratio/mean": 1.0564281940460205, "sampling/importance_sampling_ratio/min": 0.29696083068847656, "sampling/sampling_logp_difference/max": 1.0833686590194702, "sampling/sampling_logp_difference/mean": 0.03807468339800835, "step": 311, "step_time": 39.552110792998064 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.01171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01953125, "entropy": 0.19619728438556194, "epoch": 0.00624, "grad_norm": 1.1710026264190674, "kl": 0.5339053124189377, "learning_rate": 9.999858934595648e-06, "loss": 0.0602, "step": 312, "step_time": 5.784206759000881 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1820.0, "completions/max_terminated_length": 1820.0, "completions/mean_length": 1676.375, "completions/mean_terminated_length": 1676.375, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "entropy": 0.23158937133848667, "epoch": 0.00626, "frac_reward_zero_std": 0.0, "grad_norm": 1.2677243947982788, "kl": 0.8580169584602118, "learning_rate": 9.999857910537204e-06, "loss": -0.112, "num_tokens": 11533666.0, "reward": 4.958746433258057, "reward_std": 8.339164733886719, "rewards/rollout_reward_func/mean": 4.958746433258057, "rewards/rollout_reward_func/std": 11.135608673095703, "sampling/importance_sampling_ratio/max": 2.853358745574951, "sampling/importance_sampling_ratio/mean": 0.9754152297973633, "sampling/importance_sampling_ratio/min": 7.993090099672372e-18, "sampling/sampling_logp_difference/max": 19.442655563354492, "sampling/sampling_logp_difference/mean": 0.12714992463588715, "step": 313, "step_time": 36.64092221500323 }, { "clip_ratio/high_max": 0.020432692486792803, "clip_ratio/high_mean": 0.012169471243396401, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0160757212433964, "entropy": 0.22766240686178207, "epoch": 0.00628, "grad_norm": 1.0967687368392944, "kl": 0.8410563804209232, "learning_rate": 9.999856882775207e-06, "loss": -0.1182, "step": 314, "step_time": 6.334955206000814 }, { "clip_ratio/high_max": 0.004464285913854837, "clip_ratio/high_mean": 0.004185267724096775, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006268601166084409, "completions/clipped_ratio": 0.0, "completions/max_length": 1786.0, "completions/max_terminated_length": 1786.0, "completions/mean_length": 1605.5625, "completions/mean_terminated_length": 1605.5625, "completions/min_length": 1013.0, "completions/min_terminated_length": 1013.0, "entropy": 0.2169735934585333, "epoch": 0.0063, "frac_reward_zero_std": 0.0, "grad_norm": 1.8618820905685425, "kl": 0.4524347148835659, "learning_rate": 9.999855851309658e-06, "loss": -0.033, "num_tokens": 11605988.0, "reward": 4.734364032745361, "reward_std": 8.99662971496582, "rewards/rollout_reward_func/mean": 4.734364032745361, "rewards/rollout_reward_func/std": 19.494327545166016, "sampling/importance_sampling_ratio/max": 2.5793893337249756, "sampling/importance_sampling_ratio/mean": 1.028910756111145, "sampling/importance_sampling_ratio/min": 0.28705474734306335, "sampling/sampling_logp_difference/max": 1.0486016273498535, "sampling/sampling_logp_difference/mean": 0.04716923087835312, "step": 315, "step_time": 32.90857644800053 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.007942708441987634, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.013802083441987634, "entropy": 0.21472137793898582, "epoch": 0.00632, "grad_norm": 1.891473412513733, "kl": 0.4403880871832371, "learning_rate": 9.999854816140558e-06, "loss": -0.0362, "step": 316, "step_time": 5.758545238999432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1774.0, "completions/max_terminated_length": 1774.0, "completions/mean_length": 1642.8125, "completions/mean_terminated_length": 1642.8125, "completions/min_length": 1517.0, "completions/min_terminated_length": 1517.0, "entropy": 0.2397665549069643, "epoch": 0.00634, "frac_reward_zero_std": 0.0, "grad_norm": 1.6888549327850342, "kl": 0.5931244716048241, "learning_rate": 9.999853777267907e-06, "loss": -0.1138, "num_tokens": 11679099.0, "reward": -2.320553779602051, "reward_std": 8.914901733398438, "rewards/rollout_reward_func/mean": -2.320553779602051, "rewards/rollout_reward_func/std": 15.233270645141602, "sampling/importance_sampling_ratio/max": 2.8582732677459717, "sampling/importance_sampling_ratio/mean": 1.0699962377548218, "sampling/importance_sampling_ratio/min": 4.828970068867372e-14, "sampling/sampling_logp_difference/max": 30.196020126342773, "sampling/sampling_logp_difference/mean": 0.11480410397052765, "step": 317, "step_time": 35.76147220499843 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.009765625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.013671875, "entropy": 0.24086195416748524, "epoch": 0.00636, "grad_norm": 1.3944815397262573, "kl": 0.5775415897369385, "learning_rate": 9.999852734691707e-06, "loss": -0.1189, "step": 318, "step_time": 6.165533587000027 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 1617.375, "completions/mean_terminated_length": 1617.375, "completions/min_length": 1386.0, "completions/min_terminated_length": 1386.0, "entropy": 0.2378612495958805, "epoch": 0.00638, "frac_reward_zero_std": 0.0, "grad_norm": 1.602665662765503, "kl": 0.5662538819015026, "learning_rate": 9.999851688411959e-06, "loss": 0.0126, "num_tokens": 11751577.0, "reward": 5.039939880371094, "reward_std": 6.29449462890625, "rewards/rollout_reward_func/mean": 5.039939880371094, "rewards/rollout_reward_func/std": 8.958051681518555, "sampling/importance_sampling_ratio/max": 2.9241015911102295, "sampling/importance_sampling_ratio/mean": 1.070831060409546, "sampling/importance_sampling_ratio/min": 0.13934408128261566, "sampling/sampling_logp_difference/max": 1.6112802028656006, "sampling/sampling_logp_difference/mean": 0.06019854545593262, "step": 319, "step_time": 36.286148504001176 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.00390625, "clip_ratio/region_mean": 0.009765625, "entropy": 0.23569470085203648, "epoch": 0.0064, "grad_norm": 1.5978208780288696, "kl": 0.5462238825857639, "learning_rate": 9.999850638428662e-06, "loss": 0.0095, "step": 320, "step_time": 6.246102994995454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2001.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1882.96875, "completions/mean_terminated_length": 1882.96875, "completions/min_length": 1658.0, "completions/min_terminated_length": 1658.0, "entropy": 0.20035061426460743, "epoch": 0.00642, "frac_reward_zero_std": 0.0, "grad_norm": 1.2168484926223755, "kl": 0.5520738363265991, "learning_rate": 9.99984958474182e-06, "loss": 0.0107, "num_tokens": 11832614.0, "reward": 1.5634541511535645, "reward_std": 4.704677581787109, "rewards/rollout_reward_func/mean": 1.5634541511535645, "rewards/rollout_reward_func/std": 6.349411964416504, "sampling/importance_sampling_ratio/max": 1.8423182964324951, "sampling/importance_sampling_ratio/mean": 0.8929311633110046, "sampling/importance_sampling_ratio/min": 2.0857898741510894e-11, "sampling/sampling_logp_difference/max": 21.57457160949707, "sampling/sampling_logp_difference/mean": 0.08621137589216232, "step": 321, "step_time": 38.96727589300099 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.00685307034291327, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008589181466959417, "entropy": 0.2013362180441618, "epoch": 0.00644, "grad_norm": 1.1177294254302979, "kl": 0.5334508754312992, "learning_rate": 9.999848527351434e-06, "loss": 0.0072, "step": 322, "step_time": 6.2201312969991704 }, { "clip_ratio/high_max": 0.0062806373462080956, "clip_ratio/high_mean": 0.0031403186731040478, "clip_ratio/low_mean": 0.0030381944961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006178513169288635, "completions/clipped_ratio": 0.0, "completions/max_length": 1941.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 1820.03125, "completions/mean_terminated_length": 1820.03125, "completions/min_length": 1447.0, "completions/min_terminated_length": 1447.0, "entropy": 0.1882778126746416, "epoch": 0.00646, "frac_reward_zero_std": 0.0, "grad_norm": 1.5726302862167358, "kl": 0.49259715899825096, "learning_rate": 9.999847466257501e-06, "loss": -0.0134, "num_tokens": 11911368.0, "reward": 9.643357276916504, "reward_std": 7.6050519943237305, "rewards/rollout_reward_func/mean": 9.643357276916504, "rewards/rollout_reward_func/std": 13.524218559265137, "sampling/importance_sampling_ratio/max": 2.3086092472076416, "sampling/importance_sampling_ratio/mean": 0.9916382431983948, "sampling/importance_sampling_ratio/min": 1.4195225812507194e-19, "sampling/sampling_logp_difference/max": 22.288389205932617, "sampling/sampling_logp_difference/mean": 0.1148686632514, "step": 323, "step_time": 38.506465823003964 }, { "clip_ratio/high_max": 0.016697304090484977, "clip_ratio/high_mean": 0.008348652045242488, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015293096657842398, "entropy": 0.1865296196192503, "epoch": 0.00648, "grad_norm": 1.318217158317566, "kl": 0.4571525566279888, "learning_rate": 9.999846401460027e-06, "loss": -0.0185, "step": 324, "step_time": 6.599780938004187 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.005208333372138441, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666744276881, "completions/clipped_ratio": 0.0, "completions/max_length": 1973.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 1880.78125, "completions/mean_terminated_length": 1880.78125, "completions/min_length": 1628.0, "completions/min_terminated_length": 1628.0, "entropy": 0.1915543656796217, "epoch": 0.0065, "frac_reward_zero_std": 0.0, "grad_norm": 1.4827704429626465, "kl": 0.4624154604971409, "learning_rate": 9.999845332959009e-06, "loss": 0.1599, "num_tokens": 11991754.0, "reward": 7.904694557189941, "reward_std": 11.988627433776855, "rewards/rollout_reward_func/mean": 7.904694557189941, "rewards/rollout_reward_func/std": 14.8968505859375, "sampling/importance_sampling_ratio/max": 2.7791874408721924, "sampling/importance_sampling_ratio/mean": 1.0366264581680298, "sampling/importance_sampling_ratio/min": 0.18443353474140167, "sampling/sampling_logp_difference/max": 1.049269437789917, "sampling/sampling_logp_difference/mean": 0.03614144027233124, "step": 325, "step_time": 38.895849883001574 }, { "clip_ratio/high_max": 0.024305555736646056, "clip_ratio/high_mean": 0.013888889108784497, "clip_ratio/low_mean": 0.015625000232830644, "clip_ratio/low_min": 0.0069444444961845875, "clip_ratio/region_mean": 0.029513889458030462, "entropy": 0.1898924522101879, "epoch": 0.00652, "grad_norm": 1.1585363149642944, "kl": 0.4361311122775078, "learning_rate": 9.999844260754452e-06, "loss": 0.1548, "step": 326, "step_time": 6.17809536400091 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 1976.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 1838.875, "completions/mean_terminated_length": 1838.875, "completions/min_length": 1662.0, "completions/min_terminated_length": 1662.0, "entropy": 0.15969707537442446, "epoch": 0.00654, "frac_reward_zero_std": 0.0, "grad_norm": 0.8774885535240173, "kl": 0.4521552287042141, "learning_rate": 9.999843184846355e-06, "loss": -0.253, "num_tokens": 12071372.0, "reward": 5.943417549133301, "reward_std": 6.620966911315918, "rewards/rollout_reward_func/mean": 5.943417549133301, "rewards/rollout_reward_func/std": 13.051533699035645, "sampling/importance_sampling_ratio/max": 2.3298087120056152, "sampling/importance_sampling_ratio/mean": 1.028981328010559, "sampling/importance_sampling_ratio/min": 5.5275731908333015e-12, "sampling/sampling_logp_difference/max": 24.37394905090332, "sampling/sampling_logp_difference/mean": 0.08017994463443756, "step": 327, "step_time": 36.11553010699936 }, { "clip_ratio/high_max": 0.010438166558742523, "clip_ratio/high_mean": 0.005219083279371262, "clip_ratio/low_mean": 0.005219083279371262, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010438166558742523, "entropy": 0.1542342221364379, "epoch": 0.00656, "grad_norm": 0.9030295014381409, "kl": 0.4645962119102478, "learning_rate": 9.999842105234718e-06, "loss": -0.2544, "step": 328, "step_time": 6.171020930996747 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1962.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1846.5, "completions/mean_terminated_length": 1846.5, "completions/min_length": 1606.0, "completions/min_terminated_length": 1606.0, "entropy": 0.1713305152952671, "epoch": 0.00658, "frac_reward_zero_std": 0.0, "grad_norm": 1.3594589233398438, "kl": 0.41948344372212887, "learning_rate": 9.999841021919543e-06, "loss": -0.1754, "num_tokens": 12150665.0, "reward": 6.950708866119385, "reward_std": 8.736629486083984, "rewards/rollout_reward_func/mean": 6.950708866119385, "rewards/rollout_reward_func/std": 12.616031646728516, "sampling/importance_sampling_ratio/max": 2.0336740016937256, "sampling/importance_sampling_ratio/mean": 1.0751011371612549, "sampling/importance_sampling_ratio/min": 8.473521770314615e-14, "sampling/sampling_logp_difference/max": 30.0931339263916, "sampling/sampling_logp_difference/mean": 0.08403386175632477, "step": 329, "step_time": 37.165045843999906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010325292591005564, "clip_ratio/low_min": 0.006761695956811309, "clip_ratio/region_mean": 0.010325292591005564, "entropy": 0.16616453044116497, "epoch": 0.0066, "grad_norm": 1.1647483110427856, "kl": 0.4359829295426607, "learning_rate": 9.999839934900832e-06, "loss": -0.179, "step": 330, "step_time": 6.586650393001037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1966.0, "completions/max_terminated_length": 1966.0, "completions/mean_length": 1874.34375, "completions/mean_terminated_length": 1874.34375, "completions/min_length": 1775.0, "completions/min_terminated_length": 1775.0, "entropy": 0.1693086363375187, "epoch": 0.00662, "frac_reward_zero_std": 0.0, "grad_norm": 1.4499868154525757, "kl": 0.6940734013915062, "learning_rate": 9.999838844178584e-06, "loss": -0.1581, "num_tokens": 12231256.0, "reward": 9.051456451416016, "reward_std": 5.1797261238098145, "rewards/rollout_reward_func/mean": 9.051456451416016, "rewards/rollout_reward_func/std": 9.840641975402832, "sampling/importance_sampling_ratio/max": 1.9182367324829102, "sampling/importance_sampling_ratio/mean": 0.8677526116371155, "sampling/importance_sampling_ratio/min": 0.21601787209510803, "sampling/sampling_logp_difference/max": 1.2895514965057373, "sampling/sampling_logp_difference/mean": 0.04901476204395294, "step": 331, "step_time": 40.41131205700003 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.01215277798473835, "clip_ratio/low_min": 0.0069444444961845875, "clip_ratio/region_mean": 0.013888889108784497, "entropy": 0.1622045375406742, "epoch": 0.00664, "grad_norm": 1.332248330116272, "kl": 0.7565370872616768, "learning_rate": 9.999837749752804e-06, "loss": -0.16, "step": 332, "step_time": 6.1724167310003395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2012.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1871.0625, "completions/mean_terminated_length": 1871.0625, "completions/min_length": 1509.0, "completions/min_terminated_length": 1509.0, "entropy": 0.15144985355436802, "epoch": 0.00666, "frac_reward_zero_std": 0.125, "grad_norm": 1.2533303499221802, "kl": 0.31190009601414204, "learning_rate": 9.999836651623489e-06, "loss": 0.0864, "num_tokens": 12311600.0, "reward": 2.6044352054595947, "reward_std": 5.959412574768066, "rewards/rollout_reward_func/mean": 2.6044352054595947, "rewards/rollout_reward_func/std": 14.666269302368164, "sampling/importance_sampling_ratio/max": 1.9139474630355835, "sampling/importance_sampling_ratio/mean": 0.953168511390686, "sampling/importance_sampling_ratio/min": 1.1792734045952113e-14, "sampling/sampling_logp_difference/max": 17.824750900268555, "sampling/sampling_logp_difference/mean": 0.07734020054340363, "step": 333, "step_time": 39.06632298299701 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0029861110961064696, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006458333344198763, "entropy": 0.1482429150491953, "epoch": 0.00668, "grad_norm": 1.049380898475647, "kl": 0.31447366066277027, "learning_rate": 9.99983554979064e-06, "loss": 0.0833, "step": 334, "step_time": 6.242611449002652 }, { "clip_ratio/high_max": 0.01736111124046147, "clip_ratio/high_mean": 0.008680555620230734, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666744276881, "completions/clipped_ratio": 0.0, "completions/max_length": 2005.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1884.21875, "completions/mean_terminated_length": 1884.21875, "completions/min_length": 1752.0, "completions/min_terminated_length": 1752.0, "entropy": 0.1837616004049778, "epoch": 0.0067, "frac_reward_zero_std": 0.0, "grad_norm": 1.5765018463134766, "kl": 0.6941291987895966, "learning_rate": 9.999834444254261e-06, "loss": -0.0101, "num_tokens": 12392452.0, "reward": 2.9032256603240967, "reward_std": 8.3020601272583, "rewards/rollout_reward_func/mean": 2.9032256603240967, "rewards/rollout_reward_func/std": 10.438464164733887, "sampling/importance_sampling_ratio/max": 2.3026082515716553, "sampling/importance_sampling_ratio/mean": 0.8688405752182007, "sampling/importance_sampling_ratio/min": 0.06608447432518005, "sampling/sampling_logp_difference/max": 1.0330348014831543, "sampling/sampling_logp_difference/mean": 0.04626619443297386, "step": 335, "step_time": 38.017552094001076 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008680555620230734, "entropy": 0.1806764118373394, "epoch": 0.00672, "grad_norm": 1.4784622192382812, "kl": 0.6689932979643345, "learning_rate": 9.999833335014352e-06, "loss": -0.0126, "step": 336, "step_time": 6.23428071699891 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 1985.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 1826.21875, "completions/mean_terminated_length": 1826.21875, "completions/min_length": 1612.0, "completions/min_terminated_length": 1612.0, "entropy": 0.16721994522958994, "epoch": 0.00674, "frac_reward_zero_std": 0.0, "grad_norm": 1.0283710956573486, "kl": 0.5365333259105682, "learning_rate": 9.999832222070915e-06, "loss": -0.0433, "num_tokens": 12471329.0, "reward": 5.5746378898620605, "reward_std": 9.998064041137695, "rewards/rollout_reward_func/mean": 5.5746378898620605, "rewards/rollout_reward_func/std": 12.20602035522461, "sampling/importance_sampling_ratio/max": 1.3266735076904297, "sampling/importance_sampling_ratio/mean": 0.7684075832366943, "sampling/importance_sampling_ratio/min": 2.1960254242760603e-20, "sampling/sampling_logp_difference/max": 29.37782096862793, "sampling/sampling_logp_difference/mean": 0.16268780827522278, "step": 337, "step_time": 37.611019209001824 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888889108784497, "entropy": 0.16553544532507658, "epoch": 0.00676, "grad_norm": 1.0066049098968506, "kl": 0.5513498038053513, "learning_rate": 9.999831105423947e-06, "loss": -0.0434, "step": 338, "step_time": 6.205691562003267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1959.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 1858.40625, "completions/mean_terminated_length": 1858.40625, "completions/min_length": 1771.0, "completions/min_terminated_length": 1771.0, "entropy": 0.13813489489257336, "epoch": 0.00678, "frac_reward_zero_std": 0.125, "grad_norm": 1.2954949140548706, "kl": 0.46643203496932983, "learning_rate": 9.999829985073454e-06, "loss": -0.0691, "num_tokens": 12551129.0, "reward": 4.567227363586426, "reward_std": 4.1049957275390625, "rewards/rollout_reward_func/mean": 4.567227363586426, "rewards/rollout_reward_func/std": 9.307647705078125, "sampling/importance_sampling_ratio/max": 2.0119011402130127, "sampling/importance_sampling_ratio/mean": 1.006131887435913, "sampling/importance_sampling_ratio/min": 0.2738450765609741, "sampling/sampling_logp_difference/max": 1.310835838317871, "sampling/sampling_logp_difference/mean": 0.03615020588040352, "step": 339, "step_time": 38.483633726000335 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.13802602514624596, "epoch": 0.0068, "grad_norm": 1.058447003364563, "kl": 0.43485408648848534, "learning_rate": 9.999828861019437e-06, "loss": -0.0715, "step": 340, "step_time": 6.148799068003427 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "completions/clipped_ratio": 0.0, "completions/max_length": 1973.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 1823.65625, "completions/mean_terminated_length": 1823.65625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 0.14465732499957085, "epoch": 0.00682, "frac_reward_zero_std": 0.0, "grad_norm": 1.9615039825439453, "kl": 1.5087429098784924, "learning_rate": 9.999827733261892e-06, "loss": 0.0479, "num_tokens": 12629652.0, "reward": 2.9783196449279785, "reward_std": 6.66061544418335, "rewards/rollout_reward_func/mean": 2.9783196449279785, "rewards/rollout_reward_func/std": 10.88323974609375, "sampling/importance_sampling_ratio/max": 1.377437949180603, "sampling/importance_sampling_ratio/mean": 0.8548128604888916, "sampling/importance_sampling_ratio/min": 0.24347716569900513, "sampling/sampling_logp_difference/max": 1.1083741188049316, "sampling/sampling_logp_difference/mean": 0.030560657382011414, "step": 341, "step_time": 38.09668472299927 }, { "clip_ratio/high_max": 0.01609848509542644, "clip_ratio/high_mean": 0.011521464679390192, "clip_ratio/low_mean": 0.009785353671759367, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.02130681835114956, "entropy": 0.14760607294738293, "epoch": 0.00684, "grad_norm": 1.4034626483917236, "kl": 1.1786439195275307, "learning_rate": 9.999826601800824e-06, "loss": 0.0401, "step": 342, "step_time": 6.157314127001882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1998.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1769.34375, "completions/mean_terminated_length": 1769.34375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 0.15295273158699274, "epoch": 0.00686, "frac_reward_zero_std": 0.125, "grad_norm": 1.4741613864898682, "kl": 0.3553418479859829, "learning_rate": 9.999825466636233e-06, "loss": -0.0553, "num_tokens": 12707171.0, "reward": 10.171146392822266, "reward_std": 4.205057144165039, "rewards/rollout_reward_func/mean": 10.171146392822266, "rewards/rollout_reward_func/std": 19.68203353881836, "sampling/importance_sampling_ratio/max": 1.8806638717651367, "sampling/importance_sampling_ratio/mean": 1.025322437286377, "sampling/importance_sampling_ratio/min": 0.20550324022769928, "sampling/sampling_logp_difference/max": 0.8229107856750488, "sampling/sampling_logp_difference/mean": 0.025263220071792603, "step": 343, "step_time": 37.784647938999115 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666744276881, "entropy": 0.156339006498456, "epoch": 0.00688, "grad_norm": 1.2938135862350464, "kl": 0.3518723715096712, "learning_rate": 9.999824327768121e-06, "loss": -0.0577, "step": 344, "step_time": 6.225835901999744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0012499999720603228, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012499999720603228, "completions/clipped_ratio": 0.0, "completions/max_length": 2030.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1841.6875, "completions/mean_terminated_length": 1841.6875, "completions/min_length": 1579.0, "completions/min_terminated_length": 1579.0, "entropy": 0.15647383406758308, "epoch": 0.0069, "frac_reward_zero_std": 0.0, "grad_norm": 1.5851750373840332, "kl": 0.4456126391887665, "learning_rate": 9.99982318519649e-06, "loss": 0.0497, "num_tokens": 12786883.0, "reward": 4.234543323516846, "reward_std": 7.14749813079834, "rewards/rollout_reward_func/mean": 4.234543323516846, "rewards/rollout_reward_func/std": 11.683938980102539, "sampling/importance_sampling_ratio/max": 2.441897392272949, "sampling/importance_sampling_ratio/mean": 0.9281522035598755, "sampling/importance_sampling_ratio/min": 5.669828341647121e-19, "sampling/sampling_logp_difference/max": 24.185270309448242, "sampling/sampling_logp_difference/mean": 0.1490144282579422, "step": 345, "step_time": 38.19426675700015 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.005208333372138441, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012152777868323028, "entropy": 0.1573875080794096, "epoch": 0.00692, "grad_norm": 1.2656595706939697, "kl": 0.42021266743540764, "learning_rate": 9.999822038921339e-06, "loss": 0.0471, "step": 346, "step_time": 6.839767702995232 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "completions/clipped_ratio": 0.0, "completions/max_length": 2032.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1904.75, "completions/mean_terminated_length": 1904.75, "completions/min_length": 1528.0, "completions/min_terminated_length": 1528.0, "entropy": 0.14398007839918137, "epoch": 0.00694, "frac_reward_zero_std": 0.0, "grad_norm": 1.1126422882080078, "kl": 0.39460230618715286, "learning_rate": 9.99982088894267e-06, "loss": -0.0541, "num_tokens": 12868503.0, "reward": 4.4482550621032715, "reward_std": 6.246906757354736, "rewards/rollout_reward_func/mean": 4.4482550621032715, "rewards/rollout_reward_func/std": 11.603610038757324, "sampling/importance_sampling_ratio/max": 1.5690655708312988, "sampling/importance_sampling_ratio/mean": 0.9412245750427246, "sampling/importance_sampling_ratio/min": 0.37497684359550476, "sampling/sampling_logp_difference/max": 1.0045862197875977, "sampling/sampling_logp_difference/mean": 0.0292842797935009, "step": 347, "step_time": 36.1456481869991 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.14603436551988125, "epoch": 0.00696, "grad_norm": 1.0782148838043213, "kl": 0.4032270349562168, "learning_rate": 9.999819735260483e-06, "loss": -0.0577, "step": 348, "step_time": 6.739517917001649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 2039.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1886.75, "completions/mean_terminated_length": 1886.75, "completions/min_length": 1634.0, "completions/min_terminated_length": 1634.0, "entropy": 0.1570840571075678, "epoch": 0.00698, "frac_reward_zero_std": 0.25, "grad_norm": 2.835125684738159, "kl": 1.7887609116733074, "learning_rate": 9.999818577874782e-06, "loss": -0.1581, "num_tokens": 12949689.0, "reward": 9.66476821899414, "reward_std": 5.8628249168396, "rewards/rollout_reward_func/mean": 9.66476821899414, "rewards/rollout_reward_func/std": 10.391725540161133, "sampling/importance_sampling_ratio/max": 1.674692153930664, "sampling/importance_sampling_ratio/mean": 0.9785152673721313, "sampling/importance_sampling_ratio/min": 0.15405651926994324, "sampling/sampling_logp_difference/max": 1.66461181640625, "sampling/sampling_logp_difference/mean": 0.033790212124586105, "step": 349, "step_time": 38.58619033600189 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "entropy": 0.15997820533812046, "epoch": 0.007, "grad_norm": 2.284522771835327, "kl": 1.5180134773254395, "learning_rate": 9.999817416785565e-06, "loss": -0.1585, "step": 350, "step_time": 6.308404929000972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1867.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 1791.1875, "completions/mean_terminated_length": 1791.1875, "completions/min_length": 1090.0, "completions/min_terminated_length": 1090.0, "entropy": 0.17174389213323593, "epoch": 0.00702, "frac_reward_zero_std": 0.0, "grad_norm": 1.4891026020050049, "kl": 1.113628275692463, "learning_rate": 9.999816251992836e-06, "loss": -0.0978, "num_tokens": 13027385.0, "reward": 2.3329522609710693, "reward_std": 7.594513416290283, "rewards/rollout_reward_func/mean": 2.3329522609710693, "rewards/rollout_reward_func/std": 12.151657104492188, "sampling/importance_sampling_ratio/max": 1.5326077938079834, "sampling/importance_sampling_ratio/mean": 0.874591052532196, "sampling/importance_sampling_ratio/min": 0.13500602543354034, "sampling/sampling_logp_difference/max": 1.4848179817199707, "sampling/sampling_logp_difference/mean": 0.03452059626579285, "step": 351, "step_time": 38.5899223109991 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.17624721303582191, "epoch": 0.00704, "grad_norm": 0.9316499829292297, "kl": 0.7885981798171997, "learning_rate": 9.999815083496593e-06, "loss": -0.1029, "step": 352, "step_time": 6.3719571820001875 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 1968.0, "completions/max_terminated_length": 1968.0, "completions/mean_length": 1887.71875, "completions/mean_terminated_length": 1887.71875, "completions/min_length": 1804.0, "completions/min_terminated_length": 1804.0, "entropy": 0.18038272112607956, "epoch": 0.00706, "frac_reward_zero_std": 0.0, "grad_norm": 1.4058349132537842, "kl": 0.5268550999462605, "learning_rate": 9.99981391129684e-06, "loss": -0.1831, "num_tokens": 13108436.0, "reward": 7.138474941253662, "reward_std": 9.193456649780273, "rewards/rollout_reward_func/mean": 7.138474941253662, "rewards/rollout_reward_func/std": 11.277852058410645, "sampling/importance_sampling_ratio/max": 2.048379898071289, "sampling/importance_sampling_ratio/mean": 0.9898632168769836, "sampling/importance_sampling_ratio/min": 0.31908681988716125, "sampling/sampling_logp_difference/max": 1.2619354724884033, "sampling/sampling_logp_difference/mean": 0.04246928542852402, "step": 353, "step_time": 36.779780131000734 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.005208333372138441, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.010416666744276881, "entropy": 0.18332521431148052, "epoch": 0.00708, "grad_norm": 1.734959363937378, "kl": 0.6400851979851723, "learning_rate": 9.999812735393578e-06, "loss": -0.1861, "step": 354, "step_time": 6.657307188001141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2001.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1855.8125, "completions/mean_terminated_length": 1855.8125, "completions/min_length": 1417.0, "completions/min_terminated_length": 1417.0, "entropy": 0.15774557553231716, "epoch": 0.0071, "frac_reward_zero_std": 0.0, "grad_norm": 1.1801131963729858, "kl": 0.28479940071702003, "learning_rate": 9.999811555786805e-06, "loss": -0.0644, "num_tokens": 13188201.0, "reward": 6.061254501342773, "reward_std": 11.283492088317871, "rewards/rollout_reward_func/mean": 6.061254501342773, "rewards/rollout_reward_func/std": 20.36028480529785, "sampling/importance_sampling_ratio/max": 1.5566022396087646, "sampling/importance_sampling_ratio/mean": 0.9729775786399841, "sampling/importance_sampling_ratio/min": 0.22793923318386078, "sampling/sampling_logp_difference/max": 0.8676626682281494, "sampling/sampling_logp_difference/mean": 0.031204868108034134, "step": 355, "step_time": 35.57014237199837 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "entropy": 0.15768986009061337, "epoch": 0.00712, "grad_norm": 1.0285789966583252, "kl": 0.2967808712273836, "learning_rate": 9.999810372476526e-06, "loss": -0.0677, "step": 356, "step_time": 6.207597920001717 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 2018.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1912.25, "completions/mean_terminated_length": 1912.25, "completions/min_length": 1727.0, "completions/min_terminated_length": 1727.0, "entropy": 0.15802463050931692, "epoch": 0.00714, "frac_reward_zero_std": 0.125, "grad_norm": 1.4909172058105469, "kl": 0.6946889795362949, "learning_rate": 9.99980918546274e-06, "loss": -0.064, "num_tokens": 13269840.0, "reward": 8.720025062561035, "reward_std": 8.605905532836914, "rewards/rollout_reward_func/mean": 8.720025062561035, "rewards/rollout_reward_func/std": 13.064443588256836, "sampling/importance_sampling_ratio/max": 2.113987922668457, "sampling/importance_sampling_ratio/mean": 0.9241708517074585, "sampling/importance_sampling_ratio/min": 0.17220593988895416, "sampling/sampling_logp_difference/max": 1.0862641334533691, "sampling/sampling_logp_difference/mean": 0.03762374818325043, "step": 357, "step_time": 39.26908052300132 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008680555620230734, "entropy": 0.15692736767232418, "epoch": 0.00716, "grad_norm": 1.3770283460617065, "kl": 0.682164192199707, "learning_rate": 9.999807994745449e-06, "loss": -0.0657, "step": 358, "step_time": 6.277640965996397 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 1936.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 1804.21875, "completions/mean_terminated_length": 1804.21875, "completions/min_length": 1351.0, "completions/min_terminated_length": 1351.0, "entropy": 0.15549181029200554, "epoch": 0.00718, "frac_reward_zero_std": 0.125, "grad_norm": 0.9340437650680542, "kl": 0.30343155562877655, "learning_rate": 9.999806800324652e-06, "loss": -0.0304, "num_tokens": 13347887.0, "reward": -1.006221055984497, "reward_std": 8.046869277954102, "rewards/rollout_reward_func/mean": -1.006221055984497, "rewards/rollout_reward_func/std": 12.83417797088623, "sampling/importance_sampling_ratio/max": 1.6891496181488037, "sampling/importance_sampling_ratio/mean": 0.9606433510780334, "sampling/importance_sampling_ratio/min": 3.726897696704201e-12, "sampling/sampling_logp_difference/max": 25.532629013061523, "sampling/sampling_logp_difference/mean": 0.06780634075403214, "step": 359, "step_time": 37.24431940499744 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.007161458255723119, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.010633680503815413, "entropy": 0.15154966711997986, "epoch": 0.0072, "grad_norm": 0.7721959948539734, "kl": 0.30717028118669987, "learning_rate": 9.999805602200355e-06, "loss": -0.0332, "step": 360, "step_time": 6.086639444998582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1945.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 1784.75, "completions/mean_terminated_length": 1784.75, "completions/min_length": 897.0, "completions/min_terminated_length": 897.0, "entropy": 0.1756394449621439, "epoch": 0.00722, "frac_reward_zero_std": 0.0, "grad_norm": 1.9486719369888306, "kl": 0.5046870671212673, "learning_rate": 9.999804400372553e-06, "loss": 0.0591, "num_tokens": 13425212.0, "reward": 1.9548215866088867, "reward_std": 7.599405765533447, "rewards/rollout_reward_func/mean": 1.9548215866088867, "rewards/rollout_reward_func/std": 11.06623363494873, "sampling/importance_sampling_ratio/max": 2.453582525253296, "sampling/importance_sampling_ratio/mean": 1.1315981149673462, "sampling/importance_sampling_ratio/min": 0.2927221357822418, "sampling/sampling_logp_difference/max": 0.9818147420883179, "sampling/sampling_logp_difference/mean": 0.02989993989467621, "step": 361, "step_time": 36.09027150499969 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.006944444612599909, "clip_ratio/low_mean": 0.005208333372138441, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01215277798473835, "entropy": 0.17177880927920341, "epoch": 0.00724, "grad_norm": 1.6040363311767578, "kl": 0.5513662751764059, "learning_rate": 9.999803194841253e-06, "loss": 0.0551, "step": 362, "step_time": 6.112345547999212 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "completions/clipped_ratio": 0.0, "completions/max_length": 2001.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1889.8125, "completions/mean_terminated_length": 1889.8125, "completions/min_length": 1789.0, "completions/min_terminated_length": 1789.0, "entropy": 0.1901466716080904, "epoch": 0.00726, "frac_reward_zero_std": 0.0, "grad_norm": 2.559001922607422, "kl": 1.604296986013651, "learning_rate": 9.999801985606451e-06, "loss": -0.0359, "num_tokens": 13506437.0, "reward": 12.421402931213379, "reward_std": 7.902709484100342, "rewards/rollout_reward_func/mean": 12.421402931213379, "rewards/rollout_reward_func/std": 11.493805885314941, "sampling/importance_sampling_ratio/max": 2.2942121028900146, "sampling/importance_sampling_ratio/mean": 0.9992972612380981, "sampling/importance_sampling_ratio/min": 0.1637697070837021, "sampling/sampling_logp_difference/max": 1.1397209167480469, "sampling/sampling_logp_difference/mean": 0.038260094821453094, "step": 363, "step_time": 38.05201607199888 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.18919656239449978, "epoch": 0.00728, "grad_norm": 2.6743459701538086, "kl": 1.4820591136813164, "learning_rate": 9.999800772668154e-06, "loss": -0.038, "step": 364, "step_time": 6.24283971099976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1969.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 1859.03125, "completions/mean_terminated_length": 1859.03125, "completions/min_length": 1485.0, "completions/min_terminated_length": 1485.0, "entropy": 0.16470451280474663, "epoch": 0.0073, "frac_reward_zero_std": 0.0, "grad_norm": 1.2065528631210327, "kl": 0.41300537437200546, "learning_rate": 9.999799556026358e-06, "loss": -0.0763, "num_tokens": 13586301.0, "reward": 11.647848129272461, "reward_std": 5.665955543518066, "rewards/rollout_reward_func/mean": 11.647848129272461, "rewards/rollout_reward_func/std": 16.600069046020508, "sampling/importance_sampling_ratio/max": 1.5297025442123413, "sampling/importance_sampling_ratio/mean": 1.0060566663742065, "sampling/importance_sampling_ratio/min": 0.3258414566516876, "sampling/sampling_logp_difference/max": 0.5945889353752136, "sampling/sampling_logp_difference/mean": 0.02704280987381935, "step": 365, "step_time": 38.81224459800069 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.005208333372138441, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666744276881, "entropy": 0.16255084611475468, "epoch": 0.00732, "grad_norm": 0.9401413202285767, "kl": 0.4195688497275114, "learning_rate": 9.999798335681066e-06, "loss": -0.0813, "step": 366, "step_time": 6.165453665998939 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 1964.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 1833.46875, "completions/mean_terminated_length": 1833.46875, "completions/min_length": 1624.0, "completions/min_terminated_length": 1624.0, "entropy": 0.17754663713276386, "epoch": 0.00734, "frac_reward_zero_std": 0.0, "grad_norm": 1.9974569082260132, "kl": 1.130824089050293, "learning_rate": 9.99979711163228e-06, "loss": -0.0937, "num_tokens": 13665217.0, "reward": 5.195849418640137, "reward_std": 10.393113136291504, "rewards/rollout_reward_func/mean": 5.195849418640137, "rewards/rollout_reward_func/std": 12.448369026184082, "sampling/importance_sampling_ratio/max": 1.792784571647644, "sampling/importance_sampling_ratio/mean": 0.8298041820526123, "sampling/importance_sampling_ratio/min": 4.1797170108673343e-13, "sampling/sampling_logp_difference/max": 27.452041625976562, "sampling/sampling_logp_difference/mean": 0.1302998661994934, "step": 367, "step_time": 36.412708622001446 }, { "clip_ratio/high_max": 0.010233918204903603, "clip_ratio/high_mean": 0.005116959102451801, "clip_ratio/low_mean": 0.0016891892300918698, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006806148332543671, "entropy": 0.17670008912682533, "epoch": 0.00736, "grad_norm": 1.4283668994903564, "kl": 0.8111900072544813, "learning_rate": 9.999795883880002e-06, "loss": -0.0963, "step": 368, "step_time": 6.141397028004576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 2000.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1903.15625, "completions/mean_terminated_length": 1903.15625, "completions/min_length": 1771.0, "completions/min_terminated_length": 1771.0, "entropy": 0.14921395294368267, "epoch": 0.00738, "frac_reward_zero_std": 0.125, "grad_norm": 1.2093360424041748, "kl": 0.44058969616889954, "learning_rate": 9.999794652424228e-06, "loss": 0.0713, "num_tokens": 13746632.0, "reward": 11.435522079467773, "reward_std": 3.840212345123291, "rewards/rollout_reward_func/mean": 11.435522079467773, "rewards/rollout_reward_func/std": 13.30839729309082, "sampling/importance_sampling_ratio/max": 2.1941890716552734, "sampling/importance_sampling_ratio/mean": 0.9962909817695618, "sampling/importance_sampling_ratio/min": 0.30016008019447327, "sampling/sampling_logp_difference/max": 0.8217124938964844, "sampling/sampling_logp_difference/mean": 0.029038339853286743, "step": 369, "step_time": 37.36318951800058 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.01215277798473835, "entropy": 0.14946964103728533, "epoch": 0.0074, "grad_norm": 1.5157506465911865, "kl": 0.4827451854944229, "learning_rate": 9.999793417264967e-06, "loss": 0.0668, "step": 370, "step_time": 6.7108045629993285 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "completions/clipped_ratio": 0.0, "completions/max_length": 2005.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1862.59375, "completions/mean_terminated_length": 1862.59375, "completions/min_length": 1764.0, "completions/min_terminated_length": 1764.0, "entropy": 0.1261264430359006, "epoch": 0.00742, "frac_reward_zero_std": 0.0, "grad_norm": 1.1795105934143066, "kl": 0.7585394252091646, "learning_rate": 9.999792178402215e-06, "loss": -0.0116, "num_tokens": 13827147.0, "reward": 7.017940521240234, "reward_std": 5.937340259552002, "rewards/rollout_reward_func/mean": 7.017940521240234, "rewards/rollout_reward_func/std": 14.372506141662598, "sampling/importance_sampling_ratio/max": 1.6552984714508057, "sampling/importance_sampling_ratio/mean": 0.9012160301208496, "sampling/importance_sampling_ratio/min": 0.2452741116285324, "sampling/sampling_logp_difference/max": 1.3038992881774902, "sampling/sampling_logp_difference/mean": 0.02553473599255085, "step": 371, "step_time": 37.5200551709986 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.12507631909102201, "epoch": 0.00744, "grad_norm": 1.228213906288147, "kl": 0.8153098970651627, "learning_rate": 9.999790935835974e-06, "loss": -0.0135, "step": 372, "step_time": 6.234621702000368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1977.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 1880.25, "completions/mean_terminated_length": 1880.25, "completions/min_length": 1754.0, "completions/min_terminated_length": 1754.0, "entropy": 0.11569633707404137, "epoch": 0.00746, "frac_reward_zero_std": 0.125, "grad_norm": 1.3457225561141968, "kl": 0.887390311807394, "learning_rate": 9.999789689566245e-06, "loss": -0.0406, "num_tokens": 13907765.0, "reward": 12.74990463256836, "reward_std": 5.970038414001465, "rewards/rollout_reward_func/mean": 12.74990463256836, "rewards/rollout_reward_func/std": 13.113883018493652, "sampling/importance_sampling_ratio/max": 1.5790919065475464, "sampling/importance_sampling_ratio/mean": 0.9145287275314331, "sampling/importance_sampling_ratio/min": 2.7017030648258944e-13, "sampling/sampling_logp_difference/max": 28.88240623474121, "sampling/sampling_logp_difference/mean": 0.07740553468465805, "step": 373, "step_time": 37.90379121299884 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.11660288367420435, "epoch": 0.00748, "grad_norm": 1.325151801109314, "kl": 0.7547343485057354, "learning_rate": 9.999788439593031e-06, "loss": -0.0431, "step": 374, "step_time": 6.616651901998921 }, { "clip_ratio/high_max": 0.011111111380159855, "clip_ratio/high_mean": 0.0055555556900799274, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007291666814126074, "completions/clipped_ratio": 0.0, "completions/max_length": 2020.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1847.59375, "completions/mean_terminated_length": 1847.59375, "completions/min_length": 1001.0, "completions/min_terminated_length": 1001.0, "entropy": 0.17600342631340027, "epoch": 0.0075, "frac_reward_zero_std": 0.0, "grad_norm": 1.814244031906128, "kl": 0.8092060312628746, "learning_rate": 9.999787185916332e-06, "loss": 0.0845, "num_tokens": 13987733.0, "reward": 5.100118160247803, "reward_std": 9.183606147766113, "rewards/rollout_reward_func/mean": 5.100118160247803, "rewards/rollout_reward_func/std": 14.02592658996582, "sampling/importance_sampling_ratio/max": 1.662461519241333, "sampling/importance_sampling_ratio/mean": 0.9842205047607422, "sampling/importance_sampling_ratio/min": 0.22752133011817932, "sampling/sampling_logp_difference/max": 0.6112067699432373, "sampling/sampling_logp_difference/mean": 0.027879422530531883, "step": 375, "step_time": 36.84108008799922 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666744276881, "entropy": 0.17556224018335342, "epoch": 0.00752, "grad_norm": 1.3789916038513184, "kl": 0.7915375307202339, "learning_rate": 9.999785928536149e-06, "loss": 0.0834, "step": 376, "step_time": 6.73390211300466 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 2020.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1849.84375, "completions/mean_terminated_length": 1849.84375, "completions/min_length": 1106.0, "completions/min_terminated_length": 1106.0, "entropy": 0.1529197357594967, "epoch": 0.00754, "frac_reward_zero_std": 0.0, "grad_norm": 1.711385726928711, "kl": 0.8515200167894363, "learning_rate": 9.999784667452484e-06, "loss": -0.1104, "num_tokens": 14067130.0, "reward": 4.861234664916992, "reward_std": 7.06196403503418, "rewards/rollout_reward_func/mean": 4.861234664916992, "rewards/rollout_reward_func/std": 13.409940719604492, "sampling/importance_sampling_ratio/max": 1.5862915515899658, "sampling/importance_sampling_ratio/mean": 0.9584461450576782, "sampling/importance_sampling_ratio/min": 0.14316152036190033, "sampling/sampling_logp_difference/max": 1.0026142597198486, "sampling/sampling_logp_difference/mean": 0.03037048876285553, "step": 377, "step_time": 36.54281629899742 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.15627212449908257, "epoch": 0.00756, "grad_norm": 1.1648268699645996, "kl": 0.8427795469760895, "learning_rate": 9.999783402665337e-06, "loss": -0.1098, "step": 378, "step_time": 6.265584359001878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.03125, "completions/max_length": 1941.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 1823.875, "completions/mean_terminated_length": 1823.1290283203125, "completions/min_length": 1391.0, "completions/min_terminated_length": 1391.0, "entropy": 0.14846901781857014, "epoch": 0.00758, "frac_reward_zero_std": 0.0, "grad_norm": 1.3327255249023438, "kl": 0.391092361882329, "learning_rate": 9.999782134174711e-06, "loss": 0.0327, "num_tokens": 14146165.0, "reward": -0.792694091796875, "reward_std": 8.190790176391602, "rewards/rollout_reward_func/mean": -0.792694091796875, "rewards/rollout_reward_func/std": 14.544111251831055, "sampling/importance_sampling_ratio/max": 1.8940695524215698, "sampling/importance_sampling_ratio/mean": 0.9296935200691223, "sampling/importance_sampling_ratio/min": 0.392696738243103, "sampling/sampling_logp_difference/max": 0.906486988067627, "sampling/sampling_logp_difference/mean": 0.025111418217420578, "step": 379, "step_time": 35.90425824399972 }, { "clip_ratio/high_max": 0.007378472248092294, "clip_ratio/high_mean": 0.005425347131676972, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007161458255723119, "entropy": 0.1500188522040844, "epoch": 0.0076, "grad_norm": 1.1629754304885864, "kl": 0.3764154892414808, "learning_rate": 9.999780861980606e-06, "loss": 0.0307, "step": 380, "step_time": 6.591761033001603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0012499999720603228, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012499999720603228, "completions/clipped_ratio": 0.0, "completions/max_length": 1956.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 1859.5, "completions/mean_terminated_length": 1859.5, "completions/min_length": 1667.0, "completions/min_terminated_length": 1667.0, "entropy": 0.16048664320260286, "epoch": 0.00762, "frac_reward_zero_std": 0.125, "grad_norm": 1.7579671144485474, "kl": 0.33646042086184025, "learning_rate": 9.999779586083026e-06, "loss": -0.198, "num_tokens": 14226076.0, "reward": 5.697696685791016, "reward_std": 4.209428787231445, "rewards/rollout_reward_func/mean": 5.697696685791016, "rewards/rollout_reward_func/std": 13.195571899414062, "sampling/importance_sampling_ratio/max": 1.7836365699768066, "sampling/importance_sampling_ratio/mean": 1.0036367177963257, "sampling/importance_sampling_ratio/min": 1.983860382622171e-20, "sampling/sampling_logp_difference/max": 24.610870361328125, "sampling/sampling_logp_difference/mean": 0.09851048141717911, "step": 381, "step_time": 37.674988107999525 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0012499999720603228, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0047222222201526165, "entropy": 0.16013844590634108, "epoch": 0.00764, "grad_norm": 1.1893305778503418, "kl": 0.32176281698048115, "learning_rate": 9.999778306481967e-06, "loss": -0.2009, "step": 382, "step_time": 6.6418502909982635 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 2022.0, "completions/max_terminated_length": 2022.0, "completions/mean_length": 1888.46875, "completions/mean_terminated_length": 1888.46875, "completions/min_length": 1740.0, "completions/min_terminated_length": 1740.0, "entropy": 0.1790903713554144, "epoch": 0.00766, "frac_reward_zero_std": 0.0, "grad_norm": 1.2004278898239136, "kl": 0.6035262942314148, "learning_rate": 9.999777023177434e-06, "loss": -0.0882, "num_tokens": 14306910.0, "reward": 5.2071757316589355, "reward_std": 9.08076286315918, "rewards/rollout_reward_func/mean": 5.2071757316589355, "rewards/rollout_reward_func/std": 11.35473918914795, "sampling/importance_sampling_ratio/max": 1.9644986391067505, "sampling/importance_sampling_ratio/mean": 0.9806682467460632, "sampling/importance_sampling_ratio/min": 0.4284982681274414, "sampling/sampling_logp_difference/max": 0.6676270961761475, "sampling/sampling_logp_difference/mean": 0.029503734782338142, "step": 383, "step_time": 38.31665977399825 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012152777868323028, "entropy": 0.17896115221083164, "epoch": 0.00768, "grad_norm": 1.2121306657791138, "kl": 0.6167402379214764, "learning_rate": 9.999775736169428e-06, "loss": -0.0902, "step": 384, "step_time": 6.277814937000585 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 2019.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1908.3125, "completions/mean_terminated_length": 1908.3125, "completions/min_length": 1820.0, "completions/min_terminated_length": 1820.0, "entropy": 0.1345885144546628, "epoch": 0.0077, "frac_reward_zero_std": 0.125, "grad_norm": 1.196207880973816, "kl": 0.572353832423687, "learning_rate": 9.99977444545795e-06, "loss": -0.1616, "num_tokens": 14389050.0, "reward": 7.360363006591797, "reward_std": 7.1288251876831055, "rewards/rollout_reward_func/mean": 7.360363006591797, "rewards/rollout_reward_func/std": 11.084699630737305, "sampling/importance_sampling_ratio/max": 1.8938790559768677, "sampling/importance_sampling_ratio/mean": 1.0094572305679321, "sampling/importance_sampling_ratio/min": 0.4132111072540283, "sampling/sampling_logp_difference/max": 1.0937575101852417, "sampling/sampling_logp_difference/mean": 0.02933787927031517, "step": 385, "step_time": 39.433605923999494 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.13119102269411087, "epoch": 0.00772, "grad_norm": 0.9113112688064575, "kl": 0.5914033465087414, "learning_rate": 9.999773151043e-06, "loss": -0.1639, "step": 386, "step_time": 6.731554907000827 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 1996.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 1888.1875, "completions/mean_terminated_length": 1888.1875, "completions/min_length": 1796.0, "completions/min_terminated_length": 1796.0, "entropy": 0.14891765639185905, "epoch": 0.00774, "frac_reward_zero_std": 0.0, "grad_norm": 1.6409744024276733, "kl": 0.8021159842610359, "learning_rate": 9.999771852924581e-06, "loss": -0.0082, "num_tokens": 14469970.0, "reward": 6.270630359649658, "reward_std": 8.007278442382812, "rewards/rollout_reward_func/mean": 6.270630359649658, "rewards/rollout_reward_func/std": 12.005971908569336, "sampling/importance_sampling_ratio/max": 1.5516520738601685, "sampling/importance_sampling_ratio/mean": 0.9231661558151245, "sampling/importance_sampling_ratio/min": 0.45910143852233887, "sampling/sampling_logp_difference/max": 0.6149110794067383, "sampling/sampling_logp_difference/mean": 0.027469176799058914, "step": 387, "step_time": 36.89286300000276 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.008680555620230734, "clip_ratio/low_mean": 0.008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01736111124046147, "entropy": 0.142063251696527, "epoch": 0.00776, "grad_norm": 1.1793173551559448, "kl": 0.7793877236545086, "learning_rate": 9.999770551102692e-06, "loss": -0.0135, "step": 388, "step_time": 6.243275303997507 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 1997.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 1927.0625, "completions/mean_terminated_length": 1927.0625, "completions/min_length": 1840.0, "completions/min_terminated_length": 1840.0, "entropy": 0.15283752977848053, "epoch": 0.00778, "frac_reward_zero_std": 0.0, "grad_norm": 1.9657156467437744, "kl": 0.3845696784555912, "learning_rate": 9.999769245577337e-06, "loss": -0.2581, "num_tokens": 14552332.0, "reward": 10.149282455444336, "reward_std": 9.99736213684082, "rewards/rollout_reward_func/mean": 10.149282455444336, "rewards/rollout_reward_func/std": 15.273449897766113, "sampling/importance_sampling_ratio/max": 2.829928398132324, "sampling/importance_sampling_ratio/mean": 1.1655187606811523, "sampling/importance_sampling_ratio/min": 0.29218825697898865, "sampling/sampling_logp_difference/max": 1.0828132629394531, "sampling/sampling_logp_difference/mean": 0.03259303420782089, "step": 389, "step_time": 35.86770247999448 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.005208333372138441, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012152777868323028, "entropy": 0.15223741345107555, "epoch": 0.0078, "grad_norm": 1.65627121925354, "kl": 0.39045586064457893, "learning_rate": 9.999767936348516e-06, "loss": -0.2629, "step": 390, "step_time": 6.266667372001393 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 2014.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1907.625, "completions/mean_terminated_length": 1907.625, "completions/min_length": 1783.0, "completions/min_terminated_length": 1783.0, "entropy": 0.14941192418336868, "epoch": 0.00782, "frac_reward_zero_std": 0.0, "grad_norm": 1.701319694519043, "kl": 0.4128375668078661, "learning_rate": 9.999766623416231e-06, "loss": -0.0513, "num_tokens": 14633831.0, "reward": 12.501077651977539, "reward_std": 6.659691333770752, "rewards/rollout_reward_func/mean": 12.501077651977539, "rewards/rollout_reward_func/std": 15.406060218811035, "sampling/importance_sampling_ratio/max": 1.780453085899353, "sampling/importance_sampling_ratio/mean": 1.020936131477356, "sampling/importance_sampling_ratio/min": 0.2042788863182068, "sampling/sampling_logp_difference/max": 0.9652459621429443, "sampling/sampling_logp_difference/mean": 0.027463870123028755, "step": 391, "step_time": 37.91702412400082 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.1454289434477687, "epoch": 0.00784, "grad_norm": 1.6639035940170288, "kl": 0.4206230044364929, "learning_rate": 9.999765306780483e-06, "loss": -0.0547, "step": 392, "step_time": 6.735865983999247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 2019.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 1883.46875, "completions/mean_terminated_length": 1883.46875, "completions/min_length": 1663.0, "completions/min_terminated_length": 1663.0, "entropy": 0.1706612892448902, "epoch": 0.00786, "frac_reward_zero_std": 0.0, "grad_norm": 1.9211045503616333, "kl": 0.5242457240819931, "learning_rate": 9.999763986441271e-06, "loss": -0.2397, "num_tokens": 14714639.0, "reward": 5.736878395080566, "reward_std": 10.025382995605469, "rewards/rollout_reward_func/mean": 5.736878395080566, "rewards/rollout_reward_func/std": 15.901412963867188, "sampling/importance_sampling_ratio/max": 2.1215860843658447, "sampling/importance_sampling_ratio/mean": 0.9944747686386108, "sampling/importance_sampling_ratio/min": 0.3210683763027191, "sampling/sampling_logp_difference/max": 1.1642231941223145, "sampling/sampling_logp_difference/mean": 0.03731720149517059, "step": 393, "step_time": 38.35620686900438 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666744276881, "entropy": 0.16822869144380093, "epoch": 0.00788, "grad_norm": 1.7900792360305786, "kl": 0.563917126506567, "learning_rate": 9.999762662398599e-06, "loss": -0.2418, "step": 394, "step_time": 6.265376314000605 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009027778054587543, "completions/clipped_ratio": 0.0, "completions/max_length": 2005.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1748.75, "completions/mean_terminated_length": 1748.75, "completions/min_length": 639.0, "completions/min_terminated_length": 639.0, "entropy": 0.15377911366522312, "epoch": 0.0079, "frac_reward_zero_std": 0.0, "grad_norm": 1.2754374742507935, "kl": 0.6537161991000175, "learning_rate": 9.999761334652469e-06, "loss": 0.037, "num_tokens": 14791302.0, "reward": 3.2958414554595947, "reward_std": 6.388683319091797, "rewards/rollout_reward_func/mean": 3.2958414554595947, "rewards/rollout_reward_func/std": 14.59126091003418, "sampling/importance_sampling_ratio/max": 1.8531484603881836, "sampling/importance_sampling_ratio/mean": 0.9730424880981445, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.1739468574523926, "sampling/sampling_logp_difference/mean": 0.030640186741948128, "step": 395, "step_time": 34.660050579997915 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008680555620230734, "entropy": 0.15109152905642986, "epoch": 0.00792, "grad_norm": 1.3109846115112305, "kl": 0.6577336862683296, "learning_rate": 9.999760003202882e-06, "loss": 0.0344, "step": 396, "step_time": 6.238461292003194 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "completions/clipped_ratio": 0.0, "completions/max_length": 1997.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 1870.09375, "completions/mean_terminated_length": 1870.09375, "completions/min_length": 1662.0, "completions/min_terminated_length": 1662.0, "entropy": 0.15775778237730265, "epoch": 0.00794, "frac_reward_zero_std": 0.0, "grad_norm": 1.4070671796798706, "kl": 0.6230524256825447, "learning_rate": 9.999758668049834e-06, "loss": -0.0548, "num_tokens": 14871639.0, "reward": 10.42796802520752, "reward_std": 8.001551628112793, "rewards/rollout_reward_func/mean": 10.42796802520752, "rewards/rollout_reward_func/std": 13.233992576599121, "sampling/importance_sampling_ratio/max": 1.9538705348968506, "sampling/importance_sampling_ratio/mean": 0.9155502319335938, "sampling/importance_sampling_ratio/min": 9.152499071130027e-12, "sampling/sampling_logp_difference/max": 24.498069763183594, "sampling/sampling_logp_difference/mean": 0.10757236927747726, "step": 397, "step_time": 37.941033778999554 }, { "clip_ratio/high_max": 0.006761695956811309, "clip_ratio/high_mean": 0.0033808479784056544, "clip_ratio/low_mean": 0.005208333372138441, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008589181350544095, "entropy": 0.15627853386104107, "epoch": 0.00796, "grad_norm": 1.5508379936218262, "kl": 0.6376640871167183, "learning_rate": 9.999757329193334e-06, "loss": -0.0563, "step": 398, "step_time": 6.230076591997204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1998.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1835.84375, "completions/mean_terminated_length": 1835.84375, "completions/min_length": 1691.0, "completions/min_terminated_length": 1691.0, "entropy": 0.13853330817073584, "epoch": 0.00798, "frac_reward_zero_std": 0.125, "grad_norm": 1.629981517791748, "kl": 0.5573707222938538, "learning_rate": 9.999755986633378e-06, "loss": 0.0474, "num_tokens": 14950879.0, "reward": 2.0207910537719727, "reward_std": 7.94326114654541, "rewards/rollout_reward_func/mean": 2.0207910537719727, "rewards/rollout_reward_func/std": 15.21265983581543, "sampling/importance_sampling_ratio/max": 1.5181382894515991, "sampling/importance_sampling_ratio/mean": 0.9647513628005981, "sampling/importance_sampling_ratio/min": 0.04904457926750183, "sampling/sampling_logp_difference/max": 1.2537705898284912, "sampling/sampling_logp_difference/mean": 0.03419237583875656, "step": 399, "step_time": 37.17983689799985 }, { "clip_ratio/high_max": 0.01736111124046147, "clip_ratio/high_mean": 0.008680555620230734, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012152777868323028, "entropy": 0.14058438315987587, "epoch": 0.008, "grad_norm": 0.6722509264945984, "kl": 0.5397394970059395, "learning_rate": 9.999754640369969e-06, "loss": 0.0452, "step": 400, "step_time": 6.206834619999427 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.03125, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1883.46875, "completions/mean_terminated_length": 1882.0, "completions/min_length": 1799.0, "completions/min_terminated_length": 1799.0, "entropy": 0.13088950607925653, "epoch": 0.00802, "frac_reward_zero_std": 0.0, "grad_norm": 2.2955660820007324, "kl": 0.44351304322481155, "learning_rate": 9.99975329040311e-06, "loss": -0.0213, "num_tokens": 15031668.0, "reward": 7.0081658363342285, "reward_std": 4.4600934982299805, "rewards/rollout_reward_func/mean": 7.0081658363342285, "rewards/rollout_reward_func/std": 6.16019344329834, "sampling/importance_sampling_ratio/max": 1.7051112651824951, "sampling/importance_sampling_ratio/mean": 1.0153717994689941, "sampling/importance_sampling_ratio/min": 0.27369314432144165, "sampling/sampling_logp_difference/max": 1.2073643207550049, "sampling/sampling_logp_difference/mean": 0.02993970364332199, "step": 401, "step_time": 37.11414306700135 }, { "clip_ratio/high_max": 0.01736111124046147, "clip_ratio/high_mean": 0.010416666744276881, "clip_ratio/low_mean": 0.005208333372138441, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625000116415322, "entropy": 0.13208830822259188, "epoch": 0.00804, "grad_norm": 1.393228530883789, "kl": 0.4656967334449291, "learning_rate": 9.9997519367328e-06, "loss": -0.0256, "step": 402, "step_time": 6.654871525000999 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1908.125, "completions/mean_terminated_length": 1908.125, "completions/min_length": 1816.0, "completions/min_terminated_length": 1816.0, "entropy": 0.10632855352014303, "epoch": 0.00806, "frac_reward_zero_std": 0.375, "grad_norm": 1.1967393159866333, "kl": 0.6505869217216969, "learning_rate": 9.999750579359042e-06, "loss": 0.0394, "num_tokens": 15113625.0, "reward": 7.391304969787598, "reward_std": 4.511048316955566, "rewards/rollout_reward_func/mean": 7.391304969787598, "rewards/rollout_reward_func/std": 12.610478401184082, "sampling/importance_sampling_ratio/max": 1.6840932369232178, "sampling/importance_sampling_ratio/mean": 1.005039930343628, "sampling/importance_sampling_ratio/min": 0.2297038733959198, "sampling/sampling_logp_difference/max": 1.0730221271514893, "sampling/sampling_logp_difference/mean": 0.023349303752183914, "step": 403, "step_time": 37.35070921799888 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.10929813794791698, "epoch": 0.00808, "grad_norm": 0.7991507649421692, "kl": 0.564202968031168, "learning_rate": 9.999749218281836e-06, "loss": 0.0374, "step": 404, "step_time": 6.741287571996509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 1994.0, "completions/max_terminated_length": 1994.0, "completions/mean_length": 1666.90625, "completions/mean_terminated_length": 1666.90625, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "entropy": 0.15686850529164076, "epoch": 0.0081, "frac_reward_zero_std": 0.125, "grad_norm": 3.0047717094421387, "kl": 0.44304889999330044, "learning_rate": 9.999747853501184e-06, "loss": 0.0822, "num_tokens": 15187523.0, "reward": 12.523664474487305, "reward_std": 6.427609443664551, "rewards/rollout_reward_func/mean": 12.523664474487305, "rewards/rollout_reward_func/std": 15.645023345947266, "sampling/importance_sampling_ratio/max": 2.3568639755249023, "sampling/importance_sampling_ratio/mean": 1.1489346027374268, "sampling/importance_sampling_ratio/min": 0.22255395352840424, "sampling/sampling_logp_difference/max": 0.6474602222442627, "sampling/sampling_logp_difference/mean": 0.030321069061756134, "step": 405, "step_time": 33.77085295799952 }, { "clip_ratio/high_max": 0.0130876072216779, "clip_ratio/high_mean": 0.008279914851300418, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015224359231069684, "entropy": 0.16008016001433134, "epoch": 0.00812, "grad_norm": 1.3276525735855103, "kl": 0.4322241246700287, "learning_rate": 9.999746485017087e-06, "loss": 0.08, "step": 406, "step_time": 6.167374660999485 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 1979.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 1787.96875, "completions/mean_terminated_length": 1787.96875, "completions/min_length": 675.0, "completions/min_terminated_length": 675.0, "entropy": 0.1374366506934166, "epoch": 0.00814, "frac_reward_zero_std": 0.0, "grad_norm": 1.628904938697815, "kl": 0.4086805544793606, "learning_rate": 9.999745112829547e-06, "loss": -0.128, "num_tokens": 15265477.0, "reward": 9.787149429321289, "reward_std": 11.904574394226074, "rewards/rollout_reward_func/mean": 9.787149429321289, "rewards/rollout_reward_func/std": 14.022072792053223, "sampling/importance_sampling_ratio/max": 2.199753999710083, "sampling/importance_sampling_ratio/mean": 1.1246109008789062, "sampling/importance_sampling_ratio/min": 0.6423606276512146, "sampling/sampling_logp_difference/max": 0.681222677230835, "sampling/sampling_logp_difference/mean": 0.022375933825969696, "step": 407, "step_time": 36.12690065800416 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "entropy": 0.13620022125542164, "epoch": 0.00816, "grad_norm": 1.2189370393753052, "kl": 0.39171487279236317, "learning_rate": 9.999743736938565e-06, "loss": -0.1329, "step": 408, "step_time": 6.64129104499807 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 1995.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1771.5, "completions/mean_terminated_length": 1771.5, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "entropy": 0.16864918265491724, "epoch": 0.00818, "frac_reward_zero_std": 0.125, "grad_norm": 0.9522514939308167, "kl": 0.5428624674677849, "learning_rate": 9.999742357344142e-06, "loss": -0.0073, "num_tokens": 15342877.0, "reward": 9.139594078063965, "reward_std": 3.8079919815063477, "rewards/rollout_reward_func/mean": 9.139594078063965, "rewards/rollout_reward_func/std": 14.156567573547363, "sampling/importance_sampling_ratio/max": 2.7841055393218994, "sampling/importance_sampling_ratio/mean": 0.9111255407333374, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.997159481048584, "sampling/sampling_logp_difference/mean": 0.03262122720479965, "step": 409, "step_time": 36.596215775998644 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "entropy": 0.16705580614507198, "epoch": 0.0082, "grad_norm": 1.053119421005249, "kl": 0.5281503275036812, "learning_rate": 9.999740974046281e-06, "loss": -0.0093, "step": 410, "step_time": 6.708477361000405 }, { "clip_ratio/high_max": 0.011054942850023508, "clip_ratio/high_mean": 0.005527471425011754, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005527471425011754, "completions/clipped_ratio": 0.0, "completions/max_length": 1967.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 1785.96875, "completions/mean_terminated_length": 1785.96875, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "entropy": 0.18522152118384838, "epoch": 0.00822, "frac_reward_zero_std": 0.125, "grad_norm": 1.4419904947280884, "kl": 0.6027393564581871, "learning_rate": 9.999739587044982e-06, "loss": -0.0636, "num_tokens": 15420534.0, "reward": 2.8079020977020264, "reward_std": 6.569200038909912, "rewards/rollout_reward_func/mean": 2.8079020977020264, "rewards/rollout_reward_func/std": 14.606973648071289, "sampling/importance_sampling_ratio/max": 1.9993548393249512, "sampling/importance_sampling_ratio/mean": 0.8649469614028931, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 27.178010940551758, "sampling/sampling_logp_difference/mean": 0.08464864641427994, "step": 411, "step_time": 35.07087018799393 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.010542306350544095, "clip_ratio/low_min": 0.003289473708719015, "clip_ratio/region_mean": 0.014014528598636389, "entropy": 0.18445036374032497, "epoch": 0.00824, "grad_norm": 1.382118582725525, "kl": 0.6093035973608494, "learning_rate": 9.999738196340246e-06, "loss": -0.0667, "step": 412, "step_time": 6.159181504002845 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 2001.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 1912.59375, "completions/mean_terminated_length": 1912.59375, "completions/min_length": 1538.0, "completions/min_terminated_length": 1538.0, "entropy": 0.15822596568614244, "epoch": 0.00826, "frac_reward_zero_std": 0.125, "grad_norm": 1.5164858102798462, "kl": 0.3464791551232338, "learning_rate": 9.999736801932072e-06, "loss": -0.092, "num_tokens": 15502267.0, "reward": 10.420038223266602, "reward_std": 5.580462455749512, "rewards/rollout_reward_func/mean": 10.420038223266602, "rewards/rollout_reward_func/std": 16.989425659179688, "sampling/importance_sampling_ratio/max": 2.6757559776306152, "sampling/importance_sampling_ratio/mean": 0.9447764158248901, "sampling/importance_sampling_ratio/min": 1.2121374737272816e-10, "sampling/sampling_logp_difference/max": 22.602230072021484, "sampling/sampling_logp_difference/mean": 0.06792205572128296, "step": 413, "step_time": 37.483222671000476 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.005116959218867123, "clip_ratio/low_min": 0.003289473708719015, "clip_ratio/region_mean": 0.00685307034291327, "entropy": 0.15620272234082222, "epoch": 0.00828, "grad_norm": 1.027227759361267, "kl": 0.34561512246727943, "learning_rate": 9.999735403820467e-06, "loss": -0.0951, "step": 414, "step_time": 6.721208181003021 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.003689236007630825, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005425347131676972, "completions/clipped_ratio": 0.0, "completions/max_length": 2002.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1852.9375, "completions/mean_terminated_length": 1852.9375, "completions/min_length": 1307.0, "completions/min_terminated_length": 1307.0, "entropy": 0.19122237898409367, "epoch": 0.0083, "frac_reward_zero_std": 0.0, "grad_norm": 2.5304763317108154, "kl": 0.4231012538075447, "learning_rate": 9.999734002005428e-06, "loss": -0.0365, "num_tokens": 15581901.0, "reward": 6.599876403808594, "reward_std": 4.858182907104492, "rewards/rollout_reward_func/mean": 6.599876403808594, "rewards/rollout_reward_func/std": 10.637002944946289, "sampling/importance_sampling_ratio/max": 1.754501223564148, "sampling/importance_sampling_ratio/mean": 1.1050997972488403, "sampling/importance_sampling_ratio/min": 0.25575336813926697, "sampling/sampling_logp_difference/max": 0.7238872051239014, "sampling/sampling_logp_difference/mean": 0.03033018298447132, "step": 415, "step_time": 36.794177785999636 }, { "clip_ratio/high_max": 0.013134058099240065, "clip_ratio/high_mean": 0.008303140290081501, "clip_ratio/low_mean": 0.008680555620230734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016983695910312235, "entropy": 0.18879046849906445, "epoch": 0.00832, "grad_norm": 1.5424190759658813, "kl": 0.4105800986289978, "learning_rate": 9.99973259648696e-06, "loss": -0.0417, "step": 416, "step_time": 6.229540733997055 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "completions/clipped_ratio": 0.0, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1897.0, "completions/mean_terminated_length": 1897.0, "completions/min_length": 1626.0, "completions/min_terminated_length": 1626.0, "entropy": 0.12648662831634283, "epoch": 0.00834, "frac_reward_zero_std": 0.25, "grad_norm": 1.2874664068222046, "kl": 0.5799643807113171, "learning_rate": 9.999731187265061e-06, "loss": -0.0132, "num_tokens": 15663076.0, "reward": 9.055243492126465, "reward_std": 5.7032575607299805, "rewards/rollout_reward_func/mean": 9.055243492126465, "rewards/rollout_reward_func/std": 21.282480239868164, "sampling/importance_sampling_ratio/max": 1.8616880178451538, "sampling/importance_sampling_ratio/mean": 1.0218424797058105, "sampling/importance_sampling_ratio/min": 0.2051515281200409, "sampling/sampling_logp_difference/max": 0.9839389324188232, "sampling/sampling_logp_difference/mean": 0.025066856294870377, "step": 417, "step_time": 36.59973449099925 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.005310457549057901, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008782679797150195, "entropy": 0.12579865287989378, "epoch": 0.00836, "grad_norm": 1.2702187299728394, "kl": 0.5678221099078655, "learning_rate": 9.999729774339734e-06, "loss": -0.0158, "step": 418, "step_time": 6.2664286570015975 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008680555620230734, "completions/clipped_ratio": 0.0, "completions/max_length": 1971.0, "completions/max_terminated_length": 1971.0, "completions/mean_length": 1882.1875, "completions/mean_terminated_length": 1882.1875, "completions/min_length": 1722.0, "completions/min_terminated_length": 1722.0, "entropy": 0.1713708434253931, "epoch": 0.00838, "frac_reward_zero_std": 0.0, "grad_norm": 1.8853299617767334, "kl": 0.40060317888855934, "learning_rate": 9.999728357710979e-06, "loss": -0.1656, "num_tokens": 15743610.0, "reward": -1.3480520248413086, "reward_std": 6.245339393615723, "rewards/rollout_reward_func/mean": -1.3480520248413086, "rewards/rollout_reward_func/std": 14.265408515930176, "sampling/importance_sampling_ratio/max": 2.072874069213867, "sampling/importance_sampling_ratio/mean": 0.9604393243789673, "sampling/importance_sampling_ratio/min": 0.2704717814922333, "sampling/sampling_logp_difference/max": 1.3232176303863525, "sampling/sampling_logp_difference/mean": 0.03345388546586037, "step": 419, "step_time": 38.095200251995266 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.008680555620230734, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.010416666860692203, "entropy": 0.16807264648377895, "epoch": 0.0084, "grad_norm": 1.7478289604187012, "kl": 0.4048056434839964, "learning_rate": 9.999726937378799e-06, "loss": -0.1689, "step": 420, "step_time": 6.216570853004669 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.004315476398915052, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006398809840902686, "completions/clipped_ratio": 0.0, "completions/max_length": 1977.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 1700.3125, "completions/mean_terminated_length": 1700.3125, "completions/min_length": 996.0, "completions/min_terminated_length": 996.0, "entropy": 0.13674810901284218, "epoch": 0.00842, "frac_reward_zero_std": 0.125, "grad_norm": 2.3826522827148438, "kl": 0.8743790201842785, "learning_rate": 9.999725513343196e-06, "loss": 0.1654, "num_tokens": 15818763.0, "reward": 5.071457862854004, "reward_std": 5.412266254425049, "rewards/rollout_reward_func/mean": 5.071457862854004, "rewards/rollout_reward_func/std": 18.597326278686523, "sampling/importance_sampling_ratio/max": 2.0363876819610596, "sampling/importance_sampling_ratio/mean": 1.0418556928634644, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9595718383789062, "sampling/sampling_logp_difference/mean": 0.029438920319080353, "step": 421, "step_time": 32.92347357299877 }, { "clip_ratio/high_max": 0.02738095331005752, "clip_ratio/high_mean": 0.015426587779074907, "clip_ratio/low_mean": 0.007787698763422668, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.023214286658912897, "entropy": 0.13682854734361172, "epoch": 0.00844, "grad_norm": 1.5287761688232422, "kl": 0.8074955753982067, "learning_rate": 9.99972408560417e-06, "loss": 0.1613, "step": 422, "step_time": 6.16248918799829 }, { "clip_ratio/high_max": 0.0036764706019312143, "clip_ratio/high_mean": 0.0018382353009656072, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018382353009656072, "completions/clipped_ratio": 0.0, "completions/max_length": 1986.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1831.90625, "completions/mean_terminated_length": 1831.90625, "completions/min_length": 1436.0, "completions/min_terminated_length": 1436.0, "entropy": 0.14473330415785313, "epoch": 0.00846, "frac_reward_zero_std": 0.0, "grad_norm": 1.183998703956604, "kl": 0.39037713408470154, "learning_rate": 9.999722654161723e-06, "loss": -0.0322, "num_tokens": 15897649.0, "reward": 12.873044967651367, "reward_std": 3.1303045749664307, "rewards/rollout_reward_func/mean": 12.873044967651367, "rewards/rollout_reward_func/std": 16.81613540649414, "sampling/importance_sampling_ratio/max": 2.278865337371826, "sampling/importance_sampling_ratio/mean": 0.936629593372345, "sampling/importance_sampling_ratio/min": 1.0781457107297832e-12, "sampling/sampling_logp_difference/max": 27.504920959472656, "sampling/sampling_logp_difference/mean": 0.08040793240070343, "step": 423, "step_time": 34.50863980300164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003689236124046147, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003689236124046147, "entropy": 0.14420728944242, "epoch": 0.00848, "grad_norm": 1.123458743095398, "kl": 0.3863917402923107, "learning_rate": 9.999721219015855e-06, "loss": -0.0343, "step": 424, "step_time": 6.203808428001139 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006944444612599909, "completions/clipped_ratio": 0.0, "completions/max_length": 1986.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1907.59375, "completions/mean_terminated_length": 1907.59375, "completions/min_length": 1846.0, "completions/min_terminated_length": 1846.0, "entropy": 0.1717237215489149, "epoch": 0.0085, "frac_reward_zero_std": 0.0, "grad_norm": 2.2574822902679443, "kl": 0.45831191912293434, "learning_rate": 9.999719780166568e-06, "loss": -0.0116, "num_tokens": 15979198.0, "reward": 12.526996612548828, "reward_std": 7.190093994140625, "rewards/rollout_reward_func/mean": 12.526996612548828, "rewards/rollout_reward_func/std": 11.75661563873291, "sampling/importance_sampling_ratio/max": 2.549557685852051, "sampling/importance_sampling_ratio/mean": 1.156769037246704, "sampling/importance_sampling_ratio/min": 0.5638684034347534, "sampling/sampling_logp_difference/max": 0.8117094039916992, "sampling/sampling_logp_difference/mean": 0.031035717576742172, "step": 425, "step_time": 37.6721021070025 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.005208333372138441, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012152777868323028, "entropy": 0.17174869030714035, "epoch": 0.00852, "grad_norm": 1.9043995141983032, "kl": 0.4626782052218914, "learning_rate": 9.999718337613866e-06, "loss": -0.0129, "step": 426, "step_time": 6.2495746619970305 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 1981.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 1863.90625, "completions/mean_terminated_length": 1863.90625, "completions/min_length": 1682.0, "completions/min_terminated_length": 1682.0, "entropy": 0.15208253264427185, "epoch": 0.00854, "frac_reward_zero_std": 0.0, "grad_norm": 1.7153781652450562, "kl": 0.6745771653950214, "learning_rate": 9.999716891357747e-06, "loss": 0.0219, "num_tokens": 16059187.0, "reward": 5.894711017608643, "reward_std": 5.455031394958496, "rewards/rollout_reward_func/mean": 5.894711017608643, "rewards/rollout_reward_func/std": 12.694339752197266, "sampling/importance_sampling_ratio/max": 2.351686716079712, "sampling/importance_sampling_ratio/mean": 1.042860746383667, "sampling/importance_sampling_ratio/min": 5.802730258103184e-13, "sampling/sampling_logp_difference/max": 27.137590408325195, "sampling/sampling_logp_difference/mean": 0.0840056762099266, "step": 427, "step_time": 37.95969708299526 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008680555620230734, "entropy": 0.1548395473510027, "epoch": 0.00856, "grad_norm": 1.2103983163833618, "kl": 0.6520356982946396, "learning_rate": 9.999715441398214e-06, "loss": 0.0201, "step": 428, "step_time": 6.195645353001964 }, { "clip_ratio/high_max": 0.006761695956811309, "clip_ratio/high_mean": 0.0033808479784056544, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005116959102451801, "completions/clipped_ratio": 0.0, "completions/max_length": 1967.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 1780.53125, "completions/mean_terminated_length": 1780.53125, "completions/min_length": 1025.0, "completions/min_terminated_length": 1025.0, "entropy": 0.13885898049920797, "epoch": 0.00858, "frac_reward_zero_std": 0.0, "grad_norm": 2.0375118255615234, "kl": 0.5098150111734867, "learning_rate": 9.99971398773527e-06, "loss": -0.0296, "num_tokens": 16136334.0, "reward": 12.420064926147461, "reward_std": 8.215323448181152, "rewards/rollout_reward_func/mean": 12.420064926147461, "rewards/rollout_reward_func/std": 17.566680908203125, "sampling/importance_sampling_ratio/max": 2.0209267139434814, "sampling/importance_sampling_ratio/mean": 1.0481691360473633, "sampling/importance_sampling_ratio/min": 9.511323152688878e-12, "sampling/sampling_logp_difference/max": 24.24110984802246, "sampling/sampling_logp_difference/mean": 0.07432256639003754, "step": 429, "step_time": 34.64603473199713 }, { "clip_ratio/high_max": 0.003289473708719015, "clip_ratio/high_mean": 0.0016447368543595076, "clip_ratio/low_mean": 0.01215277798473835, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.013797514839097857, "entropy": 0.1355869797989726, "epoch": 0.0086, "grad_norm": 1.5481940507888794, "kl": 0.5054982472211123, "learning_rate": 9.999712530368912e-06, "loss": -0.0355, "step": 430, "step_time": 6.177134515002763 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "completions/clipped_ratio": 0.03125, "completions/max_length": 1976.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 1879.0, "completions/mean_terminated_length": 1880.4193115234375, "completions/min_length": 1756.0, "completions/min_terminated_length": 1756.0, "entropy": 0.15621676482260227, "epoch": 0.00862, "frac_reward_zero_std": 0.0, "grad_norm": 1.5531624555587769, "kl": 0.5274980738759041, "learning_rate": 9.999711069299145e-06, "loss": -0.0229, "num_tokens": 16217117.0, "reward": 10.379293441772461, "reward_std": 6.365335464477539, "rewards/rollout_reward_func/mean": 10.379293441772461, "rewards/rollout_reward_func/std": 9.409771919250488, "sampling/importance_sampling_ratio/max": 1.8403072357177734, "sampling/importance_sampling_ratio/mean": 0.9792887568473816, "sampling/importance_sampling_ratio/min": 0.1641586869955063, "sampling/sampling_logp_difference/max": 1.2717455625534058, "sampling/sampling_logp_difference/mean": 0.0406506210565567, "step": 431, "step_time": 36.80640686400329 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.006458333344198763, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009930555592291057, "entropy": 0.1552266152575612, "epoch": 0.00864, "grad_norm": 1.1488696336746216, "kl": 0.5495500713586807, "learning_rate": 9.999709604525971e-06, "loss": -0.027, "step": 432, "step_time": 6.695867144002477 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "completions/clipped_ratio": 0.0, "completions/max_length": 1980.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 1859.1875, "completions/mean_terminated_length": 1859.1875, "completions/min_length": 1568.0, "completions/min_terminated_length": 1568.0, "entropy": 0.1606634296476841, "epoch": 0.00866, "frac_reward_zero_std": 0.0, "grad_norm": 2.1394176483154297, "kl": 0.5905432924628258, "learning_rate": 9.999708136049389e-06, "loss": -0.0213, "num_tokens": 16297219.0, "reward": 10.730379104614258, "reward_std": 10.182910919189453, "rewards/rollout_reward_func/mean": 10.730379104614258, "rewards/rollout_reward_func/std": 14.116110801696777, "sampling/importance_sampling_ratio/max": 1.9054006338119507, "sampling/importance_sampling_ratio/mean": 1.0243314504623413, "sampling/importance_sampling_ratio/min": 0.28989264369010925, "sampling/sampling_logp_difference/max": 1.0515797138214111, "sampling/sampling_logp_difference/mean": 0.03440077230334282, "step": 433, "step_time": 37.53456605399697 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.008782679797150195, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.012254902045242488, "entropy": 0.1584420707076788, "epoch": 0.00868, "grad_norm": 1.6995211839675903, "kl": 0.6047380901873112, "learning_rate": 9.9997066638694e-06, "loss": -0.0252, "step": 434, "step_time": 6.179048164000051 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005425347248092294, "completions/clipped_ratio": 0.0, "completions/max_length": 2027.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 1828.78125, "completions/mean_terminated_length": 1828.78125, "completions/min_length": 1343.0, "completions/min_terminated_length": 1343.0, "entropy": 0.12900556158274412, "epoch": 0.0087, "frac_reward_zero_std": 0.125, "grad_norm": 1.3848490715026855, "kl": 0.5210922621190548, "learning_rate": 9.99970518798601e-06, "loss": -0.0456, "num_tokens": 16375969.0, "reward": 9.6665620803833, "reward_std": 6.750264644622803, "rewards/rollout_reward_func/mean": 9.6665620803833, "rewards/rollout_reward_func/std": 19.660367965698242, "sampling/importance_sampling_ratio/max": 2.6867897510528564, "sampling/importance_sampling_ratio/mean": 1.0070596933364868, "sampling/importance_sampling_ratio/min": 0.13064204156398773, "sampling/sampling_logp_difference/max": 1.4016146659851074, "sampling/sampling_logp_difference/mean": 0.03491336107254028, "step": 435, "step_time": 35.39866553399952 }, { "clip_ratio/high_max": 0.018229166977107525, "clip_ratio/high_mean": 0.009114583488553762, "clip_ratio/low_mean": 0.007161458255723119, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.01627604174427688, "entropy": 0.12771923653781414, "epoch": 0.00872, "grad_norm": 1.2393711805343628, "kl": 0.5349139347672462, "learning_rate": 9.999703708399216e-06, "loss": -0.0498, "step": 436, "step_time": 6.720962448001956 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666744276881, "completions/clipped_ratio": 0.0, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1850.8125, "completions/mean_terminated_length": 1850.8125, "completions/min_length": 995.0, "completions/min_terminated_length": 995.0, "entropy": 0.14382985513657331, "epoch": 0.00874, "frac_reward_zero_std": 0.0, "grad_norm": 1.190502405166626, "kl": 0.5796581096947193, "learning_rate": 9.99970222510902e-06, "loss": -0.0328, "num_tokens": 16456143.0, "reward": 9.858323097229004, "reward_std": 9.86341381072998, "rewards/rollout_reward_func/mean": 9.858323097229004, "rewards/rollout_reward_func/std": 15.427154541015625, "sampling/importance_sampling_ratio/max": 1.8721684217453003, "sampling/importance_sampling_ratio/mean": 1.100501537322998, "sampling/importance_sampling_ratio/min": 0.25606006383895874, "sampling/sampling_logp_difference/max": 0.986447811126709, "sampling/sampling_logp_difference/mean": 0.031652357429265976, "step": 437, "step_time": 35.963873100005 }, { "clip_ratio/high_max": 0.007638889132067561, "clip_ratio/high_mean": 0.005555555806495249, "clip_ratio/low_mean": 0.008680555620230734, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.014236111426725984, "entropy": 0.13910409063100815, "epoch": 0.00876, "grad_norm": 1.3543274402618408, "kl": 0.6213464625179768, "learning_rate": 9.999700738115424e-06, "loss": -0.0356, "step": 438, "step_time": 6.740783157998521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1965.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 1871.875, "completions/mean_terminated_length": 1871.875, "completions/min_length": 1788.0, "completions/min_terminated_length": 1788.0, "entropy": 0.14204278402030468, "epoch": 0.00878, "frac_reward_zero_std": 0.125, "grad_norm": 2.5433712005615234, "kl": 0.40115803107619286, "learning_rate": 9.999699247418431e-06, "loss": 0.0396, "num_tokens": 16536454.0, "reward": 3.6463284492492676, "reward_std": 7.665958404541016, "rewards/rollout_reward_func/mean": 3.6463284492492676, "rewards/rollout_reward_func/std": 10.984175682067871, "sampling/importance_sampling_ratio/max": 1.6518703699111938, "sampling/importance_sampling_ratio/mean": 1.0737829208374023, "sampling/importance_sampling_ratio/min": 0.4997839033603668, "sampling/sampling_logp_difference/max": 0.924556314945221, "sampling/sampling_logp_difference/mean": 0.02629711665213108, "step": 439, "step_time": 38.01396525000018 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.008680555620230734, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012152777868323028, "entropy": 0.14073345065116882, "epoch": 0.0088, "grad_norm": 1.6631256341934204, "kl": 0.4030180908739567, "learning_rate": 9.999697753018042e-06, "loss": 0.0307, "step": 440, "step_time": 6.178545857001154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1881.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 1798.34375, "completions/mean_terminated_length": 1798.34375, "completions/min_length": 1601.0, "completions/min_terminated_length": 1601.0, "entropy": 0.10215338412672281, "epoch": 0.00882, "frac_reward_zero_std": 0.125, "grad_norm": 1.50302255153656, "kl": 0.37106287851929665, "learning_rate": 9.999696254914256e-06, "loss": -0.0679, "num_tokens": 16614920.0, "reward": 6.441880226135254, "reward_std": 6.458446502685547, "rewards/rollout_reward_func/mean": 6.441880226135254, "rewards/rollout_reward_func/std": 15.004520416259766, "sampling/importance_sampling_ratio/max": 2.006425380706787, "sampling/importance_sampling_ratio/mean": 1.0931893587112427, "sampling/importance_sampling_ratio/min": 0.43766847252845764, "sampling/sampling_logp_difference/max": 0.5878493785858154, "sampling/sampling_logp_difference/mean": 0.014898103661835194, "step": 441, "step_time": 36.059925193998424 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.10211750213056803, "epoch": 0.00884, "grad_norm": 1.5077581405639648, "kl": 0.3891483061015606, "learning_rate": 9.999694753107077e-06, "loss": -0.0722, "step": 442, "step_time": 6.373549443997035 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 1980.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 1874.3125, "completions/mean_terminated_length": 1874.3125, "completions/min_length": 1676.0, "completions/min_terminated_length": 1676.0, "entropy": 0.12804421968758106, "epoch": 0.00886, "frac_reward_zero_std": 0.25, "grad_norm": 1.4372204542160034, "kl": 0.612975912168622, "learning_rate": 9.999693247596505e-06, "loss": -0.0372, "num_tokens": 16695143.0, "reward": 9.780478477478027, "reward_std": 10.539976119995117, "rewards/rollout_reward_func/mean": 9.780478477478027, "rewards/rollout_reward_func/std": 18.792755126953125, "sampling/importance_sampling_ratio/max": 1.8610286712646484, "sampling/importance_sampling_ratio/mean": 0.9242000579833984, "sampling/importance_sampling_ratio/min": 7.589735326295589e-18, "sampling/sampling_logp_difference/max": 23.898208618164062, "sampling/sampling_logp_difference/mean": 0.09211704879999161, "step": 443, "step_time": 40.27333003700005 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008680555620230734, "entropy": 0.13046453520655632, "epoch": 0.00888, "grad_norm": 1.1817734241485596, "kl": 0.6209859363734722, "learning_rate": 9.999691738382544e-06, "loss": -0.0414, "step": 444, "step_time": 6.1905242890006775 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "completions/clipped_ratio": 0.0, "completions/max_length": 2033.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1881.84375, "completions/mean_terminated_length": 1881.84375, "completions/min_length": 1777.0, "completions/min_terminated_length": 1777.0, "entropy": 0.10738935228437185, "epoch": 0.0089, "frac_reward_zero_std": 0.25, "grad_norm": 1.587774634361267, "kl": 0.6463839821517467, "learning_rate": 9.999690225465193e-06, "loss": -0.1007, "num_tokens": 16776207.0, "reward": 3.2240819931030273, "reward_std": 3.1243185997009277, "rewards/rollout_reward_func/mean": 3.2240819931030273, "rewards/rollout_reward_func/std": 8.129782676696777, "sampling/importance_sampling_ratio/max": 2.0063812732696533, "sampling/importance_sampling_ratio/mean": 0.9566425681114197, "sampling/importance_sampling_ratio/min": 0.057423632591962814, "sampling/sampling_logp_difference/max": 1.6200556755065918, "sampling/sampling_logp_difference/mean": 0.027168117463588715, "step": 445, "step_time": 39.72193665600025 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.008680555620230734, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625000232830644, "entropy": 0.10969764646142721, "epoch": 0.00892, "grad_norm": 1.0914807319641113, "kl": 0.7086209692060947, "learning_rate": 9.999688708844452e-06, "loss": -0.1024, "step": 446, "step_time": 6.31956128499769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "completions/clipped_ratio": 0.0, "completions/max_length": 2002.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1893.3125, "completions/mean_terminated_length": 1893.3125, "completions/min_length": 1636.0, "completions/min_terminated_length": 1636.0, "entropy": 0.13962982781231403, "epoch": 0.00894, "frac_reward_zero_std": 0.125, "grad_norm": 1.1110786199569702, "kl": 0.31114745885133743, "learning_rate": 9.999687188520328e-06, "loss": -0.0955, "num_tokens": 16857449.0, "reward": 9.09254264831543, "reward_std": 6.603111267089844, "rewards/rollout_reward_func/mean": 9.09254264831543, "rewards/rollout_reward_func/std": 12.929021835327148, "sampling/importance_sampling_ratio/max": 1.9707322120666504, "sampling/importance_sampling_ratio/mean": 0.9114285111427307, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 25.772886276245117, "sampling/sampling_logp_difference/mean": 0.07303920388221741, "step": 447, "step_time": 38.44917293900107 }, { "clip_ratio/high_max": 0.020833333488553762, "clip_ratio/high_mean": 0.010416666744276881, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012152777868323028, "entropy": 0.14250141568481922, "epoch": 0.00896, "grad_norm": 0.9470910429954529, "kl": 0.30660218372941017, "learning_rate": 9.999685664492816e-06, "loss": -0.0989, "step": 448, "step_time": 6.258842459999869 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.003574346425011754, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005310457549057901, "completions/clipped_ratio": 0.0, "completions/max_length": 1969.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 1866.1875, "completions/mean_terminated_length": 1866.1875, "completions/min_length": 1566.0, "completions/min_terminated_length": 1566.0, "entropy": 0.1336675025522709, "epoch": 0.00898, "frac_reward_zero_std": 0.0, "grad_norm": 1.4246325492858887, "kl": 0.35916537791490555, "learning_rate": 9.999684136761924e-06, "loss": 0.043, "num_tokens": 16937767.0, "reward": 2.6347501277923584, "reward_std": 11.172698974609375, "rewards/rollout_reward_func/mean": 2.6347501277923584, "rewards/rollout_reward_func/std": 17.202756881713867, "sampling/importance_sampling_ratio/max": 1.9445165395736694, "sampling/importance_sampling_ratio/mean": 1.0331871509552002, "sampling/importance_sampling_ratio/min": 0.4090438485145569, "sampling/sampling_logp_difference/max": 0.6792287826538086, "sampling/sampling_logp_difference/mean": 0.019970860332250595, "step": 449, "step_time": 37.926194604002376 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.008680555620230734, "clip_ratio/low_mean": 0.008986928150989115, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.01766748377121985, "entropy": 0.13738374412059784, "epoch": 0.009, "grad_norm": 1.1024707555770874, "kl": 0.357426542788744, "learning_rate": 9.999682605327648e-06, "loss": 0.0382, "step": 450, "step_time": 6.199961440999687 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1979.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 1895.5, "completions/mean_terminated_length": 1895.5, "completions/min_length": 1636.0, "completions/min_terminated_length": 1636.0, "entropy": 0.15979362651705742, "epoch": 0.00902, "frac_reward_zero_std": 0.0, "grad_norm": 1.3939236402511597, "kl": 0.31783993169665337, "learning_rate": 9.99968107018999e-06, "loss": -0.028, "num_tokens": 17019020.0, "reward": 19.81011199951172, "reward_std": 7.101508140563965, "rewards/rollout_reward_func/mean": 19.81011199951172, "rewards/rollout_reward_func/std": 21.871509552001953, "sampling/importance_sampling_ratio/max": 2.6384923458099365, "sampling/importance_sampling_ratio/mean": 1.0206067562103271, "sampling/importance_sampling_ratio/min": 2.8838329294011977e-11, "sampling/sampling_logp_difference/max": 24.231542587280273, "sampling/sampling_logp_difference/mean": 0.06482739746570587, "step": 451, "step_time": 37.558782669999346 }, { "clip_ratio/high_max": 0.003289473708719015, "clip_ratio/high_mean": 0.0016447368543595076, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033808479784056544, "entropy": 0.15879021026194096, "epoch": 0.00904, "grad_norm": 1.243542194366455, "kl": 0.31884971633553505, "learning_rate": 9.999679531348956e-06, "loss": -0.032, "step": 452, "step_time": 6.199976066998715 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "completions/clipped_ratio": 0.0, "completions/max_length": 1983.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 1731.46875, "completions/mean_terminated_length": 1731.46875, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "entropy": 0.12962097860872746, "epoch": 0.00906, "frac_reward_zero_std": 0.375, "grad_norm": 1.4479097127914429, "kl": 0.382347758859396, "learning_rate": 9.999677988804544e-06, "loss": -0.056, "num_tokens": 17094633.0, "reward": 7.968657970428467, "reward_std": 5.176619529724121, "rewards/rollout_reward_func/mean": 7.968657970428467, "rewards/rollout_reward_func/std": 13.325113296508789, "sampling/importance_sampling_ratio/max": 1.7371830940246582, "sampling/importance_sampling_ratio/mean": 0.9786045551300049, "sampling/importance_sampling_ratio/min": 0.3828772306442261, "sampling/sampling_logp_difference/max": 0.9057672023773193, "sampling/sampling_logp_difference/mean": 0.021090377122163773, "step": 453, "step_time": 35.78325894400041 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.01215277798473835, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01736111135687679, "entropy": 0.1288425624370575, "epoch": 0.00908, "grad_norm": 0.9137100577354431, "kl": 0.39299850165843964, "learning_rate": 9.999676442556757e-06, "loss": -0.0591, "step": 454, "step_time": 6.22740626700579 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0017361111240461469, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006944444612599909, "completions/clipped_ratio": 0.0, "completions/max_length": 2023.0, "completions/max_terminated_length": 2023.0, "completions/mean_length": 1862.90625, "completions/mean_terminated_length": 1862.90625, "completions/min_length": 1430.0, "completions/min_terminated_length": 1430.0, "entropy": 0.10491521190851927, "epoch": 0.0091, "frac_reward_zero_std": 0.125, "grad_norm": 1.0899358987808228, "kl": 0.327092919498682, "learning_rate": 9.999674892605596e-06, "loss": -0.1829, "num_tokens": 17174752.0, "reward": 11.411552429199219, "reward_std": 6.536225318908691, "rewards/rollout_reward_func/mean": 11.411552429199219, "rewards/rollout_reward_func/std": 17.084299087524414, "sampling/importance_sampling_ratio/max": 1.9590842723846436, "sampling/importance_sampling_ratio/mean": 0.8986088037490845, "sampling/importance_sampling_ratio/min": 0.19634754955768585, "sampling/sampling_logp_difference/max": 1.189605712890625, "sampling/sampling_logp_difference/mean": 0.02513366937637329, "step": 455, "step_time": 36.65933866699925 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012152777868323028, "entropy": 0.10256014112383127, "epoch": 0.00912, "grad_norm": 1.0379598140716553, "kl": 0.3651005197316408, "learning_rate": 9.99967333895106e-06, "loss": -0.1862, "step": 456, "step_time": 6.2405210320011975 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0047222222201526165, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008194444584660232, "completions/clipped_ratio": 0.0, "completions/max_length": 2046.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1919.5, "completions/mean_terminated_length": 1919.5, "completions/min_length": 1692.0, "completions/min_terminated_length": 1692.0, "entropy": 0.1095434408634901, "epoch": 0.00914, "frac_reward_zero_std": 0.0, "grad_norm": 1.1724107265472412, "kl": 0.329721899703145, "learning_rate": 9.999671781593154e-06, "loss": 0.0069, "num_tokens": 17256864.0, "reward": 11.119063377380371, "reward_std": 8.48747730255127, "rewards/rollout_reward_func/mean": 11.119063377380371, "rewards/rollout_reward_func/std": 14.583742141723633, "sampling/importance_sampling_ratio/max": 1.7749106884002686, "sampling/importance_sampling_ratio/mean": 0.9545077085494995, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 22.036113739013672, "sampling/sampling_logp_difference/mean": 0.08479472994804382, "step": 457, "step_time": 38.51757043299949 }, { "clip_ratio/high_max": 0.009444444440305233, "clip_ratio/high_mean": 0.0047222222201526165, "clip_ratio/low_mean": 0.01340277784038335, "clip_ratio/low_min": 0.0069444444961845875, "clip_ratio/region_mean": 0.018125000409781933, "entropy": 0.10692847240716219, "epoch": 0.00916, "grad_norm": 1.1692982912063599, "kl": 0.3310940358787775, "learning_rate": 9.999670220531878e-06, "loss": 0.0021, "step": 458, "step_time": 6.407080308001241 }, { "clip_ratio/high_max": 0.010928362840786576, "clip_ratio/high_mean": 0.005464181420393288, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005464181420393288, "completions/clipped_ratio": 0.0, "completions/max_length": 1974.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1843.53125, "completions/mean_terminated_length": 1843.53125, "completions/min_length": 1070.0, "completions/min_terminated_length": 1070.0, "entropy": 0.10005591344088316, "epoch": 0.00918, "frac_reward_zero_std": 0.125, "grad_norm": 1.392657995223999, "kl": 0.6404759250581264, "learning_rate": 9.999668655767235e-06, "loss": 0.0567, "num_tokens": 17336639.0, "reward": 10.774712562561035, "reward_std": 5.9175920486450195, "rewards/rollout_reward_func/mean": 10.774712562561035, "rewards/rollout_reward_func/std": 13.042983055114746, "sampling/importance_sampling_ratio/max": 1.5456091165542603, "sampling/importance_sampling_ratio/mean": 0.9551990032196045, "sampling/importance_sampling_ratio/min": 4.766808692560631e-13, "sampling/sampling_logp_difference/max": 27.484663009643555, "sampling/sampling_logp_difference/mean": 0.06486958265304565, "step": 459, "step_time": 37.665269682000144 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0033808479784056544, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005116959102451801, "entropy": 0.09817711263895035, "epoch": 0.0092, "grad_norm": 1.214490532875061, "kl": 0.5807801727205515, "learning_rate": 9.999667087299225e-06, "loss": 0.0526, "step": 460, "step_time": 6.204574634999517 }, { "clip_ratio/high_max": 0.0034722222480922937, "clip_ratio/high_mean": 0.0017361111240461469, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017361111240461469, "completions/clipped_ratio": 0.0, "completions/max_length": 1991.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1865.46875, "completions/mean_terminated_length": 1865.46875, "completions/min_length": 1549.0, "completions/min_terminated_length": 1549.0, "entropy": 0.12156101502478123, "epoch": 0.00922, "frac_reward_zero_std": 0.125, "grad_norm": 1.4056850671768188, "kl": 0.5392452217638493, "learning_rate": 9.999665515127852e-06, "loss": -0.043, "num_tokens": 17416764.0, "reward": 11.62852668762207, "reward_std": 6.9695281982421875, "rewards/rollout_reward_func/mean": 11.62852668762207, "rewards/rollout_reward_func/std": 13.752988815307617, "sampling/importance_sampling_ratio/max": 2.6276750564575195, "sampling/importance_sampling_ratio/mean": 0.9863981604576111, "sampling/importance_sampling_ratio/min": 7.647822804733584e-13, "sampling/sampling_logp_difference/max": 26.564056396484375, "sampling/sampling_logp_difference/mean": 0.07334530353546143, "step": 461, "step_time": 39.81283560100019 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.010416666860692203, "clip_ratio/low_min": 0.0034722222480922937, "clip_ratio/region_mean": 0.015625000116415322, "entropy": 0.12390031665563583, "epoch": 0.00924, "grad_norm": 1.2167932987213135, "kl": 0.5474216639995575, "learning_rate": 9.999663939253113e-06, "loss": -0.0466, "step": 462, "step_time": 6.207449816998633 }, { "clip_ratio/high_max": 0.0069444444961845875, "clip_ratio/high_mean": 0.0034722222480922937, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 1980.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 1851.40625, "completions/mean_terminated_length": 1851.40625, "completions/min_length": 1767.0, "completions/min_terminated_length": 1767.0, "entropy": 0.1078771585598588, "epoch": 0.00926, "frac_reward_zero_std": 0.125, "grad_norm": 1.4674164056777954, "kl": 0.5507515668869019, "learning_rate": 9.999662359675012e-06, "loss": 0.0633, "num_tokens": 17496530.0, "reward": 14.070526123046875, "reward_std": 9.017099380493164, "rewards/rollout_reward_func/mean": 14.070526123046875, "rewards/rollout_reward_func/std": 21.767820358276367, "sampling/importance_sampling_ratio/max": 1.8225663900375366, "sampling/importance_sampling_ratio/mean": 0.9397376775741577, "sampling/importance_sampling_ratio/min": 0.2997850179672241, "sampling/sampling_logp_difference/max": 0.9843077659606934, "sampling/sampling_logp_difference/mean": 0.029502304270863533, "step": 463, "step_time": 35.89330491099827 }, { "clip_ratio/high_max": 0.01736111124046147, "clip_ratio/high_mean": 0.008680555620230734, "clip_ratio/low_mean": 0.008680555620230734, "clip_ratio/low_min": 0.0069444444961845875, "clip_ratio/region_mean": 0.01736111124046147, "entropy": 0.10700647812336683, "epoch": 0.00928, "grad_norm": 1.0905040502548218, "kl": 0.5623909011483192, "learning_rate": 9.999660776393551e-06, "loss": 0.0596, "step": 464, "step_time": 6.629168002002189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005208333372138441, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005208333372138441, "completions/clipped_ratio": 0.0, "completions/max_length": 1961.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 1759.375, "completions/mean_terminated_length": 1759.375, "completions/min_length": 1081.0, "completions/min_terminated_length": 1081.0, "entropy": 0.13902374915778637, "epoch": 0.0093, "frac_reward_zero_std": 0.0, "grad_norm": 1.728746771812439, "kl": 0.6727790012955666, "learning_rate": 9.999659189408732e-06, "loss": -0.0095, "num_tokens": 17573003.0, "reward": 2.6619279384613037, "reward_std": 5.373678207397461, "rewards/rollout_reward_func/mean": 2.6619279384613037, "rewards/rollout_reward_func/std": 12.139829635620117, "sampling/importance_sampling_ratio/max": 2.036257028579712, "sampling/importance_sampling_ratio/mean": 1.0116875171661377, "sampling/importance_sampling_ratio/min": 0.31144610047340393, "sampling/sampling_logp_difference/max": 1.2303881645202637, "sampling/sampling_logp_difference/mean": 0.02412721887230873, "step": 465, "step_time": 37.21715446900271 }, { "clip_ratio/high_max": 0.010850694496184587, "clip_ratio/high_mean": 0.005425347248092294, "clip_ratio/low_mean": 0.007291666814126074, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012717014062218368, "entropy": 0.13923931494355202, "epoch": 0.00932, "grad_norm": 1.3814212083816528, "kl": 0.5176205858588219, "learning_rate": 9.999657598720554e-06, "loss": -0.0149, "step": 466, "step_time": 6.155722220999451 }, { "clip_ratio/high_max": 0.010850694496184587, "clip_ratio/high_mean": 0.005425347248092294, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005425347248092294, "completions/clipped_ratio": 0.0, "completions/max_length": 2036.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 1842.5, "completions/mean_terminated_length": 1842.5, "completions/min_length": 1138.0, "completions/min_terminated_length": 1138.0, "entropy": 0.10431700479239225, "epoch": 0.00934, "frac_reward_zero_std": 0.25, "grad_norm": 1.7164621353149414, "kl": 0.38612409122288227, "learning_rate": 9.999656004329023e-06, "loss": -0.1128, "num_tokens": 17652584.0, "reward": 4.594000816345215, "reward_std": 4.123822212219238, "rewards/rollout_reward_func/mean": 4.594000816345215, "rewards/rollout_reward_func/std": 17.150468826293945, "sampling/importance_sampling_ratio/max": 1.9271286725997925, "sampling/importance_sampling_ratio/mean": 1.0217385292053223, "sampling/importance_sampling_ratio/min": 0.3627711236476898, "sampling/sampling_logp_difference/max": 1.0998306274414062, "sampling/sampling_logp_difference/mean": 0.016824834048748016, "step": 467, "step_time": 36.32474399700004 }, { "clip_ratio/high_max": 0.010416666744276881, "clip_ratio/high_mean": 0.005208333372138441, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01215277798473835, "entropy": 0.10505765955895185, "epoch": 0.00936, "grad_norm": 1.5758742094039917, "kl": 0.41707968339324, "learning_rate": 9.999654406234138e-06, "loss": -0.1151, "step": 468, "step_time": 6.322126661998482 } ], "logging_steps": 1.0, "max_steps": 100000, "num_input_tokens_seen": 17652584, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }