{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5172413793103449, "eval_steps": 500, "global_step": 210, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2948.0, "completions/max_terminated_length": 2948.0, "completions/mean_length": 957.265625, "completions/mean_terminated_length": 969.1905517578125, "completions/min_length": 206.0, "completions/min_terminated_length": 225.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7929632365703583, "epoch": 0.0024630541871921183, "frac_reward_zero_std": 0.0, "grad_norm": 0.007080809166602687, "kl": 0.0, "learning_rate": 0.0, "loss": -0.031547293066978455, "num_tokens": 146065.0, "reward": 0.73046875, "reward_std": 0.5586702823638916, "rewards/reward_func/mean": 0.08116319444444445, "rewards/reward_func/std": 0.0780774603287379, "sampling/importance_sampling_ratio/max": 2.999567985534668, "sampling/importance_sampling_ratio/mean": 0.951445460319519, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.061712265014648, "sampling/sampling_logp_difference/mean": 0.20331552624702454, "step": 1, "step_time": 187.38077754108235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2655.0, "completions/max_terminated_length": 2655.0, "completions/mean_length": 536.3125, "completions/mean_terminated_length": 519.1935424804688, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7466610074043274, "epoch": 0.0049261083743842365, "frac_reward_zero_std": 0.0, "grad_norm": 0.009326234328873431, "kl": 0.0, "learning_rate": 1e-05, "loss": -0.03247833997011185, "num_tokens": 258293.0, "reward": 0.75, "reward_std": 0.48591265082359314, "rewards/reward_func/mean": 0.08333333333333333, "rewards/reward_func/std": 0.06759786936971876, "sampling/importance_sampling_ratio/max": 2.996082305908203, "sampling/importance_sampling_ratio/mean": 0.9654816389083862, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.417738914489746, "sampling/sampling_logp_difference/mean": 0.17964275181293488, "step": 2, "step_time": 80.001450516982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3812.0, "completions/mean_length": 1312.59375, "completions/mean_terminated_length": 1249.2542724609375, "completions/min_length": 24.0, "completions/min_terminated_length": 268.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6912166476249695, "epoch": 0.007389162561576354, "frac_reward_zero_std": 0.0, "grad_norm": 0.0061682845082898985, "kl": 0.00023736981165711768, "learning_rate": 2e-05, "loss": -0.05372963100671768, "num_tokens": 430203.0, "reward": 0.6484375, "reward_std": 0.5988062620162964, "rewards/reward_func/mean": 0.0720486111111111, "rewards/reward_func/std": 0.08277501662572224, "sampling/importance_sampling_ratio/max": 2.998338222503662, "sampling/importance_sampling_ratio/mean": 0.9439487457275391, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.194351196289062, "sampling/sampling_logp_difference/mean": 0.20593464374542236, "step": 3, "step_time": 162.00668290187605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3417.0, "completions/mean_length": 986.9375, "completions/mean_terminated_length": 937.5873413085938, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6697156727313995, "epoch": 0.009852216748768473, "frac_reward_zero_std": 0.0, "grad_norm": 0.006919383894978601, "kl": 0.00022338035705615766, "learning_rate": 3e-05, "loss": -0.03669855743646622, "num_tokens": 578007.0, "reward": 0.71875, "reward_std": 0.4807814359664917, "rewards/reward_func/mean": 0.0798611111111111, "rewards/reward_func/std": 0.0677249828974406, "sampling/importance_sampling_ratio/max": 2.9946367740631104, "sampling/importance_sampling_ratio/mean": 0.9518617987632751, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.811179161071777, "sampling/sampling_logp_difference/mean": 0.1946113407611847, "step": 4, "step_time": 117.74331320659257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2739.0, "completions/mean_length": 937.640625, "completions/mean_terminated_length": 863.0491333007812, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7166256010532379, "epoch": 0.012315270935960592, "frac_reward_zero_std": 0.0, "grad_norm": 0.009349939603109444, "kl": 0.0002510682497813832, "learning_rate": 4e-05, "loss": -0.0009028250351548195, "num_tokens": 726288.0, "reward": 0.9375, "reward_std": 0.863731324672699, "rewards/reward_func/mean": 0.10416666666666667, "rewards/reward_func/std": 0.139545609553655, "sampling/importance_sampling_ratio/max": 2.9990389347076416, "sampling/importance_sampling_ratio/mean": 0.9520124197006226, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.812490463256836, "sampling/sampling_logp_difference/mean": 0.19966453313827515, "step": 5, "step_time": 158.4150711328257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2168.0, "completions/mean_length": 832.828125, "completions/mean_terminated_length": 673.7333984375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7161838710308075, "epoch": 0.014778325123152709, "frac_reward_zero_std": 0.0, "grad_norm": 0.006622132589767797, "kl": 0.00024209270850406028, "learning_rate": 5e-05, "loss": -0.05239104479551315, "num_tokens": 857573.0, "reward": 0.73828125, "reward_std": 0.5259716510772705, "rewards/reward_func/mean": 0.08203125, "rewards/reward_func/std": 0.07359342939323849, "sampling/importance_sampling_ratio/max": 2.9946517944335938, "sampling/importance_sampling_ratio/mean": 0.9595122933387756, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.93506908416748, "sampling/sampling_logp_difference/mean": 0.18810752034187317, "step": 6, "step_time": 184.07614227291197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 901.796875, "completions/mean_terminated_length": 795.3500366210938, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6692915558815002, "epoch": 0.017241379310344827, "frac_reward_zero_std": 0.0, "grad_norm": 0.006483913581931664, "kl": 0.000280275329714641, "learning_rate": 4.999995293306428e-05, "loss": -0.041869841516017914, "num_tokens": 997496.0, "reward": 0.734375, "reward_std": 0.4997519254684448, "rewards/reward_func/mean": 0.08159722222222222, "rewards/reward_func/std": 0.06919422745704651, "sampling/importance_sampling_ratio/max": 2.991281509399414, "sampling/importance_sampling_ratio/mean": 0.9599131345748901, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.254425048828125, "sampling/sampling_logp_difference/mean": 0.18058966100215912, "step": 7, "step_time": 125.35071262088604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 1014.3125, "completions/mean_terminated_length": 926.475341796875, "completions/min_length": 209.0, "completions/min_terminated_length": 214.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6333879828453064, "epoch": 0.019704433497536946, "frac_reward_zero_std": 0.0, "grad_norm": 0.00658642427929086, "kl": 0.00029417510086204857, "learning_rate": 4.999981173243434e-05, "loss": -0.006957275792956352, "num_tokens": 1137180.0, "reward": 0.875, "reward_std": 0.6696362495422363, "rewards/reward_func/mean": 0.09722222222222222, "rewards/reward_func/std": 0.12841258777512443, "sampling/importance_sampling_ratio/max": 2.997835159301758, "sampling/importance_sampling_ratio/mean": 0.9580835103988647, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.393613815307617, "sampling/sampling_logp_difference/mean": 0.17572419345378876, "step": 8, "step_time": 119.33149944525212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2440.0, "completions/mean_length": 902.03125, "completions/mean_terminated_length": 834.5573120117188, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6275147348642349, "epoch": 0.022167487684729065, "frac_reward_zero_std": 0.0, "grad_norm": 0.006323226527766412, "kl": 0.0006948229565750808, "learning_rate": 4.999957639864185e-05, "loss": 0.011378014460206032, "num_tokens": 1284430.0, "reward": 0.828125, "reward_std": 0.47114139795303345, "rewards/reward_func/mean": 0.0920138888888889, "rewards/reward_func/std": 0.06752209034230974, "sampling/importance_sampling_ratio/max": 2.9957845211029053, "sampling/importance_sampling_ratio/mean": 0.9539204835891724, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.203951835632324, "sampling/sampling_logp_difference/mean": 0.18070682883262634, "step": 9, "step_time": 165.76451331260614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 1058.421875, "completions/mean_terminated_length": 1003.590087890625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6733803153038025, "epoch": 0.024630541871921183, "frac_reward_zero_std": 0.0, "grad_norm": 0.004642930827864921, "kl": 0.0006831231439718977, "learning_rate": 4.999924693257293e-05, "loss": -0.051798924803733826, "num_tokens": 1434985.0, "reward": 0.94921875, "reward_std": 0.4923616647720337, "rewards/reward_func/mean": 0.10546875, "rewards/reward_func/std": 0.07276467482248943, "sampling/importance_sampling_ratio/max": 2.998112916946411, "sampling/importance_sampling_ratio/mean": 0.9511500597000122, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.552619934082031, "sampling/sampling_logp_difference/mean": 0.19445407390594482, "step": 10, "step_time": 176.47617132030427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3757.0, "completions/max_terminated_length": 3757.0, "completions/mean_length": 764.625, "completions/mean_terminated_length": 759.71435546875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7308266162872314, "epoch": 0.027093596059113302, "frac_reward_zero_std": 0.0, "grad_norm": 0.014920178186134305, "kl": 0.010335339815355837, "learning_rate": 4.9998823335468127e-05, "loss": -0.028729835525155067, "num_tokens": 1573361.0, "reward": 0.83984375, "reward_std": 0.46583086252212524, "rewards/reward_func/mean": 0.09331597222222222, "rewards/reward_func/std": 0.06697534686989254, "sampling/importance_sampling_ratio/max": 2.998915195465088, "sampling/importance_sampling_ratio/mean": 0.9559616446495056, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.61943531036377, "sampling/sampling_logp_difference/mean": 0.19377702474594116, "step": 11, "step_time": 111.2789166229777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3448.0, "completions/mean_length": 950.359375, "completions/mean_terminated_length": 900.4286499023438, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6989224255084991, "epoch": 0.029556650246305417, "frac_reward_zero_std": 0.25, "grad_norm": 0.0055010500205562194, "kl": 0.0006455547700170428, "learning_rate": 4.9998305608922444e-05, "loss": 0.006468920968472958, "num_tokens": 1726488.0, "reward": 0.98828125, "reward_std": 0.425920307636261, "rewards/reward_func/mean": 0.10980902777777778, "rewards/reward_func/std": 0.06307367483774821, "sampling/importance_sampling_ratio/max": 2.995755672454834, "sampling/importance_sampling_ratio/mean": 0.9539859294891357, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.935945510864258, "sampling/sampling_logp_difference/mean": 0.19343537092208862, "step": 12, "step_time": 143.94482827885076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3894.0, "completions/mean_length": 1003.15625, "completions/mean_terminated_length": 963.758056640625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6788552850484848, "epoch": 0.03201970443349754, "frac_reward_zero_std": 0.25, "grad_norm": 0.004715533017209702, "kl": 0.0009007068583741784, "learning_rate": 4.99976937548853e-05, "loss": 0.02753077633678913, "num_tokens": 1874386.0, "reward": 0.9296875, "reward_std": 0.31644338369369507, "rewards/reward_func/mean": 0.1032986111111111, "rewards/reward_func/std": 0.0473594069480896, "sampling/importance_sampling_ratio/max": 2.998652696609497, "sampling/importance_sampling_ratio/mean": 0.9536670446395874, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.655500411987305, "sampling/sampling_logp_difference/mean": 0.1883814036846161, "step": 13, "step_time": 118.49153742892668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1714.0, "completions/mean_length": 1036.515625, "completions/mean_terminated_length": 777.2373046875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7153487950563431, "epoch": 0.034482758620689655, "frac_reward_zero_std": 0.25, "grad_norm": 0.0027371846864771214, "kl": 0.0010056617320515215, "learning_rate": 4.999698777566055e-05, "loss": -0.013990852981805801, "num_tokens": 2045299.0, "reward": 1.00390625, "reward_std": 0.38186100125312805, "rewards/reward_func/mean": 0.1115451388888889, "rewards/reward_func/std": 0.05746811628341675, "sampling/importance_sampling_ratio/max": 2.99708890914917, "sampling/importance_sampling_ratio/mean": 0.945040225982666, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.899123191833496, "sampling/sampling_logp_difference/mean": 0.21787062287330627, "step": 14, "step_time": 145.6305006260518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 1146.296875, "completions/mean_terminated_length": 1105.54833984375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6484042406082153, "epoch": 0.03694581280788178, "frac_reward_zero_std": 0.0, "grad_norm": 0.0035634324528942235, "kl": 0.0010466112871654332, "learning_rate": 4.9996187673906445e-05, "loss": -0.03319694846868515, "num_tokens": 2212214.0, "reward": 0.9921875, "reward_std": 0.46711021661758423, "rewards/reward_func/mean": 0.11024305555555555, "rewards/reward_func/std": 0.06887222660912408, "sampling/importance_sampling_ratio/max": 2.9985616207122803, "sampling/importance_sampling_ratio/mean": 0.9446097612380981, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 19.617145538330078, "sampling/sampling_logp_difference/mean": 0.2042129635810852, "step": 15, "step_time": 184.45763081498444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3909.0, "completions/mean_length": 1403.515625, "completions/mean_terminated_length": 1360.77783203125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6306434720754623, "epoch": 0.03940886699507389, "frac_reward_zero_std": 0.25, "grad_norm": 0.0020010963111962766, "kl": 0.0005926049634581432, "learning_rate": 4.9995293452635664e-05, "loss": -0.016749005764722824, "num_tokens": 2384055.0, "reward": 1.015625, "reward_std": 0.3615305721759796, "rewards/reward_func/mean": 0.11284722222222222, "rewards/reward_func/std": 0.05430084632502662, "sampling/importance_sampling_ratio/max": 2.9997754096984863, "sampling/importance_sampling_ratio/mean": 0.953073263168335, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.554804801940918, "sampling/sampling_logp_difference/mean": 0.18350985646247864, "step": 16, "step_time": 128.69480849499814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3903.0, "completions/mean_length": 1115.609375, "completions/mean_terminated_length": 1033.80322265625, "completions/min_length": 145.0, "completions/min_terminated_length": 178.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7165143638849258, "epoch": 0.04187192118226601, "frac_reward_zero_std": 0.0, "grad_norm": 0.002698123496280166, "kl": 0.0011966502643190324, "learning_rate": 4.999430511521525e-05, "loss": -0.01774909719824791, "num_tokens": 2534526.0, "reward": 1.12109375, "reward_std": 0.45423969626426697, "rewards/reward_func/mean": 0.12456597222222222, "rewards/reward_func/std": 0.06652174724472894, "sampling/importance_sampling_ratio/max": 2.9969332218170166, "sampling/importance_sampling_ratio/mean": 0.945965588092804, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.221734046936035, "sampling/sampling_logp_difference/mean": 0.207554429769516, "step": 17, "step_time": 153.94046954950318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2784.0, "completions/mean_length": 1181.28125, "completions/mean_terminated_length": 986.9667358398438, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7423214614391327, "epoch": 0.04433497536945813, "frac_reward_zero_std": 0.5, "grad_norm": 0.0023429990820167043, "kl": 0.0010119961225427687, "learning_rate": 4.999322266536666e-05, "loss": -0.009780586697161198, "num_tokens": 2689040.0, "reward": 1.078125, "reward_std": 0.39559829235076904, "rewards/reward_func/mean": 0.11979166666666667, "rewards/reward_func/std": 0.05736829009321001, "sampling/importance_sampling_ratio/max": 2.9975063800811768, "sampling/importance_sampling_ratio/mean": 0.9511388540267944, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.217132568359375, "sampling/sampling_logp_difference/mean": 0.20317873358726501, "step": 18, "step_time": 148.3335123800207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4076.0, "completions/mean_length": 1244.03125, "completions/mean_terminated_length": 1152.0322265625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6846257448196411, "epoch": 0.046798029556650245, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024025660850242043, "kl": 0.0009544179702061228, "learning_rate": 4.9992046107165705e-05, "loss": 0.0017357771284878254, "num_tokens": 2861074.0, "reward": 0.98828125, "reward_std": 0.24164485931396484, "rewards/reward_func/mean": 0.10980902777777778, "rewards/reward_func/std": 0.039622714122136436, "sampling/importance_sampling_ratio/max": 2.997622013092041, "sampling/importance_sampling_ratio/mean": 0.947391152381897, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.733199119567871, "sampling/sampling_logp_difference/mean": 0.2056819498538971, "step": 19, "step_time": 136.16764776594937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3447.0, "completions/mean_length": 1302.53125, "completions/mean_terminated_length": 1165.1474609375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7186716794967651, "epoch": 0.04926108374384237, "frac_reward_zero_std": 0.0, "grad_norm": 0.000971565986448778, "kl": 0.0007161840621847659, "learning_rate": 4.999077544504252e-05, "loss": -0.009501606225967407, "num_tokens": 3029444.0, "reward": 1.01953125, "reward_std": 0.2284553349018097, "rewards/reward_func/mean": 0.11328125, "rewards/reward_func/std": 0.035912372171878815, "sampling/importance_sampling_ratio/max": 2.9968159198760986, "sampling/importance_sampling_ratio/mean": 0.9446607828140259, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.439801216125488, "sampling/sampling_logp_difference/mean": 0.21503275632858276, "step": 20, "step_time": 122.81685300194658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2639.0, "completions/mean_length": 968.984375, "completions/mean_terminated_length": 865.8196411132812, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7783543467521667, "epoch": 0.05172413793103448, "frac_reward_zero_std": 0.25, "grad_norm": 0.0038032007547465566, "kl": 0.017548598028952256, "learning_rate": 4.998941068378163e-05, "loss": 0.0010344665497541428, "num_tokens": 3175379.0, "reward": 1.1796875, "reward_std": 0.46604710817337036, "rewards/reward_func/mean": 0.1310763888888889, "rewards/reward_func/std": 0.06429070068730248, "sampling/importance_sampling_ratio/max": 2.9981436729431152, "sampling/importance_sampling_ratio/mean": 0.9482499361038208, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.149746894836426, "sampling/sampling_logp_difference/mean": 0.20681576430797577, "step": 21, "step_time": 130.43352678511292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3854.0, "completions/mean_length": 975.015625, "completions/mean_terminated_length": 875.1966552734375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "degenerate_groups_filtered": 0.0, "entropy": 0.750398576259613, "epoch": 0.054187192118226604, "frac_reward_zero_std": 0.5, "grad_norm": 0.000523910546696439, "kl": 0.001042805059114471, "learning_rate": 4.998795182852183e-05, "loss": -0.008543891832232475, "num_tokens": 3323732.0, "reward": 1.125, "reward_std": 0.35912150144577026, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.049859102401468486, "sampling/importance_sampling_ratio/max": 2.9976632595062256, "sampling/importance_sampling_ratio/mean": 0.956694483757019, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.91028881072998, "sampling/sampling_logp_difference/mean": 0.19543185830116272, "step": 22, "step_time": 125.63274311483838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3903.0, "completions/mean_length": 1227.0625, "completions/mean_terminated_length": 1085.9671630859375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6305443197488785, "epoch": 0.05665024630541872, "frac_reward_zero_std": 0.5, "grad_norm": 0.004696302820715364, "kl": 0.0008546650788048282, "learning_rate": 4.998639888475621e-05, "loss": 0.04576673358678818, "num_tokens": 3490456.0, "reward": 1.26171875, "reward_std": 0.7379046678543091, "rewards/reward_func/mean": 0.1401909722222222, "rewards/reward_func/std": 0.11325099567572276, "sampling/importance_sampling_ratio/max": 2.999181032180786, "sampling/importance_sampling_ratio/mean": 0.9515511989593506, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.505047798156738, "sampling/sampling_logp_difference/mean": 0.19298961758613586, "step": 23, "step_time": 133.18944385577925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3573.0, "completions/mean_length": 1265.4375, "completions/mean_terminated_length": 1174.1290283203125, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6693638265132904, "epoch": 0.059113300492610835, "frac_reward_zero_std": 0.25, "grad_norm": 0.0013212910135926548, "kl": 0.0009473708487348631, "learning_rate": 4.998475185833219e-05, "loss": 0.008236411958932877, "num_tokens": 3657892.0, "reward": 1.03515625, "reward_std": 0.2741045653820038, "rewards/reward_func/mean": 0.1150173611111111, "rewards/reward_func/std": 0.039349371360407934, "sampling/importance_sampling_ratio/max": 2.9955687522888184, "sampling/importance_sampling_ratio/mean": 0.9495745897293091, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.479448318481445, "sampling/sampling_logp_difference/mean": 0.1977420151233673, "step": 24, "step_time": 128.52692431304604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2757.0, "completions/mean_length": 1035.78125, "completions/mean_terminated_length": 988.5806274414062, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "degenerate_groups_filtered": 1.0, "entropy": 0.673497810959816, "epoch": 0.06157635467980296, "frac_reward_zero_std": 0.25, "grad_norm": 0.0025486905634899714, "kl": 0.0008044984133448452, "learning_rate": 4.9983010755451386e-05, "loss": -0.005573897622525692, "num_tokens": 3804278.0, "reward": 1.1796875, "reward_std": 0.46604710817337036, "rewards/reward_func/mean": 0.1310763888888889, "rewards/reward_func/std": 0.06429070068730248, "sampling/importance_sampling_ratio/max": 2.997753620147705, "sampling/importance_sampling_ratio/mean": 0.9530193209648132, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.126824378967285, "sampling/sampling_logp_difference/mean": 0.19949191808700562, "step": 25, "step_time": 125.30965802492574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2515.0, "completions/max_terminated_length": 2515.0, "completions/mean_length": 997.15625, "completions/mean_terminated_length": 997.15625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6242654323577881, "epoch": 0.06403940886699508, "frac_reward_zero_std": 0.75, "grad_norm": 0.0006874978805650674, "kl": 0.0009008395281853154, "learning_rate": 4.998117558266968e-05, "loss": 0.003014655550941825, "num_tokens": 3958688.0, "reward": 1.04296875, "reward_std": 0.19697457551956177, "rewards/reward_func/mean": 0.11588541666666667, "rewards/reward_func/std": 0.027143559522098966, "sampling/importance_sampling_ratio/max": 2.9919300079345703, "sampling/importance_sampling_ratio/mean": 0.9529856443405151, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.685388565063477, "sampling/sampling_logp_difference/mean": 0.1835816651582718, "step": 26, "step_time": 87.82440122170374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4042.0, "completions/mean_length": 1417.03125, "completions/mean_terminated_length": 1285.278564453125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6958920955657959, "epoch": 0.0665024630541872, "frac_reward_zero_std": 0.25, "grad_norm": 0.00867741418581629, "kl": 0.000814533414086327, "learning_rate": 4.9979246346897136e-05, "loss": -0.02824692241847515, "num_tokens": 4136322.0, "reward": 1.203125, "reward_std": 0.6375719904899597, "rewards/reward_func/mean": 0.13368055555555555, "rewards/reward_func/std": 0.11306073599391514, "sampling/importance_sampling_ratio/max": 2.999945640563965, "sampling/importance_sampling_ratio/mean": 0.943295955657959, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 23.382047653198242, "sampling/sampling_logp_difference/mean": 0.21225795149803162, "step": 27, "step_time": 158.52877362072468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3286.0, "completions/mean_length": 1063.046875, "completions/mean_terminated_length": 965.2096557617188, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "degenerate_groups_filtered": 0.0, "entropy": 0.73707315325737, "epoch": 0.06896551724137931, "frac_reward_zero_std": 0.25, "grad_norm": 0.0006812576623849655, "kl": 0.0010022705682786182, "learning_rate": 4.997722305539802e-05, "loss": -0.0040407185442745686, "num_tokens": 4287797.0, "reward": 1.125, "reward_std": 0.35073620080947876, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.04570846714907222, "sampling/importance_sampling_ratio/max": 2.9977564811706543, "sampling/importance_sampling_ratio/mean": 0.9494884014129639, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.191810607910156, "sampling/sampling_logp_difference/mean": 0.2048904001712799, "step": 28, "step_time": 118.83388412673958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3365.0, "completions/mean_length": 1107.609375, "completions/mean_terminated_length": 960.6392822265625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7131416946649551, "epoch": 0.07142857142857142, "frac_reward_zero_std": 0.0, "grad_norm": 0.0022908098026776604, "kl": 0.0011166162439621985, "learning_rate": 4.997510571579074e-05, "loss": 0.00837898999452591, "num_tokens": 4453148.0, "reward": 1.08984375, "reward_std": 0.3628786504268646, "rewards/reward_func/mean": 0.12109375, "rewards/reward_func/std": 0.05258376399676005, "sampling/importance_sampling_ratio/max": 2.9968960285186768, "sampling/importance_sampling_ratio/mean": 0.9544211626052856, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.943284034729004, "sampling/sampling_logp_difference/mean": 0.1991732120513916, "step": 29, "step_time": 143.43660160107538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 1034.375, "completions/mean_terminated_length": 985.77783203125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "degenerate_groups_filtered": 1.0, "entropy": 0.729461207985878, "epoch": 0.07389162561576355, "frac_reward_zero_std": 0.75, "grad_norm": 0.005696858334664288, "kl": 0.0011717368324752897, "learning_rate": 4.997289433604783e-05, "loss": 0.02414151094853878, "num_tokens": 4621172.0, "reward": 1.2578125, "reward_std": 0.7439821362495422, "rewards/reward_func/mean": 0.13975694444444445, "rewards/reward_func/std": 0.11646586159865062, "sampling/importance_sampling_ratio/max": 2.9982309341430664, "sampling/importance_sampling_ratio/mean": 0.9456791877746582, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.812464714050293, "sampling/sampling_logp_difference/mean": 0.21633002161979675, "step": 30, "step_time": 128.25970319425687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3453.0, "completions/mean_length": 945.78125, "completions/mean_terminated_length": 844.1612548828125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7076129615306854, "epoch": 0.07635467980295567, "frac_reward_zero_std": 0.5, "grad_norm": 0.0005820405360045273, "kl": 0.0008675850986037403, "learning_rate": 4.997058892449591e-05, "loss": -0.0038843925576657057, "num_tokens": 4768086.0, "reward": 1.1171875, "reward_std": 0.3391420841217041, "rewards/reward_func/mean": 0.12413194444444445, "rewards/reward_func/std": 0.04190837426318063, "sampling/importance_sampling_ratio/max": 2.9953014850616455, "sampling/importance_sampling_ratio/mean": 0.9568973779678345, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.485434532165527, "sampling/sampling_logp_difference/mean": 0.18533028662204742, "step": 31, "step_time": 115.76905426895246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4024.0, "completions/max_terminated_length": 4024.0, "completions/mean_length": 693.03125, "completions/mean_terminated_length": 699.3175048828125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5862796604633331, "epoch": 0.07881773399014778, "frac_reward_zero_std": 0.5, "grad_norm": 0.005239994985586592, "kl": 0.0010869551915675402, "learning_rate": 4.99681894898157e-05, "loss": -0.00011015844211215153, "num_tokens": 4896296.0, "reward": 1.10546875, "reward_std": 0.4427652359008789, "rewards/reward_func/mean": 0.1228298611111111, "rewards/reward_func/std": 0.07224722537729475, "sampling/importance_sampling_ratio/max": 2.999147653579712, "sampling/importance_sampling_ratio/mean": 0.9641219973564148, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.479804992675781, "sampling/sampling_logp_difference/mean": 0.16228708624839783, "step": 32, "step_time": 129.93058712827042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2573.0, "completions/mean_length": 1020.71875, "completions/mean_terminated_length": 864.0167236328125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "degenerate_groups_filtered": 0.0, "entropy": 0.684396505355835, "epoch": 0.0812807881773399, "frac_reward_zero_std": 0.75, "grad_norm": 0.0023055066573911117, "kl": 0.0011950426560360938, "learning_rate": 4.99656960410419e-05, "loss": 0.01441088318824768, "num_tokens": 5051142.0, "reward": 1.01953125, "reward_std": 0.20071640610694885, "rewards/reward_func/mean": 0.11328125, "rewards/reward_func/std": 0.02990201198392444, "sampling/importance_sampling_ratio/max": 2.998185396194458, "sampling/importance_sampling_ratio/mean": 0.9508187770843506, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.62383270263672, "sampling/sampling_logp_difference/mean": 0.20294925570487976, "step": 33, "step_time": 124.96260152198374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3760.0, "completions/mean_length": 1340.171875, "completions/mean_terminated_length": 1233.1500244140625, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7469504326581955, "epoch": 0.08374384236453201, "frac_reward_zero_std": 0.75, "grad_norm": 0.005973168634825336, "kl": 0.0012111573305446655, "learning_rate": 4.9963108587563226e-05, "loss": 0.012999728322029114, "num_tokens": 5232177.0, "reward": 1.125, "reward_std": 0.34503278136253357, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.0486740552716785, "sampling/importance_sampling_ratio/max": 2.9891550540924072, "sampling/importance_sampling_ratio/mean": 0.9441696405410767, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.990606307983398, "sampling/sampling_logp_difference/mean": 0.21126891672611237, "step": 34, "step_time": 194.91840827674605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3017.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 721.109375, "completions/mean_terminated_length": 721.109375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "degenerate_groups_filtered": 0.0, "entropy": 0.574622169137001, "epoch": 0.08620689655172414, "frac_reward_zero_std": 0.5, "grad_norm": 0.002111501240666468, "kl": 0.0016787934000603855, "learning_rate": 4.996042713912238e-05, "loss": 0.03297015279531479, "num_tokens": 5364056.0, "reward": 1.0390625, "reward_std": 0.27174752950668335, "rewards/reward_func/mean": 0.1154513888888889, "rewards/reward_func/std": 0.038944005138344236, "sampling/importance_sampling_ratio/max": 2.992124080657959, "sampling/importance_sampling_ratio/mean": 0.9650247693061829, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.683228492736816, "sampling/sampling_logp_difference/mean": 0.15935879945755005, "step": 35, "step_time": 88.56324595375918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2589.0, "completions/mean_length": 758.765625, "completions/mean_terminated_length": 651.1128540039062, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6567760854959488, "epoch": 0.08866995073891626, "frac_reward_zero_std": 0.5, "grad_norm": 0.0008471869813655381, "kl": 0.0014283077616710216, "learning_rate": 4.995765170581595e-05, "loss": -0.005755479913204908, "num_tokens": 5490809.0, "reward": 1.046875, "reward_std": 0.25539806485176086, "rewards/reward_func/mean": 0.11631944444444445, "rewards/reward_func/std": 0.03388542061050733, "sampling/importance_sampling_ratio/max": 2.9882869720458984, "sampling/importance_sampling_ratio/mean": 0.9635649919509888, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.370882034301758, "sampling/sampling_logp_difference/mean": 0.16954657435417175, "step": 36, "step_time": 180.74639308801852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3187.0, "completions/mean_length": 1229.5, "completions/mean_terminated_length": 1088.5245361328125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7335378378629684, "epoch": 0.09113300492610837, "frac_reward_zero_std": 0.25, "grad_norm": 0.004487204028022401, "kl": 0.0041939748043660074, "learning_rate": 4.995478229809444e-05, "loss": 0.051113568246364594, "num_tokens": 5660169.0, "reward": 1.17578125, "reward_std": 0.7150309085845947, "rewards/reward_func/mean": 0.1306423611111111, "rewards/reward_func/std": 0.10943113929695553, "sampling/importance_sampling_ratio/max": 2.999530792236328, "sampling/importance_sampling_ratio/mean": 0.9497649669647217, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.937071800231934, "sampling/sampling_logp_difference/mean": 0.19850803911685944, "step": 37, "step_time": 135.75389188993722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3290.0, "completions/mean_length": 1152.984375, "completions/mean_terminated_length": 956.7833862304688, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7015815377235413, "epoch": 0.09359605911330049, "frac_reward_zero_std": 0.0, "grad_norm": 0.001804205516908683, "kl": 0.0018779803649522364, "learning_rate": 4.9951818926762174e-05, "loss": 0.005440297070890665, "num_tokens": 5837448.0, "reward": 1.046875, "reward_std": 0.329968124628067, "rewards/reward_func/mean": 0.11631944444444445, "rewards/reward_func/std": 0.04896413783232371, "sampling/importance_sampling_ratio/max": 2.999505043029785, "sampling/importance_sampling_ratio/mean": 0.9455422163009644, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.106887817382812, "sampling/sampling_logp_difference/mean": 0.2156333029270172, "step": 38, "step_time": 190.32753542577848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 941.140625, "completions/mean_terminated_length": 839.3709716796875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "degenerate_groups_filtered": 1.0, "entropy": 0.652764156460762, "epoch": 0.0960591133004926, "frac_reward_zero_std": 0.5, "grad_norm": 0.002124967192293937, "kl": 0.0012375268270261586, "learning_rate": 4.99487616029773e-05, "loss": -0.002344203647226095, "num_tokens": 5985777.0, "reward": 1.0546875, "reward_std": 0.3068941533565521, "rewards/reward_func/mean": 0.1171875, "rewards/reward_func/std": 0.044668421149253845, "sampling/importance_sampling_ratio/max": 2.999897003173828, "sampling/importance_sampling_ratio/mean": 0.9565781354904175, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.855171203613281, "sampling/sampling_logp_difference/mean": 0.18129438161849976, "step": 39, "step_time": 119.2174817638006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3406.0, "completions/mean_length": 925.71875, "completions/mean_terminated_length": 815.901611328125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6335614919662476, "epoch": 0.09852216748768473, "frac_reward_zero_std": 0.25, "grad_norm": 0.003501411960275662, "kl": 0.0029698072466999292, "learning_rate": 4.994561033825174e-05, "loss": 0.014414285309612751, "num_tokens": 6126367.0, "reward": 1.05859375, "reward_std": 0.3355349600315094, "rewards/reward_func/mean": 0.11762152777777778, "rewards/reward_func/std": 0.04818948441081577, "sampling/importance_sampling_ratio/max": 2.997518539428711, "sampling/importance_sampling_ratio/mean": 0.9628660082817078, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.993983268737793, "sampling/sampling_logp_difference/mean": 0.16173504292964935, "step": 40, "step_time": 131.86127198208123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4035.0, "completions/mean_length": 939.421875, "completions/mean_terminated_length": 893.1128540039062, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6877728551626205, "epoch": 0.10098522167487685, "frac_reward_zero_std": 0.5, "grad_norm": 0.000779252641297956, "kl": 0.0010590426390990615, "learning_rate": 4.99423651444511e-05, "loss": -0.000718248076736927, "num_tokens": 6272426.0, "reward": 1.12890625, "reward_std": 0.3590999245643616, "rewards/reward_func/mean": 0.1254340277777778, "rewards/reward_func/std": 0.044849217351939946, "sampling/importance_sampling_ratio/max": 2.997779369354248, "sampling/importance_sampling_ratio/mean": 0.9570625424385071, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.874835968017578, "sampling/sampling_logp_difference/mean": 0.18544772267341614, "step": 41, "step_time": 122.42929228907451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4095.0, "completions/mean_length": 1331.609375, "completions/mean_terminated_length": 1097.3389892578125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8357381373643875, "epoch": 0.10344827586206896, "frac_reward_zero_std": 0.5, "grad_norm": 0.000931053055557495, "kl": 0.0011489106400404125, "learning_rate": 4.993902603379471e-05, "loss": -0.019008180126547813, "num_tokens": 6443425.0, "reward": 1.09375, "reward_std": 0.3713446259498596, "rewards/reward_func/mean": 0.12152777777777778, "rewards/reward_func/std": 0.05229175090789795, "sampling/importance_sampling_ratio/max": 2.9981436729431152, "sampling/importance_sampling_ratio/mean": 0.9422184228897095, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.15316390991211, "sampling/sampling_logp_difference/mean": 0.22885316610336304, "step": 42, "step_time": 147.6898850449361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 977.359375, "completions/mean_terminated_length": 823.9835815429688, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7929796725511551, "epoch": 0.10591133004926108, "frac_reward_zero_std": 0.0, "grad_norm": 0.0010182376320781244, "kl": 0.002078307850752026, "learning_rate": 4.99355930188555e-05, "loss": -0.028957121074199677, "num_tokens": 6585592.0, "reward": 1.11328125, "reward_std": 0.4176884591579437, "rewards/reward_func/mean": 0.12369791666666667, "rewards/reward_func/std": 0.058884123961130776, "sampling/importance_sampling_ratio/max": 2.9990317821502686, "sampling/importance_sampling_ratio/mean": 0.9565147757530212, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.686131477355957, "sampling/sampling_logp_difference/mean": 0.19290070235729218, "step": 43, "step_time": 121.78506018640473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 1173.28125, "completions/mean_terminated_length": 1079.0, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7569215148687363, "epoch": 0.10837438423645321, "frac_reward_zero_std": 1.0, "grad_norm": 6.888280513936598e-05, "kl": 0.0013799294829368591, "learning_rate": 4.9932066112559975e-05, "loss": 1.0725540050771087e-05, "num_tokens": 6764346.0, "reward": 1.15625, "reward_std": 0.36596253514289856, "rewards/reward_func/mean": 0.1284722222222222, "rewards/reward_func/std": 0.04066250390476651, "sampling/importance_sampling_ratio/max": 2.994110107421875, "sampling/importance_sampling_ratio/mean": 0.9449698328971863, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.999063491821289, "sampling/sampling_logp_difference/mean": 0.22023621201515198, "step": 44, "step_time": 191.58390807081014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3576.0, "completions/mean_length": 766.96875, "completions/mean_terminated_length": 659.5806274414062, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "degenerate_groups_filtered": 1.0, "entropy": 0.769265666604042, "epoch": 0.11083743842364532, "frac_reward_zero_std": 0.25, "grad_norm": 0.003624385236832457, "kl": 0.0033626087242737412, "learning_rate": 4.992844532818821e-05, "loss": 0.011339845135807991, "num_tokens": 6889224.0, "reward": 1.2109375, "reward_std": 0.7443154454231262, "rewards/reward_func/mean": 0.1345486111111111, "rewards/reward_func/std": 0.11726083523697323, "sampling/importance_sampling_ratio/max": 2.9944660663604736, "sampling/importance_sampling_ratio/mean": 0.9590526819229126, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.834656715393066, "sampling/sampling_logp_difference/mean": 0.18281063437461853, "step": 45, "step_time": 118.58491391129792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3434.0, "completions/mean_length": 1011.671875, "completions/mean_terminated_length": 859.9835815429688, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7145812660455704, "epoch": 0.11330049261083744, "frac_reward_zero_std": 0.75, "grad_norm": 0.0008810215585302653, "kl": 0.0015595734002999961, "learning_rate": 4.9924730679373735e-05, "loss": 0.010057137347757816, "num_tokens": 7041267.0, "reward": 1.18359375, "reward_std": 0.3965180516242981, "rewards/reward_func/mean": 0.13151041666666666, "rewards/reward_func/std": 0.047183099720213145, "sampling/importance_sampling_ratio/max": 2.9970216751098633, "sampling/importance_sampling_ratio/mean": 0.95362788438797, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.471074104309082, "sampling/sampling_logp_difference/mean": 0.19335752725601196, "step": 46, "step_time": 132.00769766583107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3317.0, "completions/max_terminated_length": 3317.0, "completions/mean_length": 970.09375, "completions/mean_terminated_length": 970.09375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7778498083353043, "epoch": 0.11576354679802955, "frac_reward_zero_std": 0.5, "grad_norm": 0.0061553118849422455, "kl": 0.0065233102650381625, "learning_rate": 4.992092218010351e-05, "loss": 0.036174844950437546, "num_tokens": 7188905.0, "reward": 1.21484375, "reward_std": 0.7635900974273682, "rewards/reward_func/mean": 0.1349826388888889, "rewards/reward_func/std": 0.12284477055072784, "sampling/importance_sampling_ratio/max": 2.9998605251312256, "sampling/importance_sampling_ratio/mean": 0.9465634822845459, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 19.34210205078125, "sampling/sampling_logp_difference/mean": 0.21574868261814117, "step": 47, "step_time": 99.85584617522545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3715.0, "completions/mean_length": 1071.703125, "completions/mean_terminated_length": 829.9661254882812, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7317904829978943, "epoch": 0.11822660098522167, "frac_reward_zero_std": 0.75, "grad_norm": 0.00015393564669603422, "kl": 0.0016383329930249602, "learning_rate": 4.991701984471789e-05, "loss": -0.0043869917280972, "num_tokens": 7348502.0, "reward": 1.15234375, "reward_std": 0.3689786195755005, "rewards/reward_func/mean": 0.12803819444444445, "rewards/reward_func/std": 0.04413472612698873, "sampling/importance_sampling_ratio/max": 2.9766175746917725, "sampling/importance_sampling_ratio/mean": 0.949319064617157, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.05463981628418, "sampling/sampling_logp_difference/mean": 0.2019147127866745, "step": 48, "step_time": 164.48299563932233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2109.0, "completions/mean_length": 776.296875, "completions/mean_terminated_length": 723.6032104492188, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6619300991296768, "epoch": 0.1206896551724138, "frac_reward_zero_std": 0.5, "grad_norm": 0.0016658057286295187, "kl": 0.001641769689740613, "learning_rate": 4.9913023687910575e-05, "loss": 0.005893378518521786, "num_tokens": 7474137.0, "reward": 1.12890625, "reward_std": 0.33627331256866455, "rewards/reward_func/mean": 0.1254340277777778, "rewards/reward_func/std": 0.044849217351939946, "sampling/importance_sampling_ratio/max": 2.999603748321533, "sampling/importance_sampling_ratio/mean": 0.9636247158050537, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.411972045898438, "sampling/sampling_logp_difference/mean": 0.17460203170776367, "step": 49, "step_time": 131.9607803169638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 1223.515625, "completions/mean_terminated_length": 1032.0167236328125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7410552501678467, "epoch": 0.12315270935960591, "frac_reward_zero_std": 0.25, "grad_norm": 0.002313620014665204, "kl": 0.0017931896727532148, "learning_rate": 4.990893372472849e-05, "loss": -0.016107279807329178, "num_tokens": 7647882.0, "reward": 1.09375, "reward_std": 0.41187721490859985, "rewards/reward_func/mean": 0.12152777777777778, "rewards/reward_func/std": 0.05926263497935401, "sampling/importance_sampling_ratio/max": 2.994967460632324, "sampling/importance_sampling_ratio/mean": 0.9434908628463745, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.531850814819336, "sampling/sampling_logp_difference/mean": 0.22060488164424896, "step": 50, "step_time": 151.6255073894281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2448.0, "completions/mean_length": 1066.484375, "completions/mean_terminated_length": 815.8103637695312, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "degenerate_groups_filtered": 1.0, "entropy": 0.771567702293396, "epoch": 0.12561576354679804, "frac_reward_zero_std": 0.75, "grad_norm": 0.0003253046604313312, "kl": 0.0013065032253507525, "learning_rate": 4.99047499705718e-05, "loss": -0.007344153709709644, "num_tokens": 7797529.0, "reward": 1.08203125, "reward_std": 0.3119787275791168, "rewards/reward_func/mean": 0.12022569444444445, "rewards/reward_func/std": 0.043059426049391426, "sampling/importance_sampling_ratio/max": 2.9988787174224854, "sampling/importance_sampling_ratio/mean": 0.9490600228309631, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.559198379516602, "sampling/sampling_logp_difference/mean": 0.20981940627098083, "step": 51, "step_time": 121.97355275088921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3566.0, "completions/mean_length": 1041.515625, "completions/mean_terminated_length": 942.9838256835938, "completions/min_length": 23.0, "completions/min_terminated_length": 23.0, "degenerate_groups_filtered": 1.0, "entropy": 0.670068770647049, "epoch": 0.12807881773399016, "frac_reward_zero_std": 0.5, "grad_norm": 0.008313600173241441, "kl": 0.006259864254388958, "learning_rate": 4.990047244119383e-05, "loss": -0.0173809751868248, "num_tokens": 7953674.0, "reward": 1.1015625, "reward_std": 0.781378984451294, "rewards/reward_func/mean": 0.12239583333333333, "rewards/reward_func/std": 0.10772978928354052, "sampling/importance_sampling_ratio/max": 2.999814748764038, "sampling/importance_sampling_ratio/mean": 0.9522294998168945, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.490184783935547, "sampling/sampling_logp_difference/mean": 0.18839803338050842, "step": 52, "step_time": 127.30351705593057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2632.0, "completions/mean_length": 903.265625, "completions/mean_terminated_length": 801.5573120117188, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6703495234251022, "epoch": 0.13054187192118227, "frac_reward_zero_std": 0.5, "grad_norm": 0.0007236686160723223, "kl": 0.001405259157763794, "learning_rate": 4.9896101152701e-05, "loss": -0.0052278656512498856, "num_tokens": 8088731.0, "reward": 1.16015625, "reward_std": 0.3892585337162018, "rewards/reward_func/mean": 0.12890625, "rewards/reward_func/std": 0.04816830199625757, "sampling/importance_sampling_ratio/max": 2.995981454849243, "sampling/importance_sampling_ratio/mean": 0.9509227275848389, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.751556396484375, "sampling/sampling_logp_difference/mean": 0.1959969401359558, "step": 53, "step_time": 127.24764031497762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3157.0, "completions/mean_length": 935.578125, "completions/mean_terminated_length": 780.1475219726562, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7959064245223999, "epoch": 0.1330049261083744, "frac_reward_zero_std": 0.75, "grad_norm": 0.0005867268570703819, "kl": 0.002575396472821012, "learning_rate": 4.9891636121552745e-05, "loss": 0.003615929512307048, "num_tokens": 8238144.0, "reward": 1.19921875, "reward_std": 0.40868473052978516, "rewards/reward_func/mean": 0.1332465277777778, "rewards/reward_func/std": 0.04852836661868625, "sampling/importance_sampling_ratio/max": 2.997250556945801, "sampling/importance_sampling_ratio/mean": 0.9515563249588013, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.81047534942627, "sampling/sampling_logp_difference/mean": 0.2022194117307663, "step": 54, "step_time": 131.37923206575215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4043.0, "completions/mean_length": 1119.9375, "completions/mean_terminated_length": 979.2373046875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7834322899580002, "epoch": 0.1354679802955665, "frac_reward_zero_std": 0.75, "grad_norm": 0.000250132466078681, "kl": 0.001461994950659573, "learning_rate": 4.988707736456151e-05, "loss": -0.0006330913747660816, "num_tokens": 8393452.0, "reward": 1.13671875, "reward_std": 0.3533560335636139, "rewards/reward_func/mean": 0.12630208333333334, "rewards/reward_func/std": 0.04240360524919298, "sampling/importance_sampling_ratio/max": 2.9995980262756348, "sampling/importance_sampling_ratio/mean": 0.9429638981819153, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.937179565429688, "sampling/sampling_logp_difference/mean": 0.22264911234378815, "step": 55, "step_time": 136.44824583176523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 1084.0625, "completions/mean_terminated_length": 1003.0654907226562, "completions/min_length": 1.0, "completions/min_terminated_length": 194.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6966681033372879, "epoch": 0.13793103448275862, "frac_reward_zero_std": 0.5, "grad_norm": 0.000776864777284884, "kl": 0.0010701149585656822, "learning_rate": 4.9882424898892635e-05, "loss": -0.0030254418961703777, "num_tokens": 8565104.0, "reward": 1.07421875, "reward_std": 0.32635459303855896, "rewards/reward_func/mean": 0.1193576388888889, "rewards/reward_func/std": 0.04690552916791704, "sampling/importance_sampling_ratio/max": 2.995293140411377, "sampling/importance_sampling_ratio/mean": 0.9448047876358032, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.740140914916992, "sampling/sampling_logp_difference/mean": 0.21466103196144104, "step": 56, "step_time": 169.7448099392932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3503.0, "completions/mean_length": 1082.09375, "completions/mean_terminated_length": 984.8709106445312, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7558799386024475, "epoch": 0.14039408866995073, "frac_reward_zero_std": 0.75, "grad_norm": 0.0006483312619414802, "kl": 0.0011281462502665818, "learning_rate": 4.987767874206428e-05, "loss": 0.005442922003567219, "num_tokens": 8719014.0, "reward": 1.16796875, "reward_std": 0.38331958651542664, "rewards/reward_func/mean": 0.12977430555555555, "rewards/reward_func/std": 0.04572268989351061, "sampling/importance_sampling_ratio/max": 2.9972448348999023, "sampling/importance_sampling_ratio/mean": 0.9498982429504395, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.618826866149902, "sampling/sampling_logp_difference/mean": 0.21131965517997742, "step": 57, "step_time": 175.06015671789646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2919.0, "completions/mean_length": 1007.796875, "completions/mean_terminated_length": 908.1773681640625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7677361369132996, "epoch": 0.14285714285714285, "frac_reward_zero_std": 0.25, "grad_norm": 0.008613100011594328, "kl": 0.0014947211020626128, "learning_rate": 4.987283891194743e-05, "loss": 0.11646324396133423, "num_tokens": 8858233.0, "reward": 1.19921875, "reward_std": 0.9302880764007568, "rewards/reward_func/mean": 0.1332465277777778, "rewards/reward_func/std": 0.13260910908381143, "sampling/importance_sampling_ratio/max": 2.9999563694000244, "sampling/importance_sampling_ratio/mean": 0.9526713490486145, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.495641708374023, "sampling/sampling_logp_difference/mean": 0.21066075563430786, "step": 58, "step_time": 172.44660172308795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3183.0, "completions/mean_length": 1208.90625, "completions/mean_terminated_length": 1115.774169921875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6357712894678116, "epoch": 0.14532019704433496, "frac_reward_zero_std": 0.75, "grad_norm": 0.0003580177247054395, "kl": 0.0012669183051912114, "learning_rate": 4.986790542676576e-05, "loss": -0.0009904210455715656, "num_tokens": 9021795.0, "reward": 1.08984375, "reward_std": 0.2966987192630768, "rewards/reward_func/mean": 0.12109375, "rewards/reward_func/std": 0.0361149807771047, "sampling/importance_sampling_ratio/max": 2.997753620147705, "sampling/importance_sampling_ratio/mean": 0.9547065496444702, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.183061599731445, "sampling/sampling_logp_difference/mean": 0.18249759078025818, "step": 59, "step_time": 127.31623220816255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1695.0, "completions/mean_length": 654.28125, "completions/mean_terminated_length": 543.258056640625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6762199103832245, "epoch": 0.1477832512315271, "frac_reward_zero_std": 0.5, "grad_norm": 0.002547014851253121, "kl": 0.0022673878411296755, "learning_rate": 4.986287830509558e-05, "loss": 0.004557657055556774, "num_tokens": 9153077.0, "reward": 1.0625, "reward_std": 0.2920915186405182, "rewards/reward_func/mean": 0.11805555555555555, "rewards/reward_func/std": 0.04098213298453225, "sampling/importance_sampling_ratio/max": 2.998140811920166, "sampling/importance_sampling_ratio/mean": 0.9566666483879089, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.62492847442627, "sampling/sampling_logp_difference/mean": 0.18817751109600067, "step": 60, "step_time": 122.69049222487956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3154.0, "completions/mean_length": 1090.25, "completions/mean_terminated_length": 1042.539794921875, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7119808942079544, "epoch": 0.15024630541871922, "frac_reward_zero_std": 0.25, "grad_norm": 0.0029762470152537818, "kl": 0.001506138069089502, "learning_rate": 4.985775756586581e-05, "loss": -0.013122936710715294, "num_tokens": 9308901.0, "reward": 1.03515625, "reward_std": 0.31455180048942566, "rewards/reward_func/mean": 0.1150173611111111, "rewards/reward_func/std": 0.04827603366639879, "sampling/importance_sampling_ratio/max": 2.997075319290161, "sampling/importance_sampling_ratio/mean": 0.9534555673599243, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.6150484085083, "sampling/sampling_logp_difference/mean": 0.19639095664024353, "step": 61, "step_time": 115.97744632000104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3658.0, "completions/mean_length": 1041.125, "completions/mean_terminated_length": 884.4667358398438, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "degenerate_groups_filtered": 1.0, "entropy": 0.681881383061409, "epoch": 0.15270935960591134, "frac_reward_zero_std": 1.0, "grad_norm": 5.3362222264957966e-05, "kl": 0.0011868184083141387, "learning_rate": 4.9852543228357835e-05, "loss": 8.767043254920281e-06, "num_tokens": 9467821.0, "reward": 1.046875, "reward_std": 0.2130420207977295, "rewards/reward_func/mean": 0.11631944444444445, "rewards/reward_func/std": 0.023671337299876742, "sampling/importance_sampling_ratio/max": 2.998835802078247, "sampling/importance_sampling_ratio/mean": 0.950027585029602, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.164039611816406, "sampling/sampling_logp_difference/mean": 0.1998741328716278, "step": 62, "step_time": 128.09420190914534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3469.0, "completions/max_terminated_length": 3469.0, "completions/mean_length": 897.0625, "completions/mean_terminated_length": 897.0625, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6641940772533417, "epoch": 0.15517241379310345, "frac_reward_zero_std": 0.75, "grad_norm": 0.0009879127765257943, "kl": 0.0019843199115712196, "learning_rate": 4.9847235312205484e-05, "loss": 0.004818388726562262, "num_tokens": 9608273.0, "reward": 1.06640625, "reward_std": 0.2790367007255554, "rewards/reward_func/mean": 0.11848958333333333, "rewards/reward_func/std": 0.035972247935003705, "sampling/importance_sampling_ratio/max": 2.998894453048706, "sampling/importance_sampling_ratio/mean": 0.9591097831726074, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.58657169342041, "sampling/sampling_logp_difference/mean": 0.172621488571167, "step": 63, "step_time": 126.96359702572227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3346.0, "completions/mean_length": 889.734375, "completions/mean_terminated_length": 838.84130859375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7291716188192368, "epoch": 0.15763546798029557, "frac_reward_zero_std": 0.25, "grad_norm": 0.005158850574939295, "kl": 0.0012882646697107702, "learning_rate": 4.984183383739496e-05, "loss": -0.0065579283982515335, "num_tokens": 9745936.0, "reward": 1.140625, "reward_std": 0.7904125452041626, "rewards/reward_func/mean": 0.1267361111111111, "rewards/reward_func/std": 0.12136938919623692, "sampling/importance_sampling_ratio/max": 2.9999992847442627, "sampling/importance_sampling_ratio/mean": 0.9599671363830566, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.868587493896484, "sampling/sampling_logp_difference/mean": 0.18142351508140564, "step": 64, "step_time": 145.68370983726345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3697.0, "completions/mean_length": 1619.484375, "completions/mean_terminated_length": 1454.3834228515625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7143156677484512, "epoch": 0.16009852216748768, "frac_reward_zero_std": 0.5, "grad_norm": 0.005760896054601062, "kl": 0.0011219466978218406, "learning_rate": 4.983633882426471e-05, "loss": 0.05116764456033707, "num_tokens": 9952287.0, "reward": 1.359375, "reward_std": 1.1152576208114624, "rewards/reward_func/mean": 0.15104166666666666, "rewards/reward_func/std": 0.1629431736138132, "sampling/importance_sampling_ratio/max": 2.9972517490386963, "sampling/importance_sampling_ratio/mean": 0.9454111456871033, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.621063232421875, "sampling/sampling_logp_difference/mean": 0.21347495913505554, "step": 65, "step_time": 186.1511984670069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 1117.59375, "completions/mean_terminated_length": 1052.774169921875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6963524371385574, "epoch": 0.1625615763546798, "frac_reward_zero_std": 0.5, "grad_norm": 0.001653339708279588, "kl": 0.0012707496352959424, "learning_rate": 4.983075029350542e-05, "loss": 0.01102248951792717, "num_tokens": 10104389.0, "reward": 1.09375, "reward_std": 0.33481812477111816, "rewards/reward_func/mean": 0.12152777777777778, "rewards/reward_func/std": 0.045880657931168876, "sampling/importance_sampling_ratio/max": 2.997091054916382, "sampling/importance_sampling_ratio/mean": 0.9539859294891357, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.87465476989746, "sampling/sampling_logp_difference/mean": 0.1939973533153534, "step": 66, "step_time": 123.42807900626212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3030.0, "completions/mean_length": 931.90625, "completions/mean_terminated_length": 834.1638793945312, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6701111942529678, "epoch": 0.16502463054187191, "frac_reward_zero_std": 1.0, "grad_norm": 8.6437440455173e-05, "kl": 0.001773051859345287, "learning_rate": 4.9825068266159894e-05, "loss": 1.2892927770735696e-05, "num_tokens": 10248559.0, "reward": 1.078125, "reward_std": 0.2704896926879883, "rewards/reward_func/mean": 0.11979166666666667, "rewards/reward_func/std": 0.030054413610034518, "sampling/importance_sampling_ratio/max": 2.997563600540161, "sampling/importance_sampling_ratio/mean": 0.9578502178192139, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.782278060913086, "sampling/sampling_logp_difference/mean": 0.18533891439437866, "step": 67, "step_time": 128.0075939442031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2079.0, "completions/mean_length": 931.625, "completions/mean_terminated_length": 824.7540283203125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7462280243635178, "epoch": 0.16748768472906403, "frac_reward_zero_std": 1.0, "grad_norm": 0.00013239457038290433, "kl": 0.0021935105323791504, "learning_rate": 4.981929276362298e-05, "loss": 1.589039311511442e-05, "num_tokens": 10389415.0, "reward": 1.109375, "reward_std": 0.3145764470100403, "rewards/reward_func/mean": 0.1232638888888889, "rewards/reward_func/std": 0.03495293855667114, "sampling/importance_sampling_ratio/max": 2.9994449615478516, "sampling/importance_sampling_ratio/mean": 0.9569523334503174, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.203958511352539, "sampling/sampling_logp_difference/mean": 0.19418185949325562, "step": 68, "step_time": 124.90443813405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3161.0, "completions/max_terminated_length": 3161.0, "completions/mean_length": 1039.765625, "completions/mean_terminated_length": 1039.765625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7052541077136993, "epoch": 0.16995073891625614, "frac_reward_zero_std": 0.75, "grad_norm": 0.0007086844449368942, "kl": 0.002509061130695045, "learning_rate": 4.981342380764149e-05, "loss": 0.006688619032502174, "num_tokens": 10550360.0, "reward": 1.10546875, "reward_std": 0.31749480962753296, "rewards/reward_func/mean": 0.1228298611111111, "rewards/reward_func/std": 0.038425160778893366, "sampling/importance_sampling_ratio/max": 2.998634099960327, "sampling/importance_sampling_ratio/mean": 0.9502047300338745, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.249970436096191, "sampling/sampling_logp_difference/mean": 0.20042559504508972, "step": 69, "step_time": 101.4746579460334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3996.0, "completions/max_terminated_length": 3996.0, "completions/mean_length": 928.03125, "completions/mean_terminated_length": 928.03125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7934342622756958, "epoch": 0.1724137931034483, "frac_reward_zero_std": 0.5, "grad_norm": 0.0009891554642842337, "kl": 0.0018119575106538832, "learning_rate": 4.980746142031414e-05, "loss": 0.007167218253016472, "num_tokens": 10698890.0, "reward": 1.1640625, "reward_std": 0.3651992976665497, "rewards/reward_func/mean": 0.1293402777777778, "rewards/reward_func/std": 0.04712180379364225, "sampling/importance_sampling_ratio/max": 2.998882293701172, "sampling/importance_sampling_ratio/mean": 0.9477431774139404, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.083467483520508, "sampling/sampling_logp_difference/mean": 0.2261689305305481, "step": 70, "step_time": 171.54467244585976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3915.0, "completions/mean_length": 1171.96875, "completions/mean_terminated_length": 1077.6451416015625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7315615117549896, "epoch": 0.1748768472906404, "frac_reward_zero_std": 0.5, "grad_norm": 0.00439498054004489, "kl": 0.0017650375666562468, "learning_rate": 4.980140562409141e-05, "loss": 0.009830066934227943, "num_tokens": 10869880.0, "reward": 1.17578125, "reward_std": 0.5130822062492371, "rewards/reward_func/mean": 0.1306423611111111, "rewards/reward_func/std": 0.09101471718814638, "sampling/importance_sampling_ratio/max": 2.9981822967529297, "sampling/importance_sampling_ratio/mean": 0.9450452923774719, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.777099609375, "sampling/sampling_logp_difference/mean": 0.22317034006118774, "step": 71, "step_time": 131.98447898984887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2886.0, "completions/mean_length": 1031.53125, "completions/mean_terminated_length": 982.888916015625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7074902355670929, "epoch": 0.17733990147783252, "frac_reward_zero_std": 0.5, "grad_norm": 0.002222973231863952, "kl": 0.0013612315233331174, "learning_rate": 4.979525644177554e-05, "loss": -0.005394880194216967, "num_tokens": 11026666.0, "reward": 1.15234375, "reward_std": 0.39994341135025024, "rewards/reward_func/mean": 0.12803819444444445, "rewards/reward_func/std": 0.05651323828432295, "sampling/importance_sampling_ratio/max": 2.9978277683258057, "sampling/importance_sampling_ratio/mean": 0.9466830492019653, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.806804656982422, "sampling/sampling_logp_difference/mean": 0.20748589932918549, "step": 72, "step_time": 130.5002006436698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3772.0, "completions/mean_length": 961.75, "completions/mean_terminated_length": 902.6451416015625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "degenerate_groups_filtered": 0.0, "entropy": 0.61832594871521, "epoch": 0.17980295566502463, "frac_reward_zero_std": 0.5, "grad_norm": 0.010038654463100299, "kl": 0.009182059322483838, "learning_rate": 4.978901389652039e-05, "loss": -0.05658572167158127, "num_tokens": 11175818.0, "reward": 1.18359375, "reward_std": 0.6889752745628357, "rewards/reward_func/mean": 0.13151041666666666, "rewards/reward_func/std": 0.10786960522333781, "sampling/importance_sampling_ratio/max": 2.996751070022583, "sampling/importance_sampling_ratio/mean": 0.9549329280853271, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.932666778564453, "sampling/sampling_logp_difference/mean": 0.17912691831588745, "step": 73, "step_time": 125.78719549998641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3745.0, "completions/max_terminated_length": 3745.0, "completions/mean_length": 952.8125, "completions/mean_terminated_length": 952.8125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7050873339176178, "epoch": 0.18226600985221675, "frac_reward_zero_std": 0.75, "grad_norm": 0.0005601993541825297, "kl": 0.0018221104983240366, "learning_rate": 4.978267801183133e-05, "loss": 7.19567178748548e-05, "num_tokens": 11323678.0, "reward": 1.16796875, "reward_std": 0.3728235363960266, "rewards/reward_func/mean": 0.12977430555555555, "rewards/reward_func/std": 0.04572268989351061, "sampling/importance_sampling_ratio/max": 2.9927818775177, "sampling/importance_sampling_ratio/mean": 0.952770471572876, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.401138305664062, "sampling/sampling_logp_difference/mean": 0.1930316686630249, "step": 74, "step_time": 110.4740979031194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2041.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 552.109375, "completions/mean_terminated_length": 538.6032104492188, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7133064270019531, "epoch": 0.18472906403940886, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011366120432484819, "kl": 0.0015789342869538814, "learning_rate": 4.977624881156524e-05, "loss": 1.5052077287691645e-05, "num_tokens": 11453589.0, "reward": 1.0625, "reward_std": 0.24397501349449158, "rewards/reward_func/mean": 0.11805555555555555, "rewards/reward_func/std": 0.027108336488405865, "sampling/importance_sampling_ratio/max": 2.990895986557007, "sampling/importance_sampling_ratio/mean": 0.9565259218215942, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.616744041442871, "sampling/sampling_logp_difference/mean": 0.1869342029094696, "step": 75, "step_time": 76.67042307183146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3307.0, "completions/mean_length": 1050.421875, "completions/mean_terminated_length": 952.1773681640625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6937734186649323, "epoch": 0.18719211822660098, "frac_reward_zero_std": 0.5, "grad_norm": 0.0016237223301883853, "kl": 0.0011773691221605986, "learning_rate": 4.976972631993033e-05, "loss": -0.0016344873001798987, "num_tokens": 11610176.0, "reward": 1.07421875, "reward_std": 0.3171039819717407, "rewards/reward_func/mean": 0.1193576388888889, "rewards/reward_func/std": 0.044041900171173945, "sampling/importance_sampling_ratio/max": 2.994739055633545, "sampling/importance_sampling_ratio/mean": 0.9488315582275391, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.017475128173828, "sampling/sampling_logp_difference/mean": 0.20064926147460938, "step": 76, "step_time": 226.78311094199307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2795.0, "completions/max_terminated_length": 2795.0, "completions/mean_length": 755.515625, "completions/mean_terminated_length": 755.515625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6844090074300766, "epoch": 0.1896551724137931, "frac_reward_zero_std": 0.5, "grad_norm": 0.002431004974290889, "kl": 0.002143903257092461, "learning_rate": 4.976311056148609e-05, "loss": -0.013195082545280457, "num_tokens": 11738369.0, "reward": 1.046875, "reward_std": 0.32082173228263855, "rewards/reward_func/mean": 0.11631944444444445, "rewards/reward_func/std": 0.0475527693827947, "sampling/importance_sampling_ratio/max": 2.9915335178375244, "sampling/importance_sampling_ratio/mean": 0.9584981799125671, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.93669319152832, "sampling/sampling_logp_difference/mean": 0.18049074709415436, "step": 77, "step_time": 88.75774584687315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3728.0, "completions/mean_length": 982.546875, "completions/mean_terminated_length": 882.1128540039062, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8276418894529343, "epoch": 0.1921182266009852, "frac_reward_zero_std": 0.25, "grad_norm": 0.01783241687418567, "kl": 0.0020481182436924428, "learning_rate": 4.975640156114322e-05, "loss": -0.13759304583072662, "num_tokens": 11892212.0, "reward": 1.13671875, "reward_std": 0.5891372561454773, "rewards/reward_func/mean": 0.12630208333333334, "rewards/reward_func/std": 0.1024610847234726, "sampling/importance_sampling_ratio/max": 2.99920916557312, "sampling/importance_sampling_ratio/mean": 0.9454777240753174, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.856114387512207, "sampling/sampling_logp_difference/mean": 0.22521965205669403, "step": 78, "step_time": 155.03567869076505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2309.0, "completions/mean_length": 661.8125, "completions/mean_terminated_length": 607.3016357421875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7546624094247818, "epoch": 0.19458128078817735, "frac_reward_zero_std": 0.75, "grad_norm": 0.012855942487314536, "kl": 0.0011995464155916125, "learning_rate": 4.974959934416346e-05, "loss": -0.006040642969310284, "num_tokens": 12013368.0, "reward": 1.203125, "reward_std": 0.5957438349723816, "rewards/reward_func/mean": 0.13368055555555555, "rewards/reward_func/std": 0.09448693858252631, "sampling/importance_sampling_ratio/max": 2.9967188835144043, "sampling/importance_sampling_ratio/mean": 0.9574424624443054, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.747173309326172, "sampling/sampling_logp_difference/mean": 0.1902427226305008, "step": 79, "step_time": 108.63824260118417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3953.0, "completions/mean_length": 1435.046875, "completions/mean_terminated_length": 1342.6884765625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7154666781425476, "epoch": 0.19704433497536947, "frac_reward_zero_std": 0.25, "grad_norm": 0.008442218807283417, "kl": 0.0028500978223746642, "learning_rate": 4.9742703936159586e-05, "loss": -0.014930376783013344, "num_tokens": 12195467.0, "reward": 1.1640625, "reward_std": 0.5255736708641052, "rewards/reward_func/mean": 0.1293402777777778, "rewards/reward_func/std": 0.09521205723285675, "sampling/importance_sampling_ratio/max": 2.998208999633789, "sampling/importance_sampling_ratio/mean": 0.9411571025848389, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.294017791748047, "sampling/sampling_logp_difference/mean": 0.21197447180747986, "step": 80, "step_time": 132.8957766594831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3296.0, "completions/mean_length": 1159.109375, "completions/mean_terminated_length": 1064.3709716796875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7700304388999939, "epoch": 0.19950738916256158, "frac_reward_zero_std": 0.25, "grad_norm": 0.0076037617796647925, "kl": 0.0030132651154417545, "learning_rate": 4.973571536309525e-05, "loss": 0.0006418544799089432, "num_tokens": 12355842.0, "reward": 1.18359375, "reward_std": 0.6550205945968628, "rewards/reward_func/mean": 0.13151041666666666, "rewards/reward_func/std": 0.1200390938255522, "sampling/importance_sampling_ratio/max": 2.9922046661376953, "sampling/importance_sampling_ratio/mean": 0.9423102736473083, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.634392738342285, "sampling/sampling_logp_difference/mean": 0.22503802180290222, "step": 81, "step_time": 165.3830566899851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2805.0, "completions/mean_length": 1027.625, "completions/mean_terminated_length": 981.0967407226562, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6725837737321854, "epoch": 0.2019704433497537, "frac_reward_zero_std": 0.25, "grad_norm": 0.009061744134370203, "kl": 0.0016485043161083013, "learning_rate": 4.9728633651284914e-05, "loss": 0.013996231369674206, "num_tokens": 12524666.0, "reward": 1.18359375, "reward_std": 0.5794808268547058, "rewards/reward_func/mean": 0.13151041666666666, "rewards/reward_func/std": 0.10200054198503494, "sampling/importance_sampling_ratio/max": 2.993680953979492, "sampling/importance_sampling_ratio/mean": 0.9446038007736206, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 20.980178833007812, "sampling/sampling_logp_difference/mean": 0.21005243062973022, "step": 82, "step_time": 165.60002181283198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2151.0, "completions/max_terminated_length": 2151.0, "completions/mean_length": 755.8125, "completions/mean_terminated_length": 755.8125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7532096058130264, "epoch": 0.2044334975369458, "frac_reward_zero_std": 0.75, "grad_norm": 0.012740595538592081, "kl": 0.0015318515652325004, "learning_rate": 4.972145882739374e-05, "loss": -0.03307109698653221, "num_tokens": 12656094.0, "reward": 1.171875, "reward_std": 0.605652928352356, "rewards/reward_func/mean": 0.13020833333333334, "rewards/reward_func/std": 0.1064814825852712, "sampling/importance_sampling_ratio/max": 2.997760772705078, "sampling/importance_sampling_ratio/mean": 0.952104389667511, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.445816040039062, "sampling/sampling_logp_difference/mean": 0.20742540061473846, "step": 83, "step_time": 73.10571676073596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3916.0, "completions/mean_length": 1155.390625, "completions/mean_terminated_length": 1096.6773681640625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7081952691078186, "epoch": 0.20689655172413793, "frac_reward_zero_std": 0.5, "grad_norm": 0.001375808186987275, "kl": 0.001374059182126075, "learning_rate": 4.971419091843748e-05, "loss": -0.0016404204070568085, "num_tokens": 12838567.0, "reward": 1.12109375, "reward_std": 0.3726571798324585, "rewards/reward_func/mean": 0.12456597222222222, "rewards/reward_func/std": 0.05033052464326223, "sampling/importance_sampling_ratio/max": 2.9999070167541504, "sampling/importance_sampling_ratio/mean": 0.9335456490516663, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.315726280212402, "sampling/sampling_logp_difference/mean": 0.24390606582164764, "step": 84, "step_time": 163.8807848696597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2070.0, "completions/max_terminated_length": 2070.0, "completions/mean_length": 734.703125, "completions/mean_terminated_length": 734.703125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6912446320056915, "epoch": 0.20935960591133004, "frac_reward_zero_std": 0.25, "grad_norm": 0.001448416309828472, "kl": 0.0035722781904041767, "learning_rate": 4.970682995178238e-05, "loss": -0.003674007486552, "num_tokens": 12973476.0, "reward": 1.109375, "reward_std": 0.3329611122608185, "rewards/reward_func/mean": 0.1232638888888889, "rewards/reward_func/std": 0.04381412226292822, "sampling/importance_sampling_ratio/max": 2.993730306625366, "sampling/importance_sampling_ratio/mean": 0.9494987726211548, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.641794204711914, "sampling/sampling_logp_difference/mean": 0.2067471742630005, "step": 85, "step_time": 77.86746055702679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2574.0, "completions/mean_length": 1015.203125, "completions/mean_terminated_length": 867.36669921875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7194895893335342, "epoch": 0.21182266009852216, "frac_reward_zero_std": 0.5, "grad_norm": 0.003522987529729015, "kl": 0.0016897321911528707, "learning_rate": 4.9699375955145114e-05, "loss": 0.055119533091783524, "num_tokens": 13116593.0, "reward": 1.15234375, "reward_std": 0.6675844788551331, "rewards/reward_func/mean": 0.12803819444444445, "rewards/reward_func/std": 0.09167053633266026, "sampling/importance_sampling_ratio/max": 2.997987747192383, "sampling/importance_sampling_ratio/mean": 0.9551426768302917, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.985254287719727, "sampling/sampling_logp_difference/mean": 0.19374895095825195, "step": 86, "step_time": 158.37500746524893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2987.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 704.09375, "completions/mean_terminated_length": 704.09375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6402583420276642, "epoch": 0.21428571428571427, "frac_reward_zero_std": 0.5, "grad_norm": 0.002849577272942313, "kl": 0.002314119366928935, "learning_rate": 4.96918289565926e-05, "loss": 0.010025442577898502, "num_tokens": 13240935.0, "reward": 1.08203125, "reward_std": 0.3592725396156311, "rewards/reward_func/mean": 0.12022569444444445, "rewards/reward_func/std": 0.052181267076068454, "sampling/importance_sampling_ratio/max": 2.9983925819396973, "sampling/importance_sampling_ratio/mean": 0.9622253775596619, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.306507110595703, "sampling/sampling_logp_difference/mean": 0.17209717631340027, "step": 87, "step_time": 89.82424217509106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3097.0, "completions/mean_length": 903.71875, "completions/mean_terminated_length": 853.0476684570312, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7863613218069077, "epoch": 0.21674876847290642, "frac_reward_zero_std": 0.5, "grad_norm": 0.0008406550829518916, "kl": 0.0019374474068172276, "learning_rate": 4.968418898454199e-05, "loss": -0.015768352895975113, "num_tokens": 13379749.0, "reward": 1.09375, "reward_std": 0.322748601436615, "rewards/reward_func/mean": 0.12152777777777778, "rewards/reward_func/std": 0.045880657931168876, "sampling/importance_sampling_ratio/max": 2.9976954460144043, "sampling/importance_sampling_ratio/mean": 0.950974702835083, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.29272174835205, "sampling/sampling_logp_difference/mean": 0.2106407880783081, "step": 88, "step_time": 148.23201068071648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3024.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 756.96875, "completions/mean_terminated_length": 756.96875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6922882348299026, "epoch": 0.21921182266009853, "frac_reward_zero_std": 0.0, "grad_norm": 0.012908831983821652, "kl": 0.0029333174461498857, "learning_rate": 4.967645606776047e-05, "loss": 0.05130379647016525, "num_tokens": 13513667.0, "reward": 1.16015625, "reward_std": 0.8313872218132019, "rewards/reward_func/mean": 0.12890625, "rewards/reward_func/std": 0.11999391516049702, "sampling/importance_sampling_ratio/max": 2.9972262382507324, "sampling/importance_sampling_ratio/mean": 0.9526609182357788, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.749960899353027, "sampling/sampling_logp_difference/mean": 0.19566956162452698, "step": 89, "step_time": 90.78295086394064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3175.0, "completions/max_terminated_length": 3175.0, "completions/mean_length": 949.28125, "completions/mean_terminated_length": 956.84130859375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "degenerate_groups_filtered": 1.0, "entropy": 0.666090190410614, "epoch": 0.22167487684729065, "frac_reward_zero_std": 0.5, "grad_norm": 0.006511050657427078, "kl": 0.0016060115303844213, "learning_rate": 4.966863023536523e-05, "loss": 0.027448534965515137, "num_tokens": 13669637.0, "reward": 1.1796875, "reward_std": 0.5917232632637024, "rewards/reward_func/mean": 0.1310763888888889, "rewards/reward_func/std": 0.09746392981873618, "sampling/importance_sampling_ratio/max": 2.9971022605895996, "sampling/importance_sampling_ratio/mean": 0.9492952823638916, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.294812202453613, "sampling/sampling_logp_difference/mean": 0.2006000131368637, "step": 90, "step_time": 104.78784828796051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2941.0, "completions/mean_length": 852.40625, "completions/mean_terminated_length": 815.3278198242188, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6824957877397537, "epoch": 0.22413793103448276, "frac_reward_zero_std": 0.0, "grad_norm": 0.013043479712303003, "kl": 0.0023478574585169554, "learning_rate": 4.96607115168233e-05, "loss": -0.04977197200059891, "num_tokens": 13818799.0, "reward": 1.03515625, "reward_std": 0.5453031659126282, "rewards/reward_func/mean": 0.1150173611111111, "rewards/reward_func/std": 0.08766606450080872, "sampling/importance_sampling_ratio/max": 2.9927175045013428, "sampling/importance_sampling_ratio/mean": 0.9504046440124512, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.90580940246582, "sampling/sampling_logp_difference/mean": 0.19969666004180908, "step": 91, "step_time": 170.82128311530687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3471.0, "completions/mean_length": 1190.921875, "completions/mean_terminated_length": 1062.1334228515625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "degenerate_groups_filtered": 0.0, "entropy": 0.67007115483284, "epoch": 0.22660098522167488, "frac_reward_zero_std": 0.25, "grad_norm": 0.00884542043937575, "kl": 0.002599392697447911, "learning_rate": 4.965269994195146e-05, "loss": 0.006632131524384022, "num_tokens": 13984954.0, "reward": 1.21875, "reward_std": 0.6965048909187317, "rewards/reward_func/mean": 0.13541666666666666, "rewards/reward_func/std": 0.12860211316082212, "sampling/importance_sampling_ratio/max": 2.9993481636047363, "sampling/importance_sampling_ratio/mean": 0.9467147588729858, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.052059173583984, "sampling/sampling_logp_difference/mean": 0.20239023864269257, "step": 92, "step_time": 128.50959110469557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3876.0, "completions/max_terminated_length": 3876.0, "completions/mean_length": 807.453125, "completions/mean_terminated_length": 785.245849609375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6888804733753204, "epoch": 0.229064039408867, "frac_reward_zero_std": 0.5, "grad_norm": 0.0015433485271994368, "kl": 0.002188895596191287, "learning_rate": 4.964459554091615e-05, "loss": 0.012207363732159138, "num_tokens": 14128919.0, "reward": 1.0546875, "reward_std": 0.2498759627342224, "rewards/reward_func/mean": 0.1171875, "rewards/reward_func/std": 0.031979672610759735, "sampling/importance_sampling_ratio/max": 2.99534010887146, "sampling/importance_sampling_ratio/mean": 0.9456230998039246, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.117466926574707, "sampling/sampling_logp_difference/mean": 0.21316124498844147, "step": 93, "step_time": 100.48218262591399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 1112.515625, "completions/mean_terminated_length": 998.35595703125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7000087946653366, "epoch": 0.2315270935960591, "frac_reward_zero_std": 0.5, "grad_norm": 0.004334828463500131, "kl": 0.0017916160868480802, "learning_rate": 4.9636398344233294e-05, "loss": 0.029138652607798576, "num_tokens": 14288936.0, "reward": 1.21875, "reward_std": 0.7189332842826843, "rewards/reward_func/mean": 0.13541666666666666, "rewards/reward_func/std": 0.10873374260134167, "sampling/importance_sampling_ratio/max": 2.9988293647766113, "sampling/importance_sampling_ratio/mean": 0.948699414730072, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.972160339355469, "sampling/sampling_logp_difference/mean": 0.20441797375679016, "step": 94, "step_time": 144.95788948773406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3095.0, "completions/max_terminated_length": 3095.0, "completions/mean_length": 746.5, "completions/mean_terminated_length": 749.5238647460938, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8084656894207001, "epoch": 0.23399014778325122, "frac_reward_zero_std": 0.0, "grad_norm": 0.002255399850514679, "kl": 0.0023641733278054744, "learning_rate": 4.9628108382768255e-05, "loss": 0.006529998034238815, "num_tokens": 14419336.0, "reward": 1.0859375, "reward_std": 0.33100003004074097, "rewards/reward_func/mean": 0.12065972222222222, "rewards/reward_func/std": 0.04311362819539176, "sampling/importance_sampling_ratio/max": 2.9969942569732666, "sampling/importance_sampling_ratio/mean": 0.949213981628418, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.631890296936035, "sampling/sampling_logp_difference/mean": 0.21985289454460144, "step": 95, "step_time": 88.58697469602339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3782.0, "completions/mean_length": 998.828125, "completions/mean_terminated_length": 884.8359985351562, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8362721502780914, "epoch": 0.23645320197044334, "frac_reward_zero_std": 0.0, "grad_norm": 0.006188412376322827, "kl": 0.005402846087235957, "learning_rate": 4.9619725687735686e-05, "loss": 0.012449314817786217, "num_tokens": 14568589.0, "reward": 1.25, "reward_std": 0.8428032994270325, "rewards/reward_func/mean": 0.1388888888888889, "rewards/reward_func/std": 0.13194227839509645, "sampling/importance_sampling_ratio/max": 2.9986398220062256, "sampling/importance_sampling_ratio/mean": 0.9465553164482117, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.732090950012207, "sampling/sampling_logp_difference/mean": 0.22394214570522308, "step": 96, "step_time": 154.48756418889388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3459.0, "completions/mean_length": 879.171875, "completions/mean_terminated_length": 828.1111450195312, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6241284310817719, "epoch": 0.23891625615763548, "frac_reward_zero_std": 0.25, "grad_norm": 0.0066502625663478546, "kl": 0.002683707105461508, "learning_rate": 4.96112502906994e-05, "loss": 0.014375880360603333, "num_tokens": 14705512.0, "reward": 1.2109375, "reward_std": 0.6223654747009277, "rewards/reward_func/mean": 0.1345486111111111, "rewards/reward_func/std": 0.10964169187678231, "sampling/importance_sampling_ratio/max": 2.9906349182128906, "sampling/importance_sampling_ratio/mean": 0.9618527889251709, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.417328834533691, "sampling/sampling_logp_difference/mean": 0.16517508029937744, "step": 97, "step_time": 124.35440509300679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2937.0, "completions/mean_length": 956.59375, "completions/mean_terminated_length": 906.761962890625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7140306979417801, "epoch": 0.2413793103448276, "frac_reward_zero_std": 0.25, "grad_norm": 0.00833558507766438, "kl": 0.0026831042487174273, "learning_rate": 4.960268222357227e-05, "loss": -0.014123106375336647, "num_tokens": 14860574.0, "reward": 1.20703125, "reward_std": 0.6879845857620239, "rewards/reward_func/mean": 0.13411458333333334, "rewards/reward_func/std": 0.12600696169667774, "sampling/importance_sampling_ratio/max": 2.998737096786499, "sampling/importance_sampling_ratio/mean": 0.9459173083305359, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.235814094543457, "sampling/sampling_logp_difference/mean": 0.21586598455905914, "step": 98, "step_time": 125.9828494079411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2820.0, "completions/mean_length": 1306.296875, "completions/mean_terminated_length": 1179.559326171875, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7497763484716415, "epoch": 0.2438423645320197, "frac_reward_zero_std": 0.5, "grad_norm": 0.0007196218706857476, "kl": 0.0024949743528850377, "learning_rate": 4.959402151861613e-05, "loss": 0.00040583667578175664, "num_tokens": 15036001.0, "reward": 1.08203125, "reward_std": 0.3022885024547577, "rewards/reward_func/mean": 0.12022569444444445, "rewards/reward_func/std": 0.03856059287985166, "sampling/importance_sampling_ratio/max": 2.9977340698242188, "sampling/importance_sampling_ratio/mean": 0.9443312883377075, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.374924659729004, "sampling/sampling_logp_difference/mean": 0.2221134603023529, "step": 99, "step_time": 177.20913607790135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3101.0, "completions/mean_length": 1241.84375, "completions/mean_terminated_length": 1127.6064453125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6876734793186188, "epoch": 0.24630541871921183, "frac_reward_zero_std": 0.25, "grad_norm": 0.009038766598917204, "kl": 0.00271606317255646, "learning_rate": 4.958526820844158e-05, "loss": -0.003039947245270014, "num_tokens": 15212663.0, "reward": 1.2578125, "reward_std": 0.7084646224975586, "rewards/reward_func/mean": 0.13975694444444445, "rewards/reward_func/std": 0.1226612784796291, "sampling/importance_sampling_ratio/max": 2.9980616569519043, "sampling/importance_sampling_ratio/mean": 0.9425716400146484, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.271185874938965, "sampling/sampling_logp_difference/mean": 0.2198907732963562, "step": 100, "step_time": 137.27148599014618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3853.0, "completions/mean_length": 1085.15625, "completions/mean_terminated_length": 988.0322265625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6465490013360977, "epoch": 0.24876847290640394, "frac_reward_zero_std": 0.5, "grad_norm": 0.012525746257424489, "kl": 0.0029582843417301774, "learning_rate": 4.957642232600797e-05, "loss": 0.04876864701509476, "num_tokens": 15367569.0, "reward": 1.33984375, "reward_std": 1.0440374612808228, "rewards/reward_func/mean": 0.1488715277777778, "rewards/reward_func/std": 0.1550002164310879, "sampling/importance_sampling_ratio/max": 2.999211072921753, "sampling/importance_sampling_ratio/mean": 0.95467609167099, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.06248950958252, "sampling/sampling_logp_difference/mean": 0.18970772624015808, "step": 101, "step_time": 118.69119972735643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2897.0, "completions/mean_length": 1154.546875, "completions/mean_terminated_length": 1059.6612548828125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6360055506229401, "epoch": 0.2512315270935961, "frac_reward_zero_std": 0.25, "grad_norm": 0.012431235826599366, "kl": 0.006371326482621953, "learning_rate": 4.956748390462316e-05, "loss": 0.026809336617588997, "num_tokens": 15533924.0, "reward": 1.34375, "reward_std": 1.1246691942214966, "rewards/reward_func/mean": 0.14930555555555555, "rewards/reward_func/std": 0.17304262146353722, "sampling/importance_sampling_ratio/max": 2.9996252059936523, "sampling/importance_sampling_ratio/mean": 0.9527342915534973, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.81247615814209, "sampling/sampling_logp_difference/mean": 0.185723677277565, "step": 102, "step_time": 131.5465091553051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3693.0, "completions/mean_length": 1222.40625, "completions/mean_terminated_length": 1181.9193115234375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7460045963525772, "epoch": 0.2536945812807882, "frac_reward_zero_std": 0.0, "grad_norm": 0.013992385855988483, "kl": 0.0032767574884928763, "learning_rate": 4.955845297794348e-05, "loss": 0.02842138148844242, "num_tokens": 15713870.0, "reward": 1.5078125, "reward_std": 1.1425819396972656, "rewards/reward_func/mean": 0.1675347222222222, "rewards/reward_func/std": 0.18107240481509101, "sampling/importance_sampling_ratio/max": 2.999453544616699, "sampling/importance_sampling_ratio/mean": 0.9345462322235107, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.628706932067871, "sampling/sampling_logp_difference/mean": 0.23743605613708496, "step": 103, "step_time": 138.19932377617806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3875.0, "completions/mean_length": 806.578125, "completions/mean_terminated_length": 766.51611328125, "completions/min_length": 1.0, "completions/min_terminated_length": 160.0, "degenerate_groups_filtered": 0.0, "entropy": 0.715620145201683, "epoch": 0.2561576354679803, "frac_reward_zero_std": 0.0, "grad_norm": 0.015716834928270983, "kl": 0.0057948356261476874, "learning_rate": 4.954932957997359e-05, "loss": -0.037697743624448776, "num_tokens": 15839715.0, "reward": 1.37109375, "reward_std": 1.130381464958191, "rewards/reward_func/mean": 0.15234375, "rewards/reward_func/std": 0.1768003437254164, "sampling/importance_sampling_ratio/max": 2.9976377487182617, "sampling/importance_sampling_ratio/mean": 0.957233190536499, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.865558624267578, "sampling/sampling_logp_difference/mean": 0.19012512266635895, "step": 104, "step_time": 162.9517569427844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3698.0, "completions/mean_length": 1274.375, "completions/mean_terminated_length": 1135.6064453125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7952848523855209, "epoch": 0.25862068965517243, "frac_reward_zero_std": 0.0, "grad_norm": 0.014526650968551654, "kl": 0.005329428589902818, "learning_rate": 4.954011374506632e-05, "loss": -0.058310359716415405, "num_tokens": 16010043.0, "reward": 1.27734375, "reward_std": 1.0236462354660034, "rewards/reward_func/mean": 0.14192708333333334, "rewards/reward_func/std": 0.1571247395541933, "sampling/importance_sampling_ratio/max": 2.9957821369171143, "sampling/importance_sampling_ratio/mean": 0.9390854239463806, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.681705474853516, "sampling/sampling_logp_difference/mean": 0.2258550226688385, "step": 105, "step_time": 196.65707094292156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3047.0, "completions/mean_length": 846.390625, "completions/mean_terminated_length": 797.258056640625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "degenerate_groups_filtered": 1.0, "entropy": 0.749016284942627, "epoch": 0.26108374384236455, "frac_reward_zero_std": 0.25, "grad_norm": 0.016145450162948976, "kl": 0.00974008662160486, "learning_rate": 4.953080550792254e-05, "loss": 0.00276003684848547, "num_tokens": 16144804.0, "reward": 1.4921875, "reward_std": 1.1960324048995972, "rewards/reward_func/mean": 0.1657986111111111, "rewards/reward_func/std": 0.18122834712266922, "sampling/importance_sampling_ratio/max": 2.997323989868164, "sampling/importance_sampling_ratio/mean": 0.9523142576217651, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.121150016784668, "sampling/sampling_logp_difference/mean": 0.19664835929870605, "step": 106, "step_time": 120.09341975627467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3935.0, "completions/mean_length": 902.875, "completions/mean_terminated_length": 801.6884765625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6777280122041702, "epoch": 0.26354679802955666, "frac_reward_zero_std": 0.0, "grad_norm": 0.02015814534595686, "kl": 0.0061337974620983005, "learning_rate": 4.952140490359108e-05, "loss": 0.02125009149312973, "num_tokens": 16290140.0, "reward": 1.64453125, "reward_std": 1.4643954038619995, "rewards/reward_func/mean": 0.18272569444444445, "rewards/reward_func/std": 0.21869104810886913, "sampling/importance_sampling_ratio/max": 2.9990665912628174, "sampling/importance_sampling_ratio/mean": 0.9552789926528931, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.122296333312988, "sampling/sampling_logp_difference/mean": 0.1901204138994217, "step": 107, "step_time": 136.87008723593317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3418.0, "completions/mean_length": 932.375, "completions/mean_terminated_length": 864.2333984375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6734343022108078, "epoch": 0.2660098522167488, "frac_reward_zero_std": 0.0, "grad_norm": 0.029815642765056004, "kl": 0.015761199640110135, "learning_rate": 4.951191196746855e-05, "loss": -0.06286308169364929, "num_tokens": 16442756.0, "reward": 2.078125, "reward_std": 1.7877864837646484, "rewards/reward_func/mean": 0.2309027777777778, "rewards/reward_func/std": 0.24734222681985962, "sampling/importance_sampling_ratio/max": 2.997526168823242, "sampling/importance_sampling_ratio/mean": 0.9572924375534058, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.08864688873291, "sampling/sampling_logp_difference/mean": 0.18523582816123962, "step": 108, "step_time": 119.36969709699042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3303.0, "completions/mean_length": 1167.125, "completions/mean_terminated_length": 1023.7333984375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "degenerate_groups_filtered": 0.0, "entropy": 0.641172468662262, "epoch": 0.2684729064039409, "frac_reward_zero_std": 0.0, "grad_norm": 0.026888648266965774, "kl": 0.02729413635097444, "learning_rate": 4.950232673529922e-05, "loss": -0.14320430159568787, "num_tokens": 16600412.0, "reward": 2.49609375, "reward_std": 2.1480391025543213, "rewards/reward_func/mean": 0.27734375, "rewards/reward_func/std": 0.32694076084428364, "sampling/importance_sampling_ratio/max": 2.9970366954803467, "sampling/importance_sampling_ratio/mean": 0.9530101418495178, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.124444007873535, "sampling/sampling_logp_difference/mean": 0.18392516672611237, "step": 109, "step_time": 185.8794325578492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3733.0, "completions/mean_length": 1088.71875, "completions/mean_terminated_length": 1040.984130859375, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6633197665214539, "epoch": 0.270935960591133, "frac_reward_zero_std": 0.0, "grad_norm": 0.03216236684392253, "kl": 0.016894686967134476, "learning_rate": 4.9492649243174894e-05, "loss": 0.16343486309051514, "num_tokens": 16762842.0, "reward": 2.7109375, "reward_std": 2.0257365703582764, "rewards/reward_func/mean": 0.3012152777777778, "rewards/reward_func/std": 0.2992282451854812, "sampling/importance_sampling_ratio/max": 2.999589443206787, "sampling/importance_sampling_ratio/mean": 0.9499955177307129, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.999334335327148, "sampling/sampling_logp_difference/mean": 0.19575105607509613, "step": 110, "step_time": 123.32209147373214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3051.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 982.609375, "completions/mean_terminated_length": 965.1802978515625, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7807110399007797, "epoch": 0.2733990147783251, "frac_reward_zero_std": 0.0, "grad_norm": 0.03510656960378165, "kl": 0.054872382432222366, "learning_rate": 4.948287952753475e-05, "loss": 0.18941861391067505, "num_tokens": 16910865.0, "reward": 3.14453125, "reward_std": 2.102910280227661, "rewards/reward_func/mean": 0.3493923611111111, "rewards/reward_func/std": 0.30523034267955357, "sampling/importance_sampling_ratio/max": 2.98622727394104, "sampling/importance_sampling_ratio/mean": 0.9509456157684326, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.073673248291016, "sampling/sampling_logp_difference/mean": 0.20016415417194366, "step": 111, "step_time": 96.22127129789442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2583.0, "completions/mean_length": 949.78125, "completions/mean_terminated_length": 895.800048828125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6340015828609467, "epoch": 0.27586206896551724, "frac_reward_zero_std": 0.0, "grad_norm": 0.035053246082576636, "kl": 0.025362088344991207, "learning_rate": 4.947301762516526e-05, "loss": -0.24255138635635376, "num_tokens": 17052563.0, "reward": 3.7265625, "reward_std": 2.2624080181121826, "rewards/reward_func/mean": 0.4140625, "rewards/reward_func/std": 0.3382473927405145, "sampling/importance_sampling_ratio/max": 2.998375415802002, "sampling/importance_sampling_ratio/mean": 0.9581119418144226, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.95738410949707, "sampling/sampling_logp_difference/mean": 0.17058373987674713, "step": 112, "step_time": 177.4650380751118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3117.0, "completions/mean_length": 1217.34375, "completions/mean_terminated_length": 1138.6785888671875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6510083079338074, "epoch": 0.27832512315270935, "frac_reward_zero_std": 0.0, "grad_norm": 0.03581845458385713, "kl": 0.12716092402115464, "learning_rate": 4.946306357319997e-05, "loss": 0.04907531663775444, "num_tokens": 17218617.0, "reward": 3.55859375, "reward_std": 2.1090047359466553, "rewards/reward_func/mean": 0.3953993055555556, "rewards/reward_func/std": 0.3486923161480162, "sampling/importance_sampling_ratio/max": 2.998523473739624, "sampling/importance_sampling_ratio/mean": 0.951481819152832, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.994354248046875, "sampling/sampling_logp_difference/mean": 0.19012655317783356, "step": 113, "step_time": 136.22616961598396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3032.0, "completions/mean_length": 1177.359375, "completions/mean_terminated_length": 1005.586181640625, "completions/min_length": 237.0, "completions/min_terminated_length": 276.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5681300014257431, "epoch": 0.28078817733990147, "frac_reward_zero_std": 0.0, "grad_norm": 0.028436619055383507, "kl": 0.030896728625521064, "learning_rate": 4.9453017409119416e-05, "loss": 0.01816936582326889, "num_tokens": 17376608.0, "reward": 3.27734375, "reward_std": 2.361697196960449, "rewards/reward_func/mean": 0.3641493055555556, "rewards/reward_func/std": 0.34005943934122723, "sampling/importance_sampling_ratio/max": 2.9866039752960205, "sampling/importance_sampling_ratio/mean": 0.9625529646873474, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.621533393859863, "sampling/sampling_logp_difference/mean": 0.1564193069934845, "step": 114, "step_time": 129.57314335857518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3927.0, "completions/mean_length": 1518.875, "completions/mean_terminated_length": 1434.8070068359375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5538199990987778, "epoch": 0.2832512315270936, "frac_reward_zero_std": 0.0, "grad_norm": 0.025627192501330182, "kl": 0.026664719451218843, "learning_rate": 4.9442879170750976e-05, "loss": -0.0327904112637043, "num_tokens": 17559704.0, "reward": 3.7734375, "reward_std": 2.075326919555664, "rewards/reward_func/mean": 0.4192708333333333, "rewards/reward_func/std": 0.31613584028349984, "sampling/importance_sampling_ratio/max": 2.999535083770752, "sampling/importance_sampling_ratio/mean": 0.9549193382263184, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.575541496276855, "sampling/sampling_logp_difference/mean": 0.16454458236694336, "step": 115, "step_time": 127.90012184623629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3664.0, "completions/mean_length": 1275.0, "completions/mean_terminated_length": 1083.375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6624896675348282, "epoch": 0.2857142857142857, "frac_reward_zero_std": 0.0, "grad_norm": 0.025305169538771953, "kl": 0.029944519512355328, "learning_rate": 4.943264889626871e-05, "loss": -0.010028105229139328, "num_tokens": 17731512.0, "reward": 4.19140625, "reward_std": 1.9678679704666138, "rewards/reward_func/mean": 0.4657118055555556, "rewards/reward_func/std": 0.3197719504435857, "sampling/importance_sampling_ratio/max": 2.9974968433380127, "sampling/importance_sampling_ratio/mean": 0.9508453607559204, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.936443328857422, "sampling/sampling_logp_difference/mean": 0.1938401758670807, "step": 116, "step_time": 124.77625619992614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3657.0, "completions/mean_length": 1349.109375, "completions/mean_terminated_length": 1051.345458984375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5349463224411011, "epoch": 0.2881773399014778, "frac_reward_zero_std": 0.0, "grad_norm": 0.02015511036915428, "kl": 0.02311926893889904, "learning_rate": 4.942232662419324e-05, "loss": -0.0261261984705925, "num_tokens": 17893519.0, "reward": 4.4296875, "reward_std": 1.7346084117889404, "rewards/reward_func/mean": 0.4921875, "rewards/reward_func/std": 0.28419747120804256, "sampling/importance_sampling_ratio/max": 2.9996516704559326, "sampling/importance_sampling_ratio/mean": 0.9627872109413147, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.845947265625, "sampling/sampling_logp_difference/mean": 0.1443958282470703, "step": 117, "step_time": 115.21389902406372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3637.0, "completions/mean_length": 1419.34375, "completions/mean_terminated_length": 1145.5926513671875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5412204563617706, "epoch": 0.29064039408866993, "frac_reward_zero_std": 0.0, "grad_norm": 0.025050176257689093, "kl": 0.028975401539355516, "learning_rate": 4.941191239339158e-05, "loss": -0.17452068626880646, "num_tokens": 18065269.0, "reward": 4.2421875, "reward_std": 2.0397677421569824, "rewards/reward_func/mean": 0.4713541666666667, "rewards/reward_func/std": 0.32353421714570785, "sampling/importance_sampling_ratio/max": 2.9959006309509277, "sampling/importance_sampling_ratio/mean": 0.9568660259246826, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.495771408081055, "sampling/sampling_logp_difference/mean": 0.1632460504770279, "step": 118, "step_time": 131.64897252176888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2870.0, "completions/mean_length": 1496.625, "completions/mean_terminated_length": 1206.375, "completions/min_length": 159.0, "completions/min_terminated_length": 214.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5859149247407913, "epoch": 0.29310344827586204, "frac_reward_zero_std": 0.0, "grad_norm": 0.023282759194587912, "kl": 0.03531341487541795, "learning_rate": 4.9401406243077e-05, "loss": -0.054308779537677765, "num_tokens": 18247453.0, "reward": 3.76171875, "reward_std": 2.172807216644287, "rewards/reward_func/mean": 0.41796875, "rewards/reward_func/std": 0.3388514237271415, "sampling/importance_sampling_ratio/max": 2.9988443851470947, "sampling/importance_sampling_ratio/mean": 0.9568687677383423, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.808798789978027, "sampling/sampling_logp_difference/mean": 0.17285825312137604, "step": 119, "step_time": 186.21395082375966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3288.0, "completions/mean_length": 1390.578125, "completions/mean_terminated_length": 1102.26416015625, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5405104458332062, "epoch": 0.2955665024630542, "frac_reward_zero_std": 0.0, "grad_norm": 0.018803990743447324, "kl": 0.06734272092580795, "learning_rate": 4.939080821280889e-05, "loss": -0.07871407270431519, "num_tokens": 18418210.0, "reward": 4.59375, "reward_std": 1.9418632984161377, "rewards/reward_func/mean": 0.5104166666666666, "rewards/reward_func/std": 0.3218831883536445, "sampling/importance_sampling_ratio/max": 2.9988901615142822, "sampling/importance_sampling_ratio/mean": 0.9608282446861267, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.319305419921875, "sampling/sampling_logp_difference/mean": 0.15571467578411102, "step": 120, "step_time": 126.49870767304674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3132.0, "completions/mean_length": 1579.703125, "completions/mean_terminated_length": 1083.734619140625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5791607052087784, "epoch": 0.29802955665024633, "frac_reward_zero_std": 0.0, "grad_norm": 0.018886959920027147, "kl": 0.11334666470065713, "learning_rate": 4.9380118342492596e-05, "loss": -0.0750376507639885, "num_tokens": 18600895.0, "reward": 4.59375, "reward_std": 1.8953262567520142, "rewards/reward_func/mean": 0.5104166666666666, "rewards/reward_func/std": 0.3163795851998859, "sampling/importance_sampling_ratio/max": 2.9966447353363037, "sampling/importance_sampling_ratio/mean": 0.9624161720275879, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.319793701171875, "sampling/sampling_logp_difference/mean": 0.16001760959625244, "step": 121, "step_time": 131.29123148694634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3348.0, "completions/mean_length": 1523.4375, "completions/mean_terminated_length": 1047.0369873046875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5341637283563614, "epoch": 0.30049261083743845, "frac_reward_zero_std": 0.0, "grad_norm": 0.01854662654237563, "kl": 0.03084225719794631, "learning_rate": 4.936933667237926e-05, "loss": -0.05206795781850815, "num_tokens": 18784987.0, "reward": 4.55078125, "reward_std": 1.8017736673355103, "rewards/reward_func/mean": 0.5056423611111112, "rewards/reward_func/std": 0.3026386151711146, "sampling/importance_sampling_ratio/max": 2.9998748302459717, "sampling/importance_sampling_ratio/mean": 0.9602913856506348, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.124850273132324, "sampling/sampling_logp_difference/mean": 0.1521858274936676, "step": 122, "step_time": 178.7429536471609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3319.0, "completions/mean_length": 1527.625, "completions/mean_terminated_length": 1043.1199951171875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "degenerate_groups_filtered": 0.0, "entropy": 0.580025851726532, "epoch": 0.30295566502463056, "frac_reward_zero_std": 0.0, "grad_norm": 0.020308490136383247, "kl": 0.02926149358972907, "learning_rate": 4.935846324306571e-05, "loss": -0.007207506336271763, "num_tokens": 18971091.0, "reward": 4.83984375, "reward_std": 1.9646514654159546, "rewards/reward_func/mean": 0.5377604166666666, "rewards/reward_func/std": 0.32252822981940377, "sampling/importance_sampling_ratio/max": 2.996708869934082, "sampling/importance_sampling_ratio/mean": 0.9550896883010864, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.216852188110352, "sampling/sampling_logp_difference/mean": 0.17026376724243164, "step": 123, "step_time": 198.48502158699557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 1257.0625, "completions/mean_terminated_length": 992.870361328125, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "degenerate_groups_filtered": 0.0, "entropy": 0.4820714667439461, "epoch": 0.3054187192118227, "frac_reward_zero_std": 0.0, "grad_norm": 0.01958396564544384, "kl": 0.04494946589693427, "learning_rate": 4.934749809549427e-05, "loss": -0.10934413224458694, "num_tokens": 19134583.0, "reward": 4.5859375, "reward_std": 1.928904414176941, "rewards/reward_func/mean": 0.5095486111111112, "rewards/reward_func/std": 0.3117486619287067, "sampling/importance_sampling_ratio/max": 2.9978034496307373, "sampling/importance_sampling_ratio/mean": 0.9640801548957825, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.056150436401367, "sampling/sampling_logp_difference/mean": 0.13894838094711304, "step": 124, "step_time": 134.90961828804575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3030.0, "completions/mean_length": 1880.578125, "completions/mean_terminated_length": 1210.3863525390625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5541824847459793, "epoch": 0.3078817733990148, "frac_reward_zero_std": 0.0, "grad_norm": 0.016006255366419888, "kl": 0.031152247916907072, "learning_rate": 4.9336441270952595e-05, "loss": -0.15780873596668243, "num_tokens": 19346876.0, "reward": 4.2109375, "reward_std": 1.9901636838912964, "rewards/reward_func/mean": 0.4678819444444444, "rewards/reward_func/std": 0.3124983575608995, "sampling/importance_sampling_ratio/max": 2.992753267288208, "sampling/importance_sampling_ratio/mean": 0.9540883302688599, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.755999565124512, "sampling/sampling_logp_difference/mean": 0.17641326785087585, "step": 125, "step_time": 134.62373927910812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 1447.328125, "completions/mean_terminated_length": 1258.482177734375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5232695862650871, "epoch": 0.3103448275862069, "frac_reward_zero_std": 0.0, "grad_norm": 0.02110460828294288, "kl": 0.021355477161705494, "learning_rate": 4.932529281107355e-05, "loss": 0.04123953729867935, "num_tokens": 19529921.0, "reward": 4.65625, "reward_std": 1.6051133871078491, "rewards/reward_func/mean": 0.5173611111111112, "rewards/reward_func/std": 0.2619215887453821, "sampling/importance_sampling_ratio/max": 2.999826192855835, "sampling/importance_sampling_ratio/mean": 0.9576446413993835, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.864720344543457, "sampling/sampling_logp_difference/mean": 0.15989640355110168, "step": 126, "step_time": 139.1471421548631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3559.0, "completions/mean_length": 1351.46875, "completions/mean_terminated_length": 1131.0726318359375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5890444368124008, "epoch": 0.312807881773399, "frac_reward_zero_std": 0.0, "grad_norm": 0.026187013896567975, "kl": 0.04253423307090998, "learning_rate": 4.931405275783507e-05, "loss": 0.029535703361034393, "num_tokens": 19696367.0, "reward": 4.34765625, "reward_std": 1.9167273044586182, "rewards/reward_func/mean": 0.4830729166666667, "rewards/reward_func/std": 0.2942189425230026, "sampling/importance_sampling_ratio/max": 2.9940335750579834, "sampling/importance_sampling_ratio/mean": 0.9561335444450378, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.746670722961426, "sampling/sampling_logp_difference/mean": 0.1686076819896698, "step": 127, "step_time": 129.97182760294527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2472.0, "completions/mean_length": 1517.96875, "completions/mean_terminated_length": 930.4118041992188, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5890470743179321, "epoch": 0.31527093596059114, "frac_reward_zero_std": 0.0, "grad_norm": 0.019386571014408556, "kl": 0.02690672129392624, "learning_rate": 4.930272115355992e-05, "loss": 0.06074991077184677, "num_tokens": 19872765.0, "reward": 4.25390625, "reward_std": 1.9833093881607056, "rewards/reward_func/mean": 0.47265625, "rewards/reward_func/std": 0.3048050221469667, "sampling/importance_sampling_ratio/max": 2.988401412963867, "sampling/importance_sampling_ratio/mean": 0.9639754295349121, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.903083801269531, "sampling/sampling_logp_difference/mean": 0.15789353847503662, "step": 128, "step_time": 127.74785562674515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3943.0, "completions/mean_length": 1287.140625, "completions/mean_terminated_length": 947.0000610351562, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5195007175207138, "epoch": 0.31773399014778325, "frac_reward_zero_std": 0.0, "grad_norm": 0.020892925194507105, "kl": 0.05176311079412699, "learning_rate": 4.929129804091562e-05, "loss": 0.012292366474866867, "num_tokens": 20032838.0, "reward": 4.8515625, "reward_std": 1.6842634677886963, "rewards/reward_func/mean": 0.5390625, "rewards/reward_func/std": 0.29546265221304363, "sampling/importance_sampling_ratio/max": 2.999087333679199, "sampling/importance_sampling_ratio/mean": 0.9689388275146484, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.614640235900879, "sampling/sampling_logp_difference/mean": 0.1367620825767517, "step": 129, "step_time": 121.45446623302996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3191.0, "completions/mean_length": 1508.109375, "completions/mean_terminated_length": 1028.870361328125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6037813127040863, "epoch": 0.32019704433497537, "frac_reward_zero_std": 0.0, "grad_norm": 0.01962016974443757, "kl": 0.022648704703897238, "learning_rate": 4.927978346291424e-05, "loss": -0.17073693871498108, "num_tokens": 20210157.0, "reward": 4.05859375, "reward_std": 2.1061806678771973, "rewards/reward_func/mean": 0.4509548611111111, "rewards/reward_func/std": 0.32526984645260704, "sampling/importance_sampling_ratio/max": 2.999075174331665, "sampling/importance_sampling_ratio/mean": 0.9614124298095703, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.54149055480957, "sampling/sampling_logp_difference/mean": 0.1666044294834137, "step": 130, "step_time": 139.74313421617262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3556.0, "completions/mean_length": 1106.921875, "completions/mean_terminated_length": 907.6500244140625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5114710703492165, "epoch": 0.3226600985221675, "frac_reward_zero_std": 0.0, "grad_norm": 0.022324741609899095, "kl": 0.04564378131181002, "learning_rate": 4.9268177462912255e-05, "loss": -0.07674457132816315, "num_tokens": 20360984.0, "reward": 4.375, "reward_std": 1.8235670328140259, "rewards/reward_func/mean": 0.4861111111111111, "rewards/reward_func/std": 0.3013741249839465, "sampling/importance_sampling_ratio/max": 2.999788284301758, "sampling/importance_sampling_ratio/mean": 0.9689079523086548, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.985334396362305, "sampling/sampling_logp_difference/mean": 0.13317884504795074, "step": 131, "step_time": 125.56218209001236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3430.0, "completions/mean_length": 1374.1875, "completions/mean_terminated_length": 1139.5357666015625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5266564786434174, "epoch": 0.3251231527093596, "frac_reward_zero_std": 0.0, "grad_norm": 0.016910527382422866, "kl": 0.025493765715509653, "learning_rate": 4.9256480084610376e-05, "loss": -0.04664276912808418, "num_tokens": 20536708.0, "reward": 4.65234375, "reward_std": 1.600233793258667, "rewards/reward_func/mean": 0.5169270833333334, "rewards/reward_func/std": 0.25672541227605605, "sampling/importance_sampling_ratio/max": 2.9952757358551025, "sampling/importance_sampling_ratio/mean": 0.9601929187774658, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.947929382324219, "sampling/sampling_logp_difference/mean": 0.15485885739326477, "step": 132, "step_time": 150.15074049308896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3145.0, "completions/mean_length": 1287.421875, "completions/mean_terminated_length": 1202.7376708984375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "degenerate_groups_filtered": 0.0, "entropy": 0.57479427754879, "epoch": 0.3275862068965517, "frac_reward_zero_std": 0.0, "grad_norm": 0.024161045457601445, "kl": 0.01593266148120165, "learning_rate": 4.9244691372053376e-05, "loss": -0.05266115441918373, "num_tokens": 20702559.0, "reward": 4.51171875, "reward_std": 1.6198359727859497, "rewards/reward_func/mean": 0.5013020833333334, "rewards/reward_func/std": 0.262553902135955, "sampling/importance_sampling_ratio/max": 2.999521017074585, "sampling/importance_sampling_ratio/mean": 0.9583712816238403, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.616655349731445, "sampling/sampling_logp_difference/mean": 0.16895034909248352, "step": 133, "step_time": 159.73296225816011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4016.0, "completions/mean_length": 1460.59375, "completions/mean_terminated_length": 1351.4482421875, "completions/min_length": 537.0, "completions/min_terminated_length": 537.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6757910698652267, "epoch": 0.33004926108374383, "frac_reward_zero_std": 0.0, "grad_norm": 0.02485612182204898, "kl": 0.030998756643384695, "learning_rate": 4.9232811369629936e-05, "loss": 0.010290354490280151, "num_tokens": 20883813.0, "reward": 4.35546875, "reward_std": 1.7169814109802246, "rewards/reward_func/mean": 0.4839409722222222, "rewards/reward_func/std": 0.2722629126575258, "sampling/importance_sampling_ratio/max": 2.9927690029144287, "sampling/importance_sampling_ratio/mean": 0.9484080672264099, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.797867774963379, "sampling/sampling_logp_difference/mean": 0.19552484154701233, "step": 134, "step_time": 136.03435460082255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2941.0, "completions/mean_length": 1210.078125, "completions/mean_terminated_length": 1092.7930908203125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5237693637609482, "epoch": 0.33251231527093594, "frac_reward_zero_std": 0.0, "grad_norm": 0.020927439602090963, "kl": 0.019747328478842974, "learning_rate": 4.9220840122072495e-05, "loss": -0.019350498914718628, "num_tokens": 21043770.0, "reward": 4.46875, "reward_std": 1.5587056875228882, "rewards/reward_func/mean": 0.4965277777777778, "rewards/reward_func/std": 0.25133796367380357, "sampling/importance_sampling_ratio/max": 2.9990768432617188, "sampling/importance_sampling_ratio/mean": 0.964819073677063, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.993998527526855, "sampling/sampling_logp_difference/mean": 0.14578461647033691, "step": 135, "step_time": 129.014442861313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3859.0, "completions/mean_length": 1473.328125, "completions/mean_terminated_length": 1222.0545654296875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6460306495428085, "epoch": 0.33497536945812806, "frac_reward_zero_std": 0.0, "grad_norm": 0.02366790790806787, "kl": 0.025228526908904314, "learning_rate": 4.920877767445705e-05, "loss": 0.01618235744535923, "num_tokens": 21229279.0, "reward": 4.203125, "reward_std": 1.850501298904419, "rewards/reward_func/mean": 0.4670138888888889, "rewards/reward_func/std": 0.3125879168510437, "sampling/importance_sampling_ratio/max": 2.998635768890381, "sampling/importance_sampling_ratio/mean": 0.9495352506637573, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.83233642578125, "sampling/sampling_logp_difference/mean": 0.1911965161561966, "step": 136, "step_time": 136.6534457411617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3018.0, "completions/max_terminated_length": 3018.0, "completions/mean_length": 1125.46875, "completions/mean_terminated_length": 1133.04833984375, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6018159687519073, "epoch": 0.3374384236453202, "frac_reward_zero_std": 0.0, "grad_norm": 0.028159374909103065, "kl": 0.018990385811775923, "learning_rate": 4.919662407220299e-05, "loss": 0.002560041844844818, "num_tokens": 21396941.0, "reward": 4.109375, "reward_std": 1.7648885250091553, "rewards/reward_func/mean": 0.4565972222222222, "rewards/reward_func/std": 0.24216210428211424, "sampling/importance_sampling_ratio/max": 2.999795436859131, "sampling/importance_sampling_ratio/mean": 0.9545049667358398, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.2381591796875, "sampling/sampling_logp_difference/mean": 0.17799274623394012, "step": 137, "step_time": 110.39313215529546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3624.0, "completions/mean_length": 949.890625, "completions/mean_terminated_length": 874.6612548828125, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6683694273233414, "epoch": 0.3399014778325123, "frac_reward_zero_std": 0.0, "grad_norm": 0.0325675264957142, "kl": 0.02215199451893568, "learning_rate": 4.918437936107293e-05, "loss": 0.22962699830532074, "num_tokens": 21544198.0, "reward": 4.30859375, "reward_std": 1.822338342666626, "rewards/reward_func/mean": 0.4787326388888889, "rewards/reward_func/std": 0.2953062653541565, "sampling/importance_sampling_ratio/max": 2.999077558517456, "sampling/importance_sampling_ratio/mean": 0.9570326805114746, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.773735046386719, "sampling/sampling_logp_difference/mean": 0.17750665545463562, "step": 138, "step_time": 136.9202022489626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3902.0, "completions/mean_length": 1019.59375, "completions/mean_terminated_length": 979.274169921875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5504418164491653, "epoch": 0.34236453201970446, "frac_reward_zero_std": 0.0, "grad_norm": 0.02895040422333763, "kl": 0.022574008908122778, "learning_rate": 4.9172043587172564e-05, "loss": -0.1542976051568985, "num_tokens": 21691068.0, "reward": 4.20703125, "reward_std": 1.707566499710083, "rewards/reward_func/mean": 0.4674479166666667, "rewards/reward_func/std": 0.2652057492070728, "sampling/importance_sampling_ratio/max": 2.9986119270324707, "sampling/importance_sampling_ratio/mean": 0.9594486951828003, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.460792541503906, "sampling/sampling_logp_difference/mean": 0.1590704321861267, "step": 139, "step_time": 189.29539473517798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3238.0, "completions/max_terminated_length": 3238.0, "completions/mean_length": 675.71875, "completions/mean_terminated_length": 661.1428833007812, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5717846751213074, "epoch": 0.3448275862068966, "frac_reward_zero_std": 0.0, "grad_norm": 0.037716087957076136, "kl": 0.038773443549871445, "learning_rate": 4.915961679695046e-05, "loss": 0.031660258769989014, "num_tokens": 21808954.0, "reward": 4.61328125, "reward_std": 1.607848882675171, "rewards/reward_func/mean": 0.5125868055555556, "rewards/reward_func/std": 0.25105878214041394, "sampling/importance_sampling_ratio/max": 2.999619483947754, "sampling/importance_sampling_ratio/mean": 0.9686312675476074, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.18880558013916, "sampling/sampling_logp_difference/mean": 0.1539146602153778, "step": 140, "step_time": 94.82269705319777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2489.0, "completions/mean_length": 1008.359375, "completions/mean_terminated_length": 935.3225708007812, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5867870151996613, "epoch": 0.3472906403940887, "frac_reward_zero_std": 0.0, "grad_norm": 0.02747726707660944, "kl": 0.032487844582647085, "learning_rate": 4.914709903719788e-05, "loss": -0.02273506112396717, "num_tokens": 21958289.0, "reward": 4.31640625, "reward_std": 1.7316166162490845, "rewards/reward_func/mean": 0.4796006944444444, "rewards/reward_func/std": 0.24825715935892528, "sampling/importance_sampling_ratio/max": 2.998828649520874, "sampling/importance_sampling_ratio/mean": 0.9613453149795532, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.1629056930542, "sampling/sampling_logp_difference/mean": 0.16322818398475647, "step": 141, "step_time": 175.56653777277097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3728.0, "completions/max_terminated_length": 3728.0, "completions/mean_length": 874.546875, "completions/mean_terminated_length": 874.546875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6279189586639404, "epoch": 0.3497536945812808, "frac_reward_zero_std": 0.0, "grad_norm": 0.02985454353932649, "kl": 0.024398992769420147, "learning_rate": 4.913449035504865e-05, "loss": 0.20730939507484436, "num_tokens": 22102484.0, "reward": 4.68359375, "reward_std": 1.3162914514541626, "rewards/reward_func/mean": 0.5203993055555556, "rewards/reward_func/std": 0.20795253912607828, "sampling/importance_sampling_ratio/max": 2.9962644577026367, "sampling/importance_sampling_ratio/mean": 0.9599305987358093, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.122313499450684, "sampling/sampling_logp_difference/mean": 0.17107892036437988, "step": 142, "step_time": 107.82716886512935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2049.0, "completions/mean_length": 993.640625, "completions/mean_terminated_length": 786.8167114257812, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5099516436457634, "epoch": 0.3522167487684729, "frac_reward_zero_std": 0.25, "grad_norm": 0.009738132356465489, "kl": 0.03395155072212219, "learning_rate": 4.912179079797892e-05, "loss": -0.06845282763242722, "num_tokens": 22254621.0, "reward": 4.87890625, "reward_std": 0.9553443789482117, "rewards/reward_func/mean": 0.5421006944444444, "rewards/reward_func/std": 0.14736688633759817, "sampling/importance_sampling_ratio/max": 2.9960014820098877, "sampling/importance_sampling_ratio/mean": 0.9686846137046814, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.774383544921875, "sampling/sampling_logp_difference/mean": 0.14190690219402313, "step": 143, "step_time": 185.97113836207427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3376.0, "completions/mean_length": 1397.65625, "completions/mean_terminated_length": 1310.6129150390625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6476477831602097, "epoch": 0.35467980295566504, "frac_reward_zero_std": 0.0, "grad_norm": 0.017567346260058462, "kl": 0.014963384717702866, "learning_rate": 4.910900041380703e-05, "loss": 0.11150763928890228, "num_tokens": 22437719.0, "reward": 4.734375, "reward_std": 1.2839632034301758, "rewards/reward_func/mean": 0.5260416666666666, "rewards/reward_func/std": 0.20239159795973036, "sampling/importance_sampling_ratio/max": 2.9984025955200195, "sampling/importance_sampling_ratio/mean": 0.9474148750305176, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.22669792175293, "sampling/sampling_logp_difference/mean": 0.19336676597595215, "step": 144, "step_time": 180.27103169239126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2756.0, "completions/mean_length": 999.34375, "completions/mean_terminated_length": 951.2257690429688, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "degenerate_groups_filtered": 1.0, "entropy": 0.652740016579628, "epoch": 0.35714285714285715, "frac_reward_zero_std": 0.25, "grad_norm": 0.025811992766747685, "kl": 0.026440259534865618, "learning_rate": 4.909611925069332e-05, "loss": 0.04610571265220642, "num_tokens": 22588525.0, "reward": 4.76171875, "reward_std": 1.4484732151031494, "rewards/reward_func/mean": 0.5290798611111112, "rewards/reward_func/std": 0.24869189742538664, "sampling/importance_sampling_ratio/max": 2.9998068809509277, "sampling/importance_sampling_ratio/mean": 0.9527676105499268, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.181685447692871, "sampling/sampling_logp_difference/mean": 0.18670062720775604, "step": 145, "step_time": 133.24482462904416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3955.0, "completions/mean_length": 1359.0, "completions/mean_terminated_length": 1125.5, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6540272980928421, "epoch": 0.35960591133004927, "frac_reward_zero_std": 0.0, "grad_norm": 0.023050681392453684, "kl": 0.01712850504554808, "learning_rate": 4.9083147357139936e-05, "loss": -0.24461397528648376, "num_tokens": 22782141.0, "reward": 4.38671875, "reward_std": 1.710106372833252, "rewards/reward_func/mean": 0.4874131944444444, "rewards/reward_func/std": 0.2449416286415524, "sampling/importance_sampling_ratio/max": 2.999965190887451, "sampling/importance_sampling_ratio/mean": 0.9427659511566162, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.624150276184082, "sampling/sampling_logp_difference/mean": 0.2082119882106781, "step": 146, "step_time": 146.17805301607586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1862.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 784.75, "completions/mean_terminated_length": 784.75, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5902538895606995, "epoch": 0.3620689655172414, "frac_reward_zero_std": 0.0, "grad_norm": 0.027611265073668196, "kl": 0.04381394945085049, "learning_rate": 4.9070084781990655e-05, "loss": -0.012405045330524445, "num_tokens": 22916269.0, "reward": 4.7421875, "reward_std": 1.1503697633743286, "rewards/reward_func/mean": 0.5269097222222222, "rewards/reward_func/std": 0.18867847737338808, "sampling/importance_sampling_ratio/max": 2.998880624771118, "sampling/importance_sampling_ratio/mean": 0.9579499959945679, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.612027168273926, "sampling/sampling_logp_difference/mean": 0.16890573501586914, "step": 147, "step_time": 68.85666625201702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2553.0, "completions/mean_length": 1074.546875, "completions/mean_terminated_length": 1009.01611328125, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6222652047872543, "epoch": 0.3645320197044335, "frac_reward_zero_std": 0.25, "grad_norm": 0.024617454867958573, "kl": 0.01417856477200985, "learning_rate": 4.905693157443072e-05, "loss": 0.043572280555963516, "num_tokens": 23073344.0, "reward": 4.66015625, "reward_std": 1.435357928276062, "rewards/reward_func/mean": 0.5177951388888888, "rewards/reward_func/std": 0.21545444304744402, "sampling/importance_sampling_ratio/max": 2.99368953704834, "sampling/importance_sampling_ratio/mean": 0.9554688930511475, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.491532325744629, "sampling/sampling_logp_difference/mean": 0.17892569303512573, "step": 148, "step_time": 164.53390644979663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2713.0, "completions/mean_length": 1009.8125, "completions/mean_terminated_length": 960.825439453125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "degenerate_groups_filtered": 1.0, "entropy": 0.5842855423688889, "epoch": 0.3669950738916256, "frac_reward_zero_std": 0.25, "grad_norm": 0.018268203346222877, "kl": 0.01848753821104765, "learning_rate": 4.904368778398662e-05, "loss": -0.11345016956329346, "num_tokens": 23222356.0, "reward": 4.76171875, "reward_std": 1.0469435453414917, "rewards/reward_func/mean": 0.5290798611111112, "rewards/reward_func/std": 0.174877953198221, "sampling/importance_sampling_ratio/max": 2.991849422454834, "sampling/importance_sampling_ratio/mean": 0.9590041637420654, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.206958770751953, "sampling/sampling_logp_difference/mean": 0.17348900437355042, "step": 149, "step_time": 127.15000204136595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3131.0, "completions/mean_length": 1100.765625, "completions/mean_terminated_length": 1053.2222900390625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6619395166635513, "epoch": 0.3694581280788177, "frac_reward_zero_std": 0.0, "grad_norm": 0.02320337507863541, "kl": 0.030713028274476528, "learning_rate": 4.903035346052593e-05, "loss": -0.050076283514499664, "num_tokens": 23386565.0, "reward": 4.77734375, "reward_std": 1.4740595817565918, "rewards/reward_func/mean": 0.5308159722222222, "rewards/reward_func/std": 0.23688736226823595, "sampling/importance_sampling_ratio/max": 2.9961957931518555, "sampling/importance_sampling_ratio/mean": 0.9498468637466431, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.355584144592285, "sampling/sampling_logp_difference/mean": 0.19352048635482788, "step": 150, "step_time": 158.25811398518272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2324.0, "completions/mean_length": 958.609375, "completions/mean_terminated_length": 795.5423583984375, "completions/min_length": 183.0, "completions/min_terminated_length": 245.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5594561994075775, "epoch": 0.37192118226600984, "frac_reward_zero_std": 0.25, "grad_norm": 0.018923320892404144, "kl": 0.030594575218856335, "learning_rate": 4.9016928654257096e-05, "loss": -0.05903652310371399, "num_tokens": 23535724.0, "reward": 4.796875, "reward_std": 1.4712016582489014, "rewards/reward_func/mean": 0.5329861111111112, "rewards/reward_func/std": 0.23497174680233002, "sampling/importance_sampling_ratio/max": 2.984567403793335, "sampling/importance_sampling_ratio/mean": 0.9650619626045227, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 20.476213455200195, "sampling/sampling_logp_difference/mean": 0.15478157997131348, "step": 151, "step_time": 181.10872292728163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2004.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 827.765625, "completions/mean_terminated_length": 827.765625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6111591905355453, "epoch": 0.37438423645320196, "frac_reward_zero_std": 0.5, "grad_norm": 0.01755727796270937, "kl": 0.023565078154206276, "learning_rate": 4.9003413415729295e-05, "loss": -0.02637684904038906, "num_tokens": 23670253.0, "reward": 4.77734375, "reward_std": 1.104749083518982, "rewards/reward_func/mean": 0.5308159722222222, "rewards/reward_func/std": 0.1676331791612837, "sampling/importance_sampling_ratio/max": 2.992830276489258, "sampling/importance_sampling_ratio/mean": 0.9616970419883728, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.969971656799316, "sampling/sampling_logp_difference/mean": 0.16461607813835144, "step": 152, "step_time": 69.56748036597855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2823.0, "completions/mean_length": 973.4375, "completions/mean_terminated_length": 923.873046875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5804478824138641, "epoch": 0.3768472906403941, "frac_reward_zero_std": 0.0, "grad_norm": 0.024461340986450475, "kl": 0.023421769961714745, "learning_rate": 4.898980779583218e-05, "loss": 0.0462716780602932, "num_tokens": 23821209.0, "reward": 4.796875, "reward_std": 1.231107234954834, "rewards/reward_func/mean": 0.5329861111111112, "rewards/reward_func/std": 0.20140869998269612, "sampling/importance_sampling_ratio/max": 2.9916088581085205, "sampling/importance_sampling_ratio/mean": 0.9576438665390015, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.179545402526855, "sampling/sampling_logp_difference/mean": 0.17727312445640564, "step": 153, "step_time": 154.47874995111488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3886.0, "completions/max_terminated_length": 3886.0, "completions/mean_length": 1236.453125, "completions/mean_terminated_length": 1254.2064208984375, "completions/min_length": 118.0, "completions/min_terminated_length": 308.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6448825746774673, "epoch": 0.3793103448275862, "frac_reward_zero_std": 0.25, "grad_norm": 0.008303242862563802, "kl": 0.01926612318493426, "learning_rate": 4.897611184579575e-05, "loss": -0.08189457654953003, "num_tokens": 23985638.0, "reward": 5.0703125, "reward_std": 0.9422498941421509, "rewards/reward_func/mean": 0.5633680555555556, "rewards/reward_func/std": 0.19585710681147045, "sampling/importance_sampling_ratio/max": 2.9982712268829346, "sampling/importance_sampling_ratio/mean": 0.9486467838287354, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.374839782714844, "sampling/sampling_logp_difference/mean": 0.1977824568748474, "step": 154, "step_time": 117.2964083738625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2027.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 865.5625, "completions/mean_terminated_length": 866.5238647460938, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6071893572807312, "epoch": 0.3817733990147783, "frac_reward_zero_std": 0.0, "grad_norm": 0.1193280757900666, "kl": 0.030153931118547916, "learning_rate": 4.896232561719011e-05, "loss": 0.010661143809556961, "num_tokens": 24124298.0, "reward": 4.13671875, "reward_std": 1.7789162397384644, "rewards/reward_func/mean": 0.4596354166666667, "rewards/reward_func/std": 0.23638725777467093, "sampling/importance_sampling_ratio/max": 2.999091625213623, "sampling/importance_sampling_ratio/mean": 0.9599412679672241, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.112852096557617, "sampling/sampling_logp_difference/mean": 0.17123615741729736, "step": 155, "step_time": 72.2496001359541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3107.0, "completions/mean_length": 978.875, "completions/mean_terminated_length": 930.9677124023438, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6019200086593628, "epoch": 0.3842364532019704, "frac_reward_zero_std": 0.0, "grad_norm": 0.022745034032572956, "kl": 0.05701204831711948, "learning_rate": 4.8948449161925304e-05, "loss": 0.007736865431070328, "num_tokens": 24272082.0, "reward": 4.8359375, "reward_std": 1.287315011024475, "rewards/reward_func/mean": 0.5373263888888888, "rewards/reward_func/std": 0.20693102561765248, "sampling/importance_sampling_ratio/max": 2.99910831451416, "sampling/importance_sampling_ratio/mean": 0.9618995189666748, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.936193466186523, "sampling/sampling_logp_difference/mean": 0.1633065938949585, "step": 156, "step_time": 192.91116239712574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3499.0, "completions/max_terminated_length": 3499.0, "completions/mean_length": 1035.4375, "completions/mean_terminated_length": 1035.4375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5545484572649002, "epoch": 0.3866995073891626, "frac_reward_zero_std": 0.25, "grad_norm": 0.018643941734307494, "kl": 0.034695918671786785, "learning_rate": 4.893448253225111e-05, "loss": 0.025819044560194016, "num_tokens": 24427182.0, "reward": 4.6484375, "reward_std": 1.4466153383255005, "rewards/reward_func/mean": 0.5164930555555556, "rewards/reward_func/std": 0.2145352140069008, "sampling/importance_sampling_ratio/max": 2.991377830505371, "sampling/importance_sampling_ratio/mean": 0.9611527919769287, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.916698455810547, "sampling/sampling_logp_difference/mean": 0.1512816846370697, "step": 157, "step_time": 133.73708005039953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2262.0, "completions/mean_length": 983.921875, "completions/mean_terminated_length": 883.5322265625, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6081868559122086, "epoch": 0.3891625615763547, "frac_reward_zero_std": 0.0, "grad_norm": 0.02139363037812444, "kl": 0.03456718381494284, "learning_rate": 4.892042578075685e-05, "loss": -0.09442838281393051, "num_tokens": 24570361.0, "reward": 4.703125, "reward_std": 1.3648616075515747, "rewards/reward_func/mean": 0.5225694444444444, "rewards/reward_func/std": 0.20582874615987143, "sampling/importance_sampling_ratio/max": 2.9956085681915283, "sampling/importance_sampling_ratio/mean": 0.9590303897857666, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.812082290649414, "sampling/sampling_logp_difference/mean": 0.1729988157749176, "step": 158, "step_time": 138.27136790496297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1647.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 768.484375, "completions/mean_terminated_length": 768.484375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6265248358249664, "epoch": 0.3916256157635468, "frac_reward_zero_std": 0.5, "grad_norm": 0.028373820616951973, "kl": 0.021229174453765154, "learning_rate": 4.8906278960371176e-05, "loss": 0.03736027330160141, "num_tokens": 24708344.0, "reward": 4.63671875, "reward_std": 1.4064263105392456, "rewards/reward_func/mean": 0.5151909722222222, "rewards/reward_func/std": 0.21763736671871609, "sampling/importance_sampling_ratio/max": 2.999945640563965, "sampling/importance_sampling_ratio/mean": 0.96051025390625, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.708161354064941, "sampling/sampling_logp_difference/mean": 0.17337340116500854, "step": 159, "step_time": 66.52018399396911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 1092.65625, "completions/mean_terminated_length": 1019.4515991210938, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5845111310482025, "epoch": 0.39408866995073893, "frac_reward_zero_std": 0.0, "grad_norm": 0.03354331250442753, "kl": 0.03103955276310444, "learning_rate": 4.889204212436189e-05, "loss": 0.1756535917520523, "num_tokens": 24860034.0, "reward": 4.234375, "reward_std": 1.7158660888671875, "rewards/reward_func/mean": 0.4704861111111111, "rewards/reward_func/std": 0.22013813257217407, "sampling/importance_sampling_ratio/max": 2.9987423419952393, "sampling/importance_sampling_ratio/mean": 0.9564813375473022, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.36332893371582, "sampling/sampling_logp_difference/mean": 0.1764547973871231, "step": 160, "step_time": 172.21876229112968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3241.0, "completions/max_terminated_length": 3241.0, "completions/mean_length": 936.03125, "completions/mean_terminated_length": 936.03125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5562917143106461, "epoch": 0.39655172413793105, "frac_reward_zero_std": 0.0, "grad_norm": 0.07900173065640134, "kl": 0.21416432037949562, "learning_rate": 4.8877715326335735e-05, "loss": 0.14280450344085693, "num_tokens": 25002900.0, "reward": 4.67578125, "reward_std": 1.3120925426483154, "rewards/reward_func/mean": 0.51953125, "rewards/reward_func/std": 0.22544356021616194, "sampling/importance_sampling_ratio/max": 2.9968130588531494, "sampling/importance_sampling_ratio/mean": 0.9616665840148926, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.51321792602539, "sampling/sampling_logp_difference/mean": 0.16403846442699432, "step": 161, "step_time": 97.09818493202329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2848.0, "completions/max_terminated_length": 2848.0, "completions/mean_length": 846.265625, "completions/mean_terminated_length": 838.1935424804688, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6348050832748413, "epoch": 0.39901477832512317, "frac_reward_zero_std": 0.0, "grad_norm": 0.027732800385176146, "kl": 0.02809206396341324, "learning_rate": 4.886329862023818e-05, "loss": -0.0797884464263916, "num_tokens": 25146741.0, "reward": 4.49609375, "reward_std": 1.6223220825195312, "rewards/reward_func/mean": 0.4995659722222222, "rewards/reward_func/std": 0.2447797093126509, "sampling/importance_sampling_ratio/max": 2.9985299110412598, "sampling/importance_sampling_ratio/mean": 0.9614369869232178, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.401952743530273, "sampling/sampling_logp_difference/mean": 0.16790394484996796, "step": 162, "step_time": 85.56331390305422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1662.0, "completions/mean_length": 719.5, "completions/mean_terminated_length": 665.90478515625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5159411802887917, "epoch": 0.4014778325123153, "frac_reward_zero_std": 0.25, "grad_norm": 0.021884903974561352, "kl": 0.022087908815592527, "learning_rate": 4.884879206035324e-05, "loss": -0.0033130012452602386, "num_tokens": 25268901.0, "reward": 4.67578125, "reward_std": 1.1079436540603638, "rewards/reward_func/mean": 0.51953125, "rewards/reward_func/std": 0.14711155576838386, "sampling/importance_sampling_ratio/max": 2.996157169342041, "sampling/importance_sampling_ratio/mean": 0.965851902961731, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.011970520019531, "sampling/sampling_logp_difference/mean": 0.14661702513694763, "step": 163, "step_time": 143.06184943695553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 836.859375, "completions/mean_terminated_length": 785.1270141601562, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5226787328720093, "epoch": 0.4039408866995074, "frac_reward_zero_std": 0.0, "grad_norm": 0.03481414814636514, "kl": 0.027967958711087704, "learning_rate": 4.883419570130327e-05, "loss": 0.12579919397830963, "num_tokens": 25408860.0, "reward": 4.31640625, "reward_std": 1.6074246168136597, "rewards/reward_func/mean": 0.4796006944444444, "rewards/reward_func/std": 0.23362092218465275, "sampling/importance_sampling_ratio/max": 2.998420238494873, "sampling/importance_sampling_ratio/mean": 0.9653995037078857, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.86381721496582, "sampling/sampling_logp_difference/mean": 0.14888456463813782, "step": 164, "step_time": 130.35683792899363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3195.0, "completions/mean_length": 1075.9375, "completions/mean_terminated_length": 985.704833984375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5282468944787979, "epoch": 0.4064039408866995, "frac_reward_zero_std": 0.0, "grad_norm": 0.02783431665422358, "kl": 0.03446731064468622, "learning_rate": 4.881950959804874e-05, "loss": 0.045085370540618896, "num_tokens": 25560280.0, "reward": 4.265625, "reward_std": 1.6606289148330688, "rewards/reward_func/mean": 0.4739583333333333, "rewards/reward_func/std": 0.24556127190589905, "sampling/importance_sampling_ratio/max": 2.990581512451172, "sampling/importance_sampling_ratio/mean": 0.960712194442749, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.613689422607422, "sampling/sampling_logp_difference/mean": 0.15218126773834229, "step": 165, "step_time": 172.98013453022577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2561.0, "completions/max_terminated_length": 2561.0, "completions/mean_length": 729.71875, "completions/mean_terminated_length": 733.9193115234375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6080366969108582, "epoch": 0.4088669950738916, "frac_reward_zero_std": 0.0, "grad_norm": 0.03693327148531038, "kl": 0.07251364225521684, "learning_rate": 4.8804733805888024e-05, "loss": 0.1411583572626114, "num_tokens": 25696326.0, "reward": 4.703125, "reward_std": 1.3879262208938599, "rewards/reward_func/mean": 0.5225694444444444, "rewards/reward_func/std": 0.2221848898463779, "sampling/importance_sampling_ratio/max": 2.999439239501953, "sampling/importance_sampling_ratio/mean": 0.9615045785903931, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.027545928955078, "sampling/sampling_logp_difference/mean": 0.16043534874916077, "step": 166, "step_time": 84.72602451802231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3802.0, "completions/mean_length": 1253.0625, "completions/mean_terminated_length": 1168.0655517578125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5983615964651108, "epoch": 0.41133004926108374, "frac_reward_zero_std": 0.0, "grad_norm": 0.08869248734451317, "kl": 0.30646114982664585, "learning_rate": 4.8789868380457246e-05, "loss": -0.03257778659462929, "num_tokens": 25863642.0, "reward": 4.52734375, "reward_std": 1.5418224334716797, "rewards/reward_func/mean": 0.5030381944444444, "rewards/reward_func/std": 0.2189425097571479, "sampling/importance_sampling_ratio/max": 2.995548725128174, "sampling/importance_sampling_ratio/mean": 0.9559701085090637, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.81066608428955, "sampling/sampling_logp_difference/mean": 0.17433354258537292, "step": 167, "step_time": 190.16439045919105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3095.0, "completions/mean_length": 1056.328125, "completions/mean_terminated_length": 959.0, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "degenerate_groups_filtered": 0.0, "entropy": 0.608098179101944, "epoch": 0.41379310344827586, "frac_reward_zero_std": 0.25, "grad_norm": 0.024967044301457336, "kl": 0.04156289668753743, "learning_rate": 4.8774913377729994e-05, "loss": -0.038745272904634476, "num_tokens": 26011023.0, "reward": 4.55859375, "reward_std": 1.2946335077285767, "rewards/reward_func/mean": 0.5065104166666666, "rewards/reward_func/std": 0.18393640220165253, "sampling/importance_sampling_ratio/max": 2.99995493888855, "sampling/importance_sampling_ratio/mean": 0.9574207067489624, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.34084415435791, "sampling/sampling_logp_difference/mean": 0.1739969253540039, "step": 168, "step_time": 121.14545013522729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2646.0, "completions/mean_length": 839.109375, "completions/mean_terminated_length": 734.04833984375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "degenerate_groups_filtered": 0.0, "entropy": 0.624035969376564, "epoch": 0.41625615763546797, "frac_reward_zero_std": 0.0, "grad_norm": 0.02459309972915259, "kl": 0.03127794712781906, "learning_rate": 4.875986885401717e-05, "loss": 0.03152251988649368, "num_tokens": 26148006.0, "reward": 4.54296875, "reward_std": 1.3668079376220703, "rewards/reward_func/mean": 0.5047743055555556, "rewards/reward_func/std": 0.18499460816383362, "sampling/importance_sampling_ratio/max": 2.9995522499084473, "sampling/importance_sampling_ratio/mean": 0.9615331888198853, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.471660614013672, "sampling/sampling_logp_difference/mean": 0.16513219475746155, "step": 169, "step_time": 118.91501420899294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2252.0, "completions/mean_length": 809.875, "completions/mean_terminated_length": 746.01611328125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "degenerate_groups_filtered": 0.0, "entropy": 0.551299437880516, "epoch": 0.4187192118226601, "frac_reward_zero_std": 0.0, "grad_norm": 0.02714725701280757, "kl": 0.02795234275981784, "learning_rate": 4.874473486596672e-05, "loss": -0.06360671669244766, "num_tokens": 26282494.0, "reward": 4.421875, "reward_std": 1.571620225906372, "rewards/reward_func/mean": 0.4913194444444444, "rewards/reward_func/std": 0.22683406207296583, "sampling/importance_sampling_ratio/max": 2.9994564056396484, "sampling/importance_sampling_ratio/mean": 0.9681460857391357, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.212518692016602, "sampling/sampling_logp_difference/mean": 0.1451174020767212, "step": 170, "step_time": 140.71922606788576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3145.0, "completions/mean_length": 1584.28125, "completions/mean_terminated_length": 1173.272705078125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5730425864458084, "epoch": 0.4211822660098522, "frac_reward_zero_std": 0.0, "grad_norm": 0.01904952675086213, "kl": 0.014946466544643044, "learning_rate": 4.8729511470563514e-05, "loss": -0.17807739973068237, "num_tokens": 26479264.0, "reward": 4.13671875, "reward_std": 1.866012692451477, "rewards/reward_func/mean": 0.4596354166666667, "rewards/reward_func/std": 0.26322751575046116, "sampling/importance_sampling_ratio/max": 2.996227502822876, "sampling/importance_sampling_ratio/mean": 0.9491404294967651, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.776933670043945, "sampling/sampling_logp_difference/mean": 0.18628865480422974, "step": 171, "step_time": 139.96793680964038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3338.0, "completions/mean_length": 1098.578125, "completions/mean_terminated_length": 946.4166870117188, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6739596724510193, "epoch": 0.4236453201970443, "frac_reward_zero_std": 0.0, "grad_norm": 0.02536639850029695, "kl": 0.016601723851636052, "learning_rate": 4.871419872512901e-05, "loss": -0.15329566597938538, "num_tokens": 26640853.0, "reward": 4.15234375, "reward_std": 1.831501841545105, "rewards/reward_func/mean": 0.4613715277777778, "rewards/reward_func/std": 0.26819422592719394, "sampling/importance_sampling_ratio/max": 2.99464750289917, "sampling/importance_sampling_ratio/mean": 0.9525026082992554, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.213711738586426, "sampling/sampling_logp_difference/mean": 0.18900419771671295, "step": 172, "step_time": 119.70939294900745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3843.0, "completions/mean_length": 1112.21875, "completions/mean_terminated_length": 965.475341796875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6000253409147263, "epoch": 0.42610837438423643, "frac_reward_zero_std": 0.0, "grad_norm": 0.03322072539721348, "kl": 0.01858115242794156, "learning_rate": 4.869879668732115e-05, "loss": 0.0820431113243103, "num_tokens": 26793539.0, "reward": 3.4765625, "reward_std": 2.114630937576294, "rewards/reward_func/mean": 0.3862847222222222, "rewards/reward_func/std": 0.30766087025403976, "sampling/importance_sampling_ratio/max": 2.9941298961639404, "sampling/importance_sampling_ratio/mean": 0.9587726593017578, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.781671524047852, "sampling/sampling_logp_difference/mean": 0.17097865045070648, "step": 173, "step_time": 183.34437879105099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3845.0, "completions/mean_length": 1355.65625, "completions/mean_terminated_length": 1154.2542724609375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7394872605800629, "epoch": 0.42857142857142855, "frac_reward_zero_std": 0.0, "grad_norm": 0.03241121665359061, "kl": 0.013445974560454488, "learning_rate": 4.868330541513405e-05, "loss": -0.08849971741437912, "num_tokens": 26969853.0, "reward": 3.05859375, "reward_std": 2.153228521347046, "rewards/reward_func/mean": 0.33984375, "rewards/reward_func/std": 0.2989349315563838, "sampling/importance_sampling_ratio/max": 2.9982781410217285, "sampling/importance_sampling_ratio/mean": 0.944078803062439, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.34125518798828, "sampling/sampling_logp_difference/mean": 0.21590906381607056, "step": 174, "step_time": 141.4540407299064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 888.125, "completions/mean_terminated_length": 725.7833862304688, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6452028006315231, "epoch": 0.43103448275862066, "frac_reward_zero_std": 0.0, "grad_norm": 0.03481784350490518, "kl": 0.02153546130284667, "learning_rate": 4.866772496689787e-05, "loss": 0.023521684110164642, "num_tokens": 27108069.0, "reward": 3.59375, "reward_std": 2.1308858394622803, "rewards/reward_func/mean": 0.3993055555555556, "rewards/reward_func/std": 0.3248247545626428, "sampling/importance_sampling_ratio/max": 2.9982481002807617, "sampling/importance_sampling_ratio/mean": 0.9610703587532043, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.91135311126709, "sampling/sampling_logp_difference/mean": 0.16744840145111084, "step": 175, "step_time": 127.97521190205589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3963.0, "completions/max_terminated_length": 3963.0, "completions/mean_length": 789.71875, "completions/mean_terminated_length": 767.71435546875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6018838584423065, "epoch": 0.43349753694581283, "frac_reward_zero_std": 0.0, "grad_norm": 0.04159223111511331, "kl": 0.023647161200642586, "learning_rate": 4.865205540127851e-05, "loss": 0.39704209566116333, "num_tokens": 27246995.0, "reward": 3.58203125, "reward_std": 2.0373764038085938, "rewards/reward_func/mean": 0.3980034722222222, "rewards/reward_func/std": 0.2889099650912815, "sampling/importance_sampling_ratio/max": 2.999706268310547, "sampling/importance_sampling_ratio/mean": 0.9595375061035156, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.16771125793457, "sampling/sampling_logp_difference/mean": 0.16613054275512695, "step": 176, "step_time": 116.67240364779718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3365.0, "completions/mean_length": 1200.609375, "completions/mean_terminated_length": 941.2069091796875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6288936734199524, "epoch": 0.43596059113300495, "frac_reward_zero_std": 0.0, "grad_norm": 0.027437780371530707, "kl": 0.018118501640856266, "learning_rate": 4.863629677727745e-05, "loss": 0.067866250872612, "num_tokens": 27423434.0, "reward": 3.34375, "reward_std": 2.071873188018799, "rewards/reward_func/mean": 0.3715277777777778, "rewards/reward_func/std": 0.26894643902778625, "sampling/importance_sampling_ratio/max": 2.991316318511963, "sampling/importance_sampling_ratio/mean": 0.9541721940040588, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.680610656738281, "sampling/sampling_logp_difference/mean": 0.18493330478668213, "step": 177, "step_time": 151.51794349495322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3449.0, "completions/mean_length": 1000.40625, "completions/mean_terminated_length": 895.8359985351562, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6372148543596268, "epoch": 0.43842364532019706, "frac_reward_zero_std": 0.0, "grad_norm": 0.033978302809922124, "kl": 0.021280715242028236, "learning_rate": 4.862044915423149e-05, "loss": -0.1135963499546051, "num_tokens": 27579684.0, "reward": 3.44921875, "reward_std": 2.0669257640838623, "rewards/reward_func/mean": 0.3832465277777778, "rewards/reward_func/std": 0.30381787982251907, "sampling/importance_sampling_ratio/max": 2.997410535812378, "sampling/importance_sampling_ratio/mean": 0.9527778029441833, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.84807777404785, "sampling/sampling_logp_difference/mean": 0.1820211112499237, "step": 178, "step_time": 132.21914479322731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3490.0, "completions/mean_length": 940.390625, "completions/mean_terminated_length": 789.6500244140625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5805416107177734, "epoch": 0.4408866995073892, "frac_reward_zero_std": 0.0, "grad_norm": 0.03412320001462779, "kl": 0.0387720656581223, "learning_rate": 4.860451259181259e-05, "loss": 0.05906563252210617, "num_tokens": 27720781.0, "reward": 3.6875, "reward_std": 2.0004959106445312, "rewards/reward_func/mean": 0.4097222222222222, "rewards/reward_func/std": 0.2693231337600284, "sampling/importance_sampling_ratio/max": 2.9986228942871094, "sampling/importance_sampling_ratio/mean": 0.9692578315734863, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.18484878540039, "sampling/sampling_logp_difference/mean": 0.14448237419128418, "step": 179, "step_time": 122.44541043927893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3246.0, "completions/max_terminated_length": 3246.0, "completions/mean_length": 876.859375, "completions/mean_terminated_length": 866.3386840820312, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "degenerate_groups_filtered": 0.0, "entropy": 0.589184507727623, "epoch": 0.4433497536945813, "frac_reward_zero_std": 0.0, "grad_norm": 0.028710947398096327, "kl": 0.02539773052558303, "learning_rate": 4.8588487150027514e-05, "loss": 0.036127492785453796, "num_tokens": 27850516.0, "reward": 4.0859375, "reward_std": 1.7189528942108154, "rewards/reward_func/mean": 0.4539930555555556, "rewards/reward_func/std": 0.24298586448033652, "sampling/importance_sampling_ratio/max": 2.9983139038085938, "sampling/importance_sampling_ratio/mean": 0.9629830718040466, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.87133502960205, "sampling/sampling_logp_difference/mean": 0.15577855706214905, "step": 180, "step_time": 88.02008921210654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3620.0, "completions/mean_length": 812.703125, "completions/mean_terminated_length": 748.758056640625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5205734521150589, "epoch": 0.4458128078817734, "frac_reward_zero_std": 0.0, "grad_norm": 0.035317714533933245, "kl": 0.024989775381982327, "learning_rate": 4.8572372889217776e-05, "loss": 0.3163328468799591, "num_tokens": 27981937.0, "reward": 4.06640625, "reward_std": 1.743037223815918, "rewards/reward_func/mean": 0.4518229166666667, "rewards/reward_func/std": 0.24088652142220074, "sampling/importance_sampling_ratio/max": 2.9916434288024902, "sampling/importance_sampling_ratio/mean": 0.9683629274368286, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.304101943969727, "sampling/sampling_logp_difference/mean": 0.1433240920305252, "step": 181, "step_time": 122.87319412222132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3806.0, "completions/mean_length": 997.6875, "completions/mean_terminated_length": 949.8547973632812, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5740093439817429, "epoch": 0.4482758620689655, "frac_reward_zero_std": 0.0, "grad_norm": 0.031271393477359785, "kl": 0.026333958376199007, "learning_rate": 4.855616987005926e-05, "loss": -0.1590024083852768, "num_tokens": 28138861.0, "reward": 3.86328125, "reward_std": 1.9395270347595215, "rewards/reward_func/mean": 0.4292534722222222, "rewards/reward_func/std": 0.28895225500067073, "sampling/importance_sampling_ratio/max": 2.9844038486480713, "sampling/importance_sampling_ratio/mean": 0.9601902365684509, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.815610885620117, "sampling/sampling_logp_difference/mean": 0.15642720460891724, "step": 182, "step_time": 142.38116177916527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3687.0, "completions/mean_length": 1050.6875, "completions/mean_terminated_length": 952.4515991210938, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5912422388792038, "epoch": 0.45073891625615764, "frac_reward_zero_std": 0.0, "grad_norm": 0.029375040224375964, "kl": 0.03247595578432083, "learning_rate": 4.853987815356211e-05, "loss": -0.13517621159553528, "num_tokens": 28287689.0, "reward": 4.171875, "reward_std": 1.8229548931121826, "rewards/reward_func/mean": 0.4635416666666667, "rewards/reward_func/std": 0.2815826332403554, "sampling/importance_sampling_ratio/max": 2.9973490238189697, "sampling/importance_sampling_ratio/mean": 0.9587271213531494, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.327677726745605, "sampling/sampling_logp_difference/mean": 0.16505397856235504, "step": 183, "step_time": 118.48117843503132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2302.0, "completions/mean_length": 982.359375, "completions/mean_terminated_length": 829.2294921875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5499522387981415, "epoch": 0.45320197044334976, "frac_reward_zero_std": 0.0, "grad_norm": 0.02496128260966136, "kl": 0.0238928678445518, "learning_rate": 4.8523497801070394e-05, "loss": -0.1412869393825531, "num_tokens": 28429712.0, "reward": 4.453125, "reward_std": 1.779443383216858, "rewards/reward_func/mean": 0.4947916666666667, "rewards/reward_func/std": 0.27015094210704166, "sampling/importance_sampling_ratio/max": 2.9977781772613525, "sampling/importance_sampling_ratio/mean": 0.9683471918106079, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.436448097229004, "sampling/sampling_logp_difference/mean": 0.14362305402755737, "step": 184, "step_time": 124.91741205216385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4090.0, "completions/max_terminated_length": 2440.0, "completions/mean_length": 731.46875, "completions/mean_terminated_length": 688.51611328125, "completions/min_length": 36.0, "completions/min_terminated_length": 72.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6139041185379028, "epoch": 0.45566502463054187, "frac_reward_zero_std": 0.25, "grad_norm": 0.037142784390176466, "kl": 0.05982669489458203, "learning_rate": 4.8507028874261965e-05, "loss": -0.006140515208244324, "num_tokens": 28559326.0, "reward": 3.98828125, "reward_std": 1.8678725957870483, "rewards/reward_func/mean": 0.4431423611111111, "rewards/reward_func/std": 0.28126167671547997, "sampling/importance_sampling_ratio/max": 2.9906561374664307, "sampling/importance_sampling_ratio/mean": 0.9640798568725586, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.613927841186523, "sampling/sampling_logp_difference/mean": 0.1551136076450348, "step": 185, "step_time": 159.02927091997117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2238.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 772.203125, "completions/mean_terminated_length": 772.203125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5520921647548676, "epoch": 0.458128078817734, "frac_reward_zero_std": 0.25, "grad_norm": 0.061395920818485855, "kl": 0.05162614677101374, "learning_rate": 4.8490471435148174e-05, "loss": 0.12048451602458954, "num_tokens": 28685787.0, "reward": 4.6015625, "reward_std": 1.458907127380371, "rewards/reward_func/mean": 0.5112847222222222, "rewards/reward_func/std": 0.2330812480714586, "sampling/importance_sampling_ratio/max": 2.998060941696167, "sampling/importance_sampling_ratio/mean": 0.9678115844726562, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.809170722961426, "sampling/sampling_logp_difference/mean": 0.14706827700138092, "step": 186, "step_time": 78.58753942209296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2988.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 1044.640625, "completions/mean_terminated_length": 1046.761962890625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5624367594718933, "epoch": 0.4605911330049261, "frac_reward_zero_std": 0.0, "grad_norm": 0.02434360234786606, "kl": 0.02444734564051032, "learning_rate": 4.8473825546073656e-05, "loss": -0.01238684356212616, "num_tokens": 28844580.0, "reward": 4.50390625, "reward_std": 1.618035912513733, "rewards/reward_func/mean": 0.5004340277777778, "rewards/reward_func/std": 0.23665551717082658, "sampling/importance_sampling_ratio/max": 2.9936723709106445, "sampling/importance_sampling_ratio/mean": 0.9547263979911804, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.060944557189941, "sampling/sampling_logp_difference/mean": 0.1708623319864273, "step": 187, "step_time": 115.63352126302198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1682.0, "completions/mean_length": 1027.03125, "completions/mean_terminated_length": 808.5254516601562, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5580074787139893, "epoch": 0.4630541871921182, "frac_reward_zero_std": 0.0, "grad_norm": 0.02214669121529831, "kl": 0.04178555542603135, "learning_rate": 4.845709126971609e-05, "loss": -0.07377077639102936, "num_tokens": 28988630.0, "reward": 4.4765625, "reward_std": 1.6919033527374268, "rewards/reward_func/mean": 0.4973958333333333, "rewards/reward_func/std": 0.24915697342819637, "sampling/importance_sampling_ratio/max": 2.998155355453491, "sampling/importance_sampling_ratio/mean": 0.964207649230957, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.738546371459961, "sampling/sampling_logp_difference/mean": 0.14810925722122192, "step": 188, "step_time": 144.04732121806592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 834.3125, "completions/mean_terminated_length": 684.5423583984375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5862187743186951, "epoch": 0.46551724137931033, "frac_reward_zero_std": 0.0, "grad_norm": 0.02523673185589828, "kl": 0.04767545498907566, "learning_rate": 4.844026866908595e-05, "loss": -0.22603827714920044, "num_tokens": 29125434.0, "reward": 4.3984375, "reward_std": 1.546705961227417, "rewards/reward_func/mean": 0.4887152777777778, "rewards/reward_func/std": 0.2399756842189365, "sampling/importance_sampling_ratio/max": 2.9856364727020264, "sampling/importance_sampling_ratio/mean": 0.9674413204193115, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.371965408325195, "sampling/sampling_logp_difference/mean": 0.1515868902206421, "step": 189, "step_time": 124.82868728786707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3335.0, "completions/mean_length": 972.59375, "completions/mean_terminated_length": 920.9031982421875, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "degenerate_groups_filtered": 0.0, "entropy": 0.546548068523407, "epoch": 0.46798029556650245, "frac_reward_zero_std": 0.25, "grad_norm": 0.02276588191427084, "kl": 0.03256893623620272, "learning_rate": 4.8423357807526325e-05, "loss": 0.15206681191921234, "num_tokens": 29285008.0, "reward": 4.99609375, "reward_std": 1.2673338651657104, "rewards/reward_func/mean": 0.5551215277777778, "rewards/reward_func/std": 0.24073222113980186, "sampling/importance_sampling_ratio/max": 2.999685287475586, "sampling/importance_sampling_ratio/mean": 0.9606513977050781, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.87429428100586, "sampling/sampling_logp_difference/mean": 0.1558273881673813, "step": 190, "step_time": 143.13196605397388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3417.0, "completions/mean_length": 1032.015625, "completions/mean_terminated_length": 921.88134765625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5146888419985771, "epoch": 0.47044334975369456, "frac_reward_zero_std": 0.0, "grad_norm": 0.01795531406691693, "kl": 0.030596476048231125, "learning_rate": 4.840635874871259e-05, "loss": -0.15366441011428833, "num_tokens": 29433393.0, "reward": 4.66796875, "reward_std": 1.4584555625915527, "rewards/reward_func/mean": 0.5186631944444444, "rewards/reward_func/std": 0.2413034306632148, "sampling/importance_sampling_ratio/max": 2.996281623840332, "sampling/importance_sampling_ratio/mean": 0.9648961424827576, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.369453430175781, "sampling/sampling_logp_difference/mean": 0.14613790810108185, "step": 191, "step_time": 170.6787863143254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2995.0, "completions/mean_length": 990.515625, "completions/mean_terminated_length": 794.6551513671875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5170257911086082, "epoch": 0.4729064039408867, "frac_reward_zero_std": 0.0, "grad_norm": 0.020453909952142232, "kl": 0.04368177242577076, "learning_rate": 4.838927155665225e-05, "loss": 0.026476195082068443, "num_tokens": 29576562.0, "reward": 4.89453125, "reward_std": 1.1992099285125732, "rewards/reward_func/mean": 0.5438368055555556, "rewards/reward_func/std": 0.2044545453455713, "sampling/importance_sampling_ratio/max": 2.9978442192077637, "sampling/importance_sampling_ratio/mean": 0.9669831395149231, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.18189811706543, "sampling/sampling_logp_difference/mean": 0.13926595449447632, "step": 192, "step_time": 132.4229573700577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3588.0, "completions/mean_length": 1411.796875, "completions/mean_terminated_length": 1298.34423828125, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5414084792137146, "epoch": 0.4753694581280788, "frac_reward_zero_std": 0.0, "grad_norm": 0.015964047365254634, "kl": 0.026161770801991224, "learning_rate": 4.837209629568462e-05, "loss": -0.20564614236354828, "num_tokens": 29759525.0, "reward": 4.53125, "reward_std": 1.5297966003417969, "rewards/reward_func/mean": 0.5034722222222222, "rewards/reward_func/std": 0.24822904335127938, "sampling/importance_sampling_ratio/max": 2.9965434074401855, "sampling/importance_sampling_ratio/mean": 0.9571963548660278, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.28773021697998, "sampling/sampling_logp_difference/mean": 0.15651743113994598, "step": 193, "step_time": 139.51090059312992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3751.0, "completions/mean_length": 1201.515625, "completions/mean_terminated_length": 987.0, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5668457299470901, "epoch": 0.47783251231527096, "frac_reward_zero_std": 0.0, "grad_norm": 0.01222374150957719, "kl": 0.040560389403253794, "learning_rate": 4.8354833030480674e-05, "loss": -0.09171874076128006, "num_tokens": 29924774.0, "reward": 5.125, "reward_std": 1.3348206281661987, "rewards/reward_func/mean": 0.5694444444444444, "rewards/reward_func/std": 0.22527392663889462, "sampling/importance_sampling_ratio/max": 2.9956600666046143, "sampling/importance_sampling_ratio/mean": 0.9569047689437866, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.646681785583496, "sampling/sampling_logp_difference/mean": 0.16455239057540894, "step": 194, "step_time": 127.55075355409645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2719.0, "completions/mean_length": 1242.109375, "completions/mean_terminated_length": 1038.2373046875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5766540616750717, "epoch": 0.4802955665024631, "frac_reward_zero_std": 0.25, "grad_norm": 0.01770060557750442, "kl": 0.01768818451091647, "learning_rate": 4.833748182604273e-05, "loss": -0.06070397049188614, "num_tokens": 30095053.0, "reward": 4.796875, "reward_std": 1.3771628141403198, "rewards/reward_func/mean": 0.5329861111111112, "rewards/reward_func/std": 0.23674807945887247, "sampling/importance_sampling_ratio/max": 2.9945473670959473, "sampling/importance_sampling_ratio/mean": 0.9582319259643555, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.335448265075684, "sampling/sampling_logp_difference/mean": 0.1665378212928772, "step": 195, "step_time": 156.3429046079982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2747.0, "completions/mean_length": 804.734375, "completions/mean_terminated_length": 752.4921264648438, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5483453124761581, "epoch": 0.4827586206896552, "frac_reward_zero_std": 0.25, "grad_norm": 0.011853860468393498, "kl": 0.030422898940742016, "learning_rate": 4.832004274770422e-05, "loss": -0.036816734820604324, "num_tokens": 30231644.0, "reward": 5.1640625, "reward_std": 0.9545409083366394, "rewards/reward_func/mean": 0.5737847222222222, "rewards/reward_func/std": 0.18036837296353447, "sampling/importance_sampling_ratio/max": 2.997767925262451, "sampling/importance_sampling_ratio/mean": 0.9683334231376648, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.523677825927734, "sampling/sampling_logp_difference/mean": 0.14397111535072327, "step": 196, "step_time": 109.79577358718961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3785.0, "completions/mean_length": 1498.546875, "completions/mean_terminated_length": 1466.6719970703125, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6924940571188927, "epoch": 0.4852216748768473, "frac_reward_zero_std": 0.0, "grad_norm": 0.023579205094821604, "kl": 0.01952385390177369, "learning_rate": 4.8302515861129474e-05, "loss": 0.19029618799686432, "num_tokens": 30419551.0, "reward": 4.796875, "reward_std": 1.268415927886963, "rewards/reward_func/mean": 0.5329861111111112, "rewards/reward_func/std": 0.24172814769877327, "sampling/importance_sampling_ratio/max": 2.9983105659484863, "sampling/importance_sampling_ratio/mean": 0.945136308670044, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.623981475830078, "sampling/sampling_logp_difference/mean": 0.1995391845703125, "step": 197, "step_time": 176.65033843182027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3796.0, "completions/mean_length": 1169.40625, "completions/mean_terminated_length": 1025.475341796875, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "degenerate_groups_filtered": 1.0, "entropy": 0.553646519780159, "epoch": 0.4876847290640394, "frac_reward_zero_std": 0.25, "grad_norm": 0.04398149539440762, "kl": 0.02133885119110346, "learning_rate": 4.828490123231342e-05, "loss": 0.01763659343123436, "num_tokens": 30582153.0, "reward": 4.37890625, "reward_std": 1.8414281606674194, "rewards/reward_func/mean": 0.4865451388888889, "rewards/reward_func/std": 0.2796827761663331, "sampling/importance_sampling_ratio/max": 2.9970648288726807, "sampling/importance_sampling_ratio/mean": 0.9559179544448853, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.906939506530762, "sampling/sampling_logp_difference/mean": 0.16657188534736633, "step": 198, "step_time": 125.99079136014916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3215.0, "completions/mean_length": 952.34375, "completions/mean_terminated_length": 850.9354858398438, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5982878059148788, "epoch": 0.49014778325123154, "frac_reward_zero_std": 0.5, "grad_norm": 0.026449524534477093, "kl": 0.02422321867197752, "learning_rate": 4.8267198927581415e-05, "loss": 0.08025837689638138, "num_tokens": 30742703.0, "reward": 4.86328125, "reward_std": 1.1750799417495728, "rewards/reward_func/mean": 0.5403645833333334, "rewards/reward_func/std": 0.20424510911107063, "sampling/importance_sampling_ratio/max": 2.996950387954712, "sampling/importance_sampling_ratio/mean": 0.9554992914199829, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.107158660888672, "sampling/sampling_logp_difference/mean": 0.17947040498256683, "step": 199, "step_time": 128.3923730046954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4028.0, "completions/mean_length": 1333.984375, "completions/mean_terminated_length": 1193.666748046875, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5008647292852402, "epoch": 0.49261083743842365, "frac_reward_zero_std": 0.25, "grad_norm": 0.012771339469120326, "kl": 0.02338361693546176, "learning_rate": 4.824940901358889e-05, "loss": -0.011300859972834587, "num_tokens": 30914094.0, "reward": 4.98046875, "reward_std": 1.153243064880371, "rewards/reward_func/mean": 0.5533854166666666, "rewards/reward_func/std": 0.2083408029543029, "sampling/importance_sampling_ratio/max": 2.994527578353882, "sampling/importance_sampling_ratio/mean": 0.9607874155044556, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.63933277130127, "sampling/sampling_logp_difference/mean": 0.15060916543006897, "step": 200, "step_time": 131.234265395673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 1110.984375, "completions/mean_terminated_length": 941.4000244140625, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "degenerate_groups_filtered": 1.0, "entropy": 0.5410940274596214, "epoch": 0.49507389162561577, "frac_reward_zero_std": 0.5, "grad_norm": 0.018820871729217724, "kl": 0.07351712137460709, "learning_rate": 4.82315315573212e-05, "loss": -0.008761988021433353, "num_tokens": 31075405.0, "reward": 4.7265625, "reward_std": 1.286929726600647, "rewards/reward_func/mean": 0.5251736111111112, "rewards/reward_func/std": 0.18622204413016638, "sampling/importance_sampling_ratio/max": 2.9949045181274414, "sampling/importance_sampling_ratio/mean": 0.9584058523178101, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.770557403564453, "sampling/sampling_logp_difference/mean": 0.16537359356880188, "step": 201, "step_time": 137.49633058882318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3727.0, "completions/mean_length": 1543.71875, "completions/mean_terminated_length": 1430.36669921875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "degenerate_groups_filtered": 0.0, "entropy": 0.546101376414299, "epoch": 0.4975369458128079, "frac_reward_zero_std": 0.0, "grad_norm": 0.016073747890772528, "kl": 0.033409463707357645, "learning_rate": 4.8213566626093316e-05, "loss": 0.05686764791607857, "num_tokens": 31269403.0, "reward": 5.0, "reward_std": 1.2669485807418823, "rewards/reward_func/mean": 0.5555555555555556, "rewards/reward_func/std": 0.23293556190199322, "sampling/importance_sampling_ratio/max": 2.9992666244506836, "sampling/importance_sampling_ratio/mean": 0.9528164863586426, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.812483787536621, "sampling/sampling_logp_difference/mean": 0.16451823711395264, "step": 202, "step_time": 132.4189603566192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3834.0, "completions/mean_length": 1231.453125, "completions/mean_terminated_length": 1121.03271484375, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5824616849422455, "epoch": 0.5, "frac_reward_zero_std": 0.25, "grad_norm": 0.009700723668098444, "kl": 0.02169888187199831, "learning_rate": 4.819551428754957e-05, "loss": -0.026023028418421745, "num_tokens": 31437672.0, "reward": 5.09375, "reward_std": 0.857390820980072, "rewards/reward_func/mean": 0.5659722222222222, "rewards/reward_func/std": 0.16799302399158478, "sampling/importance_sampling_ratio/max": 2.9991044998168945, "sampling/importance_sampling_ratio/mean": 0.9563360810279846, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.749972343444824, "sampling/sampling_logp_difference/mean": 0.1682780683040619, "step": 203, "step_time": 135.12933608028106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2889.0, "completions/mean_length": 1295.25, "completions/mean_terminated_length": 1005.5172119140625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5633620470762253, "epoch": 0.5024630541871922, "frac_reward_zero_std": 0.0, "grad_norm": 0.016436761796537254, "kl": 0.059325653593987226, "learning_rate": 4.8177374609663415e-05, "loss": -0.09959787130355835, "num_tokens": 31612008.0, "reward": 4.875, "reward_std": 1.3901581764221191, "rewards/reward_func/mean": 0.5416666666666666, "rewards/reward_func/std": 0.23205215194159085, "sampling/importance_sampling_ratio/max": 2.9937283992767334, "sampling/importance_sampling_ratio/mean": 0.9539804458618164, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.698464393615723, "sampling/sampling_logp_difference/mean": 0.17365781962871552, "step": 204, "step_time": 123.92455417569727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3117.0, "completions/mean_length": 1157.078125, "completions/mean_terminated_length": 1067.3834228515625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "degenerate_groups_filtered": 1.0, "entropy": 0.5557613000273705, "epoch": 0.5049261083743842, "frac_reward_zero_std": 0.5, "grad_norm": 0.014862369706502163, "kl": 0.05807019583880901, "learning_rate": 4.815914766073719e-05, "loss": -0.008188445121049881, "num_tokens": 31765821.0, "reward": 5.12109375, "reward_std": 0.9008287787437439, "rewards/reward_func/mean": 0.5690104166666666, "rewards/reward_func/std": 0.16866978506247202, "sampling/importance_sampling_ratio/max": 2.9971923828125, "sampling/importance_sampling_ratio/mean": 0.9620099067687988, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.711771011352539, "sampling/sampling_logp_difference/mean": 0.15512652695178986, "step": 205, "step_time": 145.73420074605383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2752.0, "completions/mean_length": 1087.1875, "completions/mean_terminated_length": 939.2130737304688, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5469877645373344, "epoch": 0.5073891625615764, "frac_reward_zero_std": 0.25, "grad_norm": 0.024437213855539253, "kl": 0.03669538162648678, "learning_rate": 4.8140833509401815e-05, "loss": -0.08707018941640854, "num_tokens": 31912473.0, "reward": 4.5234375, "reward_std": 1.881783366203308, "rewards/reward_func/mean": 0.5026041666666666, "rewards/reward_func/std": 0.2910439347227414, "sampling/importance_sampling_ratio/max": 2.984907388687134, "sampling/importance_sampling_ratio/mean": 0.9634929895401001, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.110798835754395, "sampling/sampling_logp_difference/mean": 0.14892369508743286, "step": 206, "step_time": 118.83005754603073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2835.0, "completions/mean_length": 1048.890625, "completions/mean_terminated_length": 978.3770141601562, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "degenerate_groups_filtered": 0.0, "entropy": 0.47129160165786743, "epoch": 0.5098522167487685, "frac_reward_zero_std": 0.0, "grad_norm": 0.01883575281540592, "kl": 0.02437049988657236, "learning_rate": 4.812243222461658e-05, "loss": 0.0016541481018066406, "num_tokens": 32064114.0, "reward": 4.9140625, "reward_std": 1.2513633966445923, "rewards/reward_func/mean": 0.5460069444444444, "rewards/reward_func/std": 0.2148944992158148, "sampling/importance_sampling_ratio/max": 2.994464159011841, "sampling/importance_sampling_ratio/mean": 0.9673416018486023, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.47595500946045, "sampling/sampling_logp_difference/mean": 0.1353481113910675, "step": 207, "step_time": 157.52031526481733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2719.0, "completions/mean_length": 1064.859375, "completions/mean_terminated_length": 788.7413940429688, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5812835246324539, "epoch": 0.5123152709359606, "frac_reward_zero_std": 0.0, "grad_norm": 0.02172590175321915, "kl": 0.04220228176563978, "learning_rate": 4.8103943875668844e-05, "loss": -0.17685756087303162, "num_tokens": 32223801.0, "reward": 4.3125, "reward_std": 1.8055250644683838, "rewards/reward_func/mean": 0.4791666666666667, "rewards/reward_func/std": 0.27989307790994644, "sampling/importance_sampling_ratio/max": 2.9918086528778076, "sampling/importance_sampling_ratio/mean": 0.9594242572784424, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.874069213867188, "sampling/sampling_logp_difference/mean": 0.16716912388801575, "step": 208, "step_time": 135.27337133488618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 945.953125, "completions/mean_terminated_length": 853.8851928710938, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "degenerate_groups_filtered": 1.0, "entropy": 0.568095900118351, "epoch": 0.5147783251231527, "frac_reward_zero_std": 0.25, "grad_norm": 0.02069842032168839, "kl": 0.04567871009930968, "learning_rate": 4.8085368532173804e-05, "loss": -0.054947808384895325, "num_tokens": 32373222.0, "reward": 4.88671875, "reward_std": 1.334768295288086, "rewards/reward_func/mean": 0.54296875, "rewards/reward_func/std": 0.22824304468101925, "sampling/importance_sampling_ratio/max": 2.996091604232788, "sampling/importance_sampling_ratio/mean": 0.9642556309700012, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.462482452392578, "sampling/sampling_logp_difference/mean": 0.15888792276382446, "step": 209, "step_time": 151.47347256494686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 1201.515625, "completions/mean_terminated_length": 1059.163818359375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "degenerate_groups_filtered": 0.0, "entropy": 0.559689849615097, "epoch": 0.5172413793103449, "frac_reward_zero_std": 0.0, "grad_norm": 0.015491091292339451, "kl": 0.03229631157591939, "learning_rate": 4.806670626407422e-05, "loss": -0.10237803310155869, "num_tokens": 32539367.0, "reward": 4.7734375, "reward_std": 1.3770502805709839, "rewards/reward_func/mean": 0.5303819444444444, "rewards/reward_func/std": 0.21820614321364296, "sampling/importance_sampling_ratio/max": 2.9985454082489014, "sampling/importance_sampling_ratio/mean": 0.9577094912528992, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.66981315612793, "sampling/sampling_logp_difference/mean": 0.1669681966304779, "step": 210, "step_time": 128.17530722473748 } ], "logging_steps": 1, "max_steps": 1624, "num_input_tokens_seen": 32539367, "num_train_epochs": 4, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }