{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7389162561576355, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3939.0, "completions/mean_length": 1390.40625, "completions/mean_terminated_length": 1227.3389892578125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6809434145689011, "epoch": 0.0024630541871921183, "frac_reward_zero_std": 0.0, "grad_norm": 0.007472159201174485, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0422082245349884, "num_tokens": 179514.0, "reward": 0.70703125, "reward_std": 0.8528136014938354, "rewards/reward_func/mean": 0.07855902777777778, "rewards/reward_func/std": 0.1362110757165485, "sampling/importance_sampling_ratio/max": 2.996610164642334, "sampling/importance_sampling_ratio/mean": 0.9419828653335571, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.850329399108887, "sampling/sampling_logp_difference/mean": 0.21576407551765442, "step": 1, "step_time": 220.12700563087128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3555.0, "completions/mean_length": 1254.984375, "completions/mean_terminated_length": 935.3928833007812, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7852305322885513, "epoch": 0.0049261083743842365, "frac_reward_zero_std": 0.0, "grad_norm": 0.006449001270019462, "kl": 0.0, "learning_rate": 1e-05, "loss": 0.031678903847932816, "num_tokens": 345929.0, "reward": 0.8359375, "reward_std": 0.9461898803710938, "rewards/reward_func/mean": 0.09288194444444445, "rewards/reward_func/std": 0.14738090998596615, "sampling/importance_sampling_ratio/max": 2.998335361480713, "sampling/importance_sampling_ratio/mean": 0.9392645359039307, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.990495681762695, "sampling/sampling_logp_difference/mean": 0.23307372629642487, "step": 2, "step_time": 133.00610918574966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2874.0, "completions/mean_length": 911.671875, "completions/mean_terminated_length": 755.0655517578125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7302966266870499, "epoch": 0.007389162561576354, "frac_reward_zero_std": 0.0, "grad_norm": 0.00749225424952202, "kl": 0.00024622307682875544, "learning_rate": 2e-05, "loss": -0.02362673729658127, "num_tokens": 491044.0, "reward": 0.76953125, "reward_std": 0.6112449765205383, "rewards/reward_func/mean": 0.08550347222222222, "rewards/reward_func/std": 0.08551493618223402, "sampling/importance_sampling_ratio/max": 2.999403715133667, "sampling/importance_sampling_ratio/mean": 0.9553625583648682, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.173587799072266, "sampling/sampling_logp_difference/mean": 0.1916211098432541, "step": 3, "step_time": 123.70071696513332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2956.0, "completions/max_terminated_length": 2956.0, "completions/mean_length": 773.96875, "completions/mean_terminated_length": 757.1128540039062, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6060236841440201, "epoch": 0.009852216748768473, "frac_reward_zero_std": 0.0, "grad_norm": 0.0067466090505615695, "kl": 0.00021368478701333515, "learning_rate": 3e-05, "loss": -0.048330288380384445, "num_tokens": 621154.0, "reward": 0.80859375, "reward_std": 0.5629820227622986, "rewards/reward_func/mean": 0.08984375, "rewards/reward_func/std": 0.08135415448082818, "sampling/importance_sampling_ratio/max": 2.9971704483032227, "sampling/importance_sampling_ratio/mean": 0.962253212928772, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.119998931884766, "sampling/sampling_logp_difference/mean": 0.1678471714258194, "step": 4, "step_time": 98.42003497900441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2783.0, "completions/mean_length": 746.71875, "completions/mean_terminated_length": 693.5556030273438, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6175970137119293, "epoch": 0.012315270935960592, "frac_reward_zero_std": 0.0, "grad_norm": 0.007010015487624556, "kl": 0.00022748785340809263, "learning_rate": 4e-05, "loss": 0.026304475963115692, "num_tokens": 755728.0, "reward": 0.66796875, "reward_std": 0.5176540017127991, "rewards/reward_func/mean": 0.07421875, "rewards/reward_func/std": 0.07090304957495795, "sampling/importance_sampling_ratio/max": 2.999366283416748, "sampling/importance_sampling_ratio/mean": 0.9587454795837402, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.925031661987305, "sampling/sampling_logp_difference/mean": 0.17798733711242676, "step": 5, "step_time": 132.88659988692962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 1258.03125, "completions/mean_terminated_length": 1163.559326171875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6823784708976746, "epoch": 0.014778325123152709, "frac_reward_zero_std": 0.0, "grad_norm": 0.01697977367488114, "kl": 0.000256427658314351, "learning_rate": 5e-05, "loss": 0.02028002217411995, "num_tokens": 927458.0, "reward": 0.77734375, "reward_std": 0.5310165286064148, "rewards/reward_func/mean": 0.08637152777777778, "rewards/reward_func/std": 0.10350671741697523, "sampling/importance_sampling_ratio/max": 2.998941659927368, "sampling/importance_sampling_ratio/mean": 0.9461972713470459, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.44400405883789, "sampling/sampling_logp_difference/mean": 0.20700082182884216, "step": 6, "step_time": 133.52735476591624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 789.28125, "completions/mean_terminated_length": 718.7704467773438, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7047427892684937, "epoch": 0.017241379310344827, "frac_reward_zero_std": 0.0, "grad_norm": 0.006955772610229579, "kl": 0.0003960179747082293, "learning_rate": 4.999995293306428e-05, "loss": -0.061842143535614014, "num_tokens": 1060052.0, "reward": 0.80078125, "reward_std": 0.4983697235584259, "rewards/reward_func/mean": 0.08897569444444445, "rewards/reward_func/std": 0.07075256274806128, "sampling/importance_sampling_ratio/max": 2.997187614440918, "sampling/importance_sampling_ratio/mean": 0.9554922580718994, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.752728462219238, "sampling/sampling_logp_difference/mean": 0.19021925330162048, "step": 7, "step_time": 110.41399249015376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2416.0, "completions/mean_length": 679.640625, "completions/mean_terminated_length": 624.6129150390625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7376732230186462, "epoch": 0.019704433497536946, "frac_reward_zero_std": 0.0, "grad_norm": 0.008032833725557987, "kl": 0.000727692706277594, "learning_rate": 4.999981173243434e-05, "loss": -0.0068512773141264915, "num_tokens": 1175117.0, "reward": 0.859375, "reward_std": 0.5056722164154053, "rewards/reward_func/mean": 0.0954861111111111, "rewards/reward_func/std": 0.07390516665246752, "sampling/importance_sampling_ratio/max": 2.987856864929199, "sampling/importance_sampling_ratio/mean": 0.9520862698554993, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.813932418823242, "sampling/sampling_logp_difference/mean": 0.19661623239517212, "step": 8, "step_time": 119.54593483870849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2963.0, "completions/mean_length": 690.1875, "completions/mean_terminated_length": 643.9343872070312, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6059777140617371, "epoch": 0.022167487684729065, "frac_reward_zero_std": 0.0, "grad_norm": 0.005822944820783743, "kl": 0.0007264916785061359, "learning_rate": 4.999957639864185e-05, "loss": -0.039480455219745636, "num_tokens": 1301465.0, "reward": 0.84765625, "reward_std": 0.46636295318603516, "rewards/reward_func/mean": 0.09418402777777778, "rewards/reward_func/std": 0.06715176006158192, "sampling/importance_sampling_ratio/max": 2.9996423721313477, "sampling/importance_sampling_ratio/mean": 0.9616194367408752, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.928505897521973, "sampling/sampling_logp_difference/mean": 0.16725227236747742, "step": 9, "step_time": 122.72574849962257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3997.0, "completions/mean_length": 972.6875, "completions/mean_terminated_length": 886.0491333007812, "completions/min_length": 11.0, "completions/min_terminated_length": 165.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7491600215435028, "epoch": 0.024630541871921183, "frac_reward_zero_std": 0.0, "grad_norm": 0.00584538129342024, "kl": 0.0013387255748966709, "learning_rate": 4.999924693257293e-05, "loss": -0.01264580525457859, "num_tokens": 1449989.0, "reward": 0.84375, "reward_std": 0.39213496446609497, "rewards/reward_func/mean": 0.09375, "rewards/reward_func/std": 0.05436270104514228, "sampling/importance_sampling_ratio/max": 2.997166633605957, "sampling/importance_sampling_ratio/mean": 0.9485599994659424, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.827897071838379, "sampling/sampling_logp_difference/mean": 0.20776304602622986, "step": 10, "step_time": 132.49725737981498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3585.0, "completions/mean_length": 918.796875, "completions/mean_terminated_length": 816.3064575195312, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6651621907949448, "epoch": 0.027093596059113302, "frac_reward_zero_std": 0.0, "grad_norm": 0.00662799178330484, "kl": 0.001221647864440456, "learning_rate": 4.9998823335468127e-05, "loss": 0.041060641407966614, "num_tokens": 1588824.0, "reward": 0.98046875, "reward_std": 0.5065144896507263, "rewards/reward_func/mean": 0.10894097222222222, "rewards/reward_func/std": 0.104803666472435, "sampling/importance_sampling_ratio/max": 2.9956586360931396, "sampling/importance_sampling_ratio/mean": 0.9583698511123657, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.478806495666504, "sampling/sampling_logp_difference/mean": 0.18324632942676544, "step": 11, "step_time": 120.71313043916598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2418.0, "completions/mean_length": 618.9375, "completions/mean_terminated_length": 569.9354858398438, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6763262897729874, "epoch": 0.029556650246305417, "frac_reward_zero_std": 0.0, "grad_norm": 0.002954945550195093, "kl": 0.0005683798735844903, "learning_rate": 4.9998305608922444e-05, "loss": -0.0004440499469637871, "num_tokens": 1702980.0, "reward": 0.9609375, "reward_std": 0.24077729880809784, "rewards/reward_func/mean": 0.10677083333333333, "rewards/reward_func/std": 0.03628063201904297, "sampling/importance_sampling_ratio/max": 2.9984214305877686, "sampling/importance_sampling_ratio/mean": 0.9655221700668335, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.970296859741211, "sampling/sampling_logp_difference/mean": 0.17147639393806458, "step": 12, "step_time": 118.71694010868669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2695.0, "completions/mean_length": 828.515625, "completions/mean_terminated_length": 776.6508178710938, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7719429731369019, "epoch": 0.03201970443349754, "frac_reward_zero_std": 0.0, "grad_norm": 0.006594125607498368, "kl": 0.000925697706406936, "learning_rate": 4.99976937548853e-05, "loss": -0.05999775603413582, "num_tokens": 1857253.0, "reward": 1.05078125, "reward_std": 0.6801897287368774, "rewards/reward_func/mean": 0.11675347222222222, "rewards/reward_func/std": 0.13205527265866598, "sampling/importance_sampling_ratio/max": 2.999297618865967, "sampling/importance_sampling_ratio/mean": 0.9461138844490051, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.873981475830078, "sampling/sampling_logp_difference/mean": 0.2143385112285614, "step": 13, "step_time": 130.19757038285024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3709.0, "completions/mean_length": 1144.25, "completions/mean_terminated_length": 967.0516967773438, "completions/min_length": 95.0, "completions/min_terminated_length": 99.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7670099884271622, "epoch": 0.034482758620689655, "frac_reward_zero_std": 0.0, "grad_norm": 0.004157733533644429, "kl": 0.0005810616421513259, "learning_rate": 4.999698777566055e-05, "loss": -0.06252727657556534, "num_tokens": 2026677.0, "reward": 0.9921875, "reward_std": 0.5598482489585876, "rewards/reward_func/mean": 0.11024305555555555, "rewards/reward_func/std": 0.08032437165578206, "sampling/importance_sampling_ratio/max": 2.9975733757019043, "sampling/importance_sampling_ratio/mean": 0.9378005266189575, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.929158210754395, "sampling/sampling_logp_difference/mean": 0.23680457472801208, "step": 14, "step_time": 131.84048197907396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3624.0, "completions/mean_length": 841.796875, "completions/mean_terminated_length": 712.3770141601562, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7286232858896255, "epoch": 0.03694581280788178, "frac_reward_zero_std": 0.0, "grad_norm": 0.005158231023463571, "kl": 0.0006725726707372814, "learning_rate": 4.9996187673906445e-05, "loss": -0.002383149228990078, "num_tokens": 2160216.0, "reward": 0.875, "reward_std": 0.35073620080947876, "rewards/reward_func/mean": 0.09722222222222222, "rewards/reward_func/std": 0.05088456802897983, "sampling/importance_sampling_ratio/max": 2.9999756813049316, "sampling/importance_sampling_ratio/mean": 0.9528787732124329, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.482815742492676, "sampling/sampling_logp_difference/mean": 0.19452627003192902, "step": 15, "step_time": 120.6714372949209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3658.0, "completions/mean_length": 1049.03125, "completions/mean_terminated_length": 949.1146850585938, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7303152084350586, "epoch": 0.03940886699507389, "frac_reward_zero_std": 0.25, "grad_norm": 0.004095605963986492, "kl": 0.0006583686117664911, "learning_rate": 4.9995293452635664e-05, "loss": 0.0015001269057393074, "num_tokens": 2313242.0, "reward": 0.96875, "reward_std": 0.4045867919921875, "rewards/reward_func/mean": 0.1076388888888889, "rewards/reward_func/std": 0.0616969366868337, "sampling/importance_sampling_ratio/max": 2.998537540435791, "sampling/importance_sampling_ratio/mean": 0.9495047330856323, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.123780250549316, "sampling/sampling_logp_difference/mean": 0.20469555258750916, "step": 16, "step_time": 130.63938916497864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3645.0, "completions/mean_length": 1294.21875, "completions/mean_terminated_length": 1156.4261474609375, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6684367954730988, "epoch": 0.04187192118226601, "frac_reward_zero_std": 0.0, "grad_norm": 0.004925385999842817, "kl": 0.0006526907527586445, "learning_rate": 4.999430511521525e-05, "loss": -0.010747631080448627, "num_tokens": 2496024.0, "reward": 0.87890625, "reward_std": 0.5598897933959961, "rewards/reward_func/mean": 0.09765625, "rewards/reward_func/std": 0.08093192842271593, "sampling/importance_sampling_ratio/max": 2.999262571334839, "sampling/importance_sampling_ratio/mean": 0.9423873424530029, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.291959762573242, "sampling/sampling_logp_difference/mean": 0.213131383061409, "step": 17, "step_time": 136.92378660477698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3977.0, "completions/mean_length": 1045.015625, "completions/mean_terminated_length": 996.5873413085938, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6524666249752045, "epoch": 0.04433497536945813, "frac_reward_zero_std": 0.25, "grad_norm": 0.0021259296347707433, "kl": 0.000516067972057499, "learning_rate": 4.999322266536666e-05, "loss": -0.004655790515244007, "num_tokens": 2648697.0, "reward": 1.0078125, "reward_std": 0.3180070221424103, "rewards/reward_func/mean": 0.11197916666666667, "rewards/reward_func/std": 0.04779417647255792, "sampling/importance_sampling_ratio/max": 2.99773907661438, "sampling/importance_sampling_ratio/mean": 0.95334792137146, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.296531677246094, "sampling/sampling_logp_difference/mean": 0.1851186603307724, "step": 18, "step_time": 117.7728746230714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3369.0, "completions/mean_length": 965.890625, "completions/mean_terminated_length": 916.2064208984375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7430569380521774, "epoch": 0.046798029556650245, "frac_reward_zero_std": 0.0, "grad_norm": 0.006782355663411477, "kl": 0.0009014422394102439, "learning_rate": 4.9992046107165705e-05, "loss": -0.001231623813509941, "num_tokens": 2804386.0, "reward": 0.8515625, "reward_std": 0.4403253495693207, "rewards/reward_func/mean": 0.09461805555555555, "rewards/reward_func/std": 0.06371734705236223, "sampling/importance_sampling_ratio/max": 2.9986655712127686, "sampling/importance_sampling_ratio/mean": 0.9503310322761536, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.261140823364258, "sampling/sampling_logp_difference/mean": 0.2094900906085968, "step": 19, "step_time": 115.97705295099877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3855.0, "completions/mean_length": 1512.296875, "completions/mean_terminated_length": 1245.0172119140625, "completions/min_length": 28.0, "completions/min_terminated_length": 28.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6677703708410263, "epoch": 0.04926108374384237, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038694768468681665, "kl": 0.000615698067122139, "learning_rate": 4.999077544504252e-05, "loss": -0.013612421229481697, "num_tokens": 2996821.0, "reward": 0.9296875, "reward_std": 0.5184469223022461, "rewards/reward_func/mean": 0.1032986111111111, "rewards/reward_func/std": 0.07620417740609911, "sampling/importance_sampling_ratio/max": 2.9964072704315186, "sampling/importance_sampling_ratio/mean": 0.9467508792877197, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.328170776367188, "sampling/sampling_logp_difference/mean": 0.20124885439872742, "step": 20, "step_time": 151.16719577508047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3606.0, "completions/mean_length": 1017.4375, "completions/mean_terminated_length": 971.5967407226562, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6549539119005203, "epoch": 0.05172413793103448, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036090876211084095, "kl": 0.0005021466495236382, "learning_rate": 4.998941068378163e-05, "loss": 0.02314213290810585, "num_tokens": 3159153.0, "reward": 1.03125, "reward_std": 0.42140164971351624, "rewards/reward_func/mean": 0.11458333333333333, "rewards/reward_func/std": 0.062453763352500066, "sampling/importance_sampling_ratio/max": 2.996140956878662, "sampling/importance_sampling_ratio/mean": 0.9525138735771179, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.250596046447754, "sampling/sampling_logp_difference/mean": 0.1887052059173584, "step": 21, "step_time": 116.94446952594444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3670.0, "completions/mean_length": 1073.796875, "completions/mean_terminated_length": 965.7500610351562, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7046701908111572, "epoch": 0.054187192118226604, "frac_reward_zero_std": 0.25, "grad_norm": 0.004920402694549647, "kl": 0.0006120866673882119, "learning_rate": 4.998795182852183e-05, "loss": 0.010382827371358871, "num_tokens": 3320836.0, "reward": 0.921875, "reward_std": 0.43387100100517273, "rewards/reward_func/mean": 0.10243055555555555, "rewards/reward_func/std": 0.06410401562849681, "sampling/importance_sampling_ratio/max": 2.998100757598877, "sampling/importance_sampling_ratio/mean": 0.9491228461265564, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.469171524047852, "sampling/sampling_logp_difference/mean": 0.20143745839595795, "step": 22, "step_time": 140.71716447197832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1897.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 528.6875, "completions/mean_terminated_length": 523.793701171875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6452099084854126, "epoch": 0.05665024630541872, "frac_reward_zero_std": 0.0, "grad_norm": 0.006521818812670888, "kl": 0.0008621769229648635, "learning_rate": 4.998639888475621e-05, "loss": 0.020057888701558113, "num_tokens": 3439920.0, "reward": 0.92578125, "reward_std": 0.3951081931591034, "rewards/reward_func/mean": 0.10286458333333333, "rewards/reward_func/std": 0.05866531365447574, "sampling/importance_sampling_ratio/max": 2.9968478679656982, "sampling/importance_sampling_ratio/mean": 0.9646427631378174, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.593832969665527, "sampling/sampling_logp_difference/mean": 0.16749747097492218, "step": 23, "step_time": 73.50592263997532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 819.859375, "completions/mean_terminated_length": 773.3547973632812, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6914758086204529, "epoch": 0.059113300492610835, "frac_reward_zero_std": 0.0, "grad_norm": 0.006837082241056972, "kl": 0.0008855514170136303, "learning_rate": 4.998475185833219e-05, "loss": -0.030980011448264122, "num_tokens": 3573143.0, "reward": 0.828125, "reward_std": 0.4626420736312866, "rewards/reward_func/mean": 0.0920138888888889, "rewards/reward_func/std": 0.06752209034230974, "sampling/importance_sampling_ratio/max": 2.999678373336792, "sampling/importance_sampling_ratio/mean": 0.9538560509681702, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.174277305603027, "sampling/sampling_logp_difference/mean": 0.18549545109272003, "step": 24, "step_time": 149.25408225506544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2177.0, "completions/max_terminated_length": 2177.0, "completions/mean_length": 1004.90625, "completions/mean_terminated_length": 1011.9683227539062, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6701414585113525, "epoch": 0.06157635467980296, "frac_reward_zero_std": 0.0, "grad_norm": 0.004917015329615652, "kl": 0.0007427596428897232, "learning_rate": 4.9983010755451386e-05, "loss": -0.005887920036911964, "num_tokens": 3734833.0, "reward": 0.89453125, "reward_std": 0.4197615683078766, "rewards/reward_func/mean": 0.0993923611111111, "rewards/reward_func/std": 0.06156396369139353, "sampling/importance_sampling_ratio/max": 2.9989845752716064, "sampling/importance_sampling_ratio/mean": 0.9514051079750061, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.371703147888184, "sampling/sampling_logp_difference/mean": 0.19796010851860046, "step": 25, "step_time": 77.61233417131007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3464.0, "completions/max_terminated_length": 3464.0, "completions/mean_length": 902.65625, "completions/mean_terminated_length": 916.6349487304688, "completions/min_length": 22.0, "completions/min_terminated_length": 122.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6815015971660614, "epoch": 0.06403940886699508, "frac_reward_zero_std": 0.0, "grad_norm": 0.0068310602863901095, "kl": 0.0006349557806970552, "learning_rate": 4.998117558266968e-05, "loss": -0.008174655959010124, "num_tokens": 3880139.0, "reward": 1.06640625, "reward_std": 0.7769525051116943, "rewards/reward_func/mean": 0.11848958333333333, "rewards/reward_func/std": 0.12652034560839334, "sampling/importance_sampling_ratio/max": 2.991055488586426, "sampling/importance_sampling_ratio/mean": 0.9585548639297485, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.999675750732422, "sampling/sampling_logp_difference/mean": 0.18726293742656708, "step": 26, "step_time": 121.91509590903297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3379.0, "completions/mean_length": 990.359375, "completions/mean_terminated_length": 944.274169921875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7005010098218918, "epoch": 0.0665024630541872, "frac_reward_zero_std": 0.0, "grad_norm": 0.004302942766743173, "kl": 0.0006778043898520991, "learning_rate": 4.9979246346897136e-05, "loss": -0.004731738939881325, "num_tokens": 4023810.0, "reward": 0.98828125, "reward_std": 0.4530095160007477, "rewards/reward_func/mean": 0.10980902777777778, "rewards/reward_func/std": 0.06777984897295634, "sampling/importance_sampling_ratio/max": 2.9963815212249756, "sampling/importance_sampling_ratio/mean": 0.948081374168396, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.98293685913086, "sampling/sampling_logp_difference/mean": 0.20313100516796112, "step": 27, "step_time": 137.8828469752334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3554.0, "completions/max_terminated_length": 3554.0, "completions/mean_length": 938.46875, "completions/mean_terminated_length": 933.1638793945312, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "degenerate_groups_filtered": 0.0, "entropy": 0.670497789978981, "epoch": 0.06896551724137931, "frac_reward_zero_std": 0.0, "grad_norm": 0.0027728934206346844, "kl": 0.000708279010723345, "learning_rate": 4.997722305539802e-05, "loss": -0.013293277472257614, "num_tokens": 4170288.0, "reward": 0.98046875, "reward_std": 0.32521265745162964, "rewards/reward_func/mean": 0.10894097222222222, "rewards/reward_func/std": 0.05035136308934954, "sampling/importance_sampling_ratio/max": 2.9964194297790527, "sampling/importance_sampling_ratio/mean": 0.9554746150970459, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.594246864318848, "sampling/sampling_logp_difference/mean": 0.18609148263931274, "step": 28, "step_time": 104.988751814235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3978.0, "completions/max_terminated_length": 3978.0, "completions/mean_length": 1090.75, "completions/mean_terminated_length": 1090.75, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6079529374837875, "epoch": 0.07142857142857142, "frac_reward_zero_std": 0.25, "grad_norm": 0.004202195838337962, "kl": 0.0006077909274608828, "learning_rate": 4.997510571579074e-05, "loss": 0.002752694534137845, "num_tokens": 4323968.0, "reward": 1.01953125, "reward_std": 0.40164515376091003, "rewards/reward_func/mean": 0.11328125, "rewards/reward_func/std": 0.060056461228264704, "sampling/importance_sampling_ratio/max": 2.993762254714966, "sampling/importance_sampling_ratio/mean": 0.9517241716384888, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.50137996673584, "sampling/sampling_logp_difference/mean": 0.18623289465904236, "step": 29, "step_time": 126.06809087703004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 1285.796875, "completions/mean_terminated_length": 1147.590087890625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5812672078609467, "epoch": 0.07389162561576355, "frac_reward_zero_std": 0.0, "grad_norm": 0.005014011974796489, "kl": 0.0006074062112020329, "learning_rate": 4.997289433604783e-05, "loss": -0.014224007725715637, "num_tokens": 4499315.0, "reward": 0.9609375, "reward_std": 0.3914227783679962, "rewards/reward_func/mean": 0.10677083333333333, "rewards/reward_func/std": 0.05869032442569733, "sampling/importance_sampling_ratio/max": 2.9965202808380127, "sampling/importance_sampling_ratio/mean": 0.952495813369751, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.36746597290039, "sampling/sampling_logp_difference/mean": 0.18935205042362213, "step": 30, "step_time": 127.11935253324918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4037.0, "completions/mean_length": 1127.96875, "completions/mean_terminated_length": 1080.857177734375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6890313923358917, "epoch": 0.07635467980295567, "frac_reward_zero_std": 0.0, "grad_norm": 0.004790872325570113, "kl": 0.0005802500963909552, "learning_rate": 4.997058892449591e-05, "loss": -0.022644199430942535, "num_tokens": 4665601.0, "reward": 0.9375, "reward_std": 0.4249182939529419, "rewards/reward_func/mean": 0.10416666666666667, "rewards/reward_func/std": 0.06313965552383, "sampling/importance_sampling_ratio/max": 2.9980430603027344, "sampling/importance_sampling_ratio/mean": 0.9494443535804749, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.687466621398926, "sampling/sampling_logp_difference/mean": 0.20097197592258453, "step": 31, "step_time": 125.67983064893633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3866.0, "completions/mean_length": 1272.625, "completions/mean_terminated_length": 1133.7703857421875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7184868454933167, "epoch": 0.07881773399014778, "frac_reward_zero_std": 0.0, "grad_norm": 0.003695038302454457, "kl": 0.0006317304505500942, "learning_rate": 4.99681894898157e-05, "loss": 0.004109987523406744, "num_tokens": 4838953.0, "reward": 1.06640625, "reward_std": 0.5122355818748474, "rewards/reward_func/mean": 0.11848958333333333, "rewards/reward_func/std": 0.07386576467090183, "sampling/importance_sampling_ratio/max": 2.9965872764587402, "sampling/importance_sampling_ratio/mean": 0.9484747648239136, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.850101470947266, "sampling/sampling_logp_difference/mean": 0.20671531558036804, "step": 32, "step_time": 124.64264245890081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3357.0, "completions/mean_length": 1086.625, "completions/mean_terminated_length": 989.54833984375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6850003600120544, "epoch": 0.0812807881773399, "frac_reward_zero_std": 0.0, "grad_norm": 0.004399793228138843, "kl": 0.0008690290997037664, "learning_rate": 4.99656960410419e-05, "loss": -0.013336382806301117, "num_tokens": 5000161.0, "reward": 0.9453125, "reward_std": 0.41659224033355713, "rewards/reward_func/mean": 0.10503472222222222, "rewards/reward_func/std": 0.062094110581609935, "sampling/importance_sampling_ratio/max": 2.997372627258301, "sampling/importance_sampling_ratio/mean": 0.9499393701553345, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.663118362426758, "sampling/sampling_logp_difference/mean": 0.20705397427082062, "step": 33, "step_time": 126.14335125219077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3882.0, "completions/mean_length": 1133.890625, "completions/mean_terminated_length": 1019.0819091796875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7852240353822708, "epoch": 0.08374384236453201, "frac_reward_zero_std": 0.25, "grad_norm": 0.0038741539289486736, "kl": 0.0007687857287237421, "learning_rate": 4.9963108587563226e-05, "loss": 0.002182937692850828, "num_tokens": 5159050.0, "reward": 1.0, "reward_std": 0.4564354717731476, "rewards/reward_func/mean": 0.1111111111111111, "rewards/reward_func/std": 0.06883835792541504, "sampling/importance_sampling_ratio/max": 2.9997353553771973, "sampling/importance_sampling_ratio/mean": 0.9504753947257996, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.089191436767578, "sampling/sampling_logp_difference/mean": 0.2042486071586609, "step": 34, "step_time": 176.92318990291096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3042.0, "completions/mean_length": 1198.0, "completions/mean_terminated_length": 1004.800048828125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7601384967565536, "epoch": 0.08620689655172414, "frac_reward_zero_std": 0.0, "grad_norm": 0.003761993755453868, "kl": 0.0007180090615293011, "learning_rate": 4.996042713912238e-05, "loss": -0.039179325103759766, "num_tokens": 5324810.0, "reward": 0.953125, "reward_std": 0.4517931640148163, "rewards/reward_func/mean": 0.10590277777777778, "rewards/reward_func/std": 0.06709145175086127, "sampling/importance_sampling_ratio/max": 2.999290704727173, "sampling/importance_sampling_ratio/mean": 0.9449152946472168, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.980502128601074, "sampling/sampling_logp_difference/mean": 0.22166508436203003, "step": 35, "step_time": 132.487598804757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3676.0, "completions/mean_length": 1095.796875, "completions/mean_terminated_length": 999.01611328125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "degenerate_groups_filtered": 0.0, "entropy": 0.674736425280571, "epoch": 0.08866995073891626, "frac_reward_zero_std": 0.0, "grad_norm": 0.007700117637790993, "kl": 0.000519245870236773, "learning_rate": 4.995765170581595e-05, "loss": 0.05755629390478134, "num_tokens": 5483677.0, "reward": 1.0546875, "reward_std": 0.6049871444702148, "rewards/reward_func/mean": 0.1171875, "rewards/reward_func/std": 0.10860339800516765, "sampling/importance_sampling_ratio/max": 2.9988772869110107, "sampling/importance_sampling_ratio/mean": 0.9505428671836853, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.872570037841797, "sampling/sampling_logp_difference/mean": 0.19093479216098785, "step": 36, "step_time": 123.9898575132247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 917.796875, "completions/mean_terminated_length": 867.3492431640625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "degenerate_groups_filtered": 0.0, "entropy": 0.639846533536911, "epoch": 0.09113300492610837, "frac_reward_zero_std": 0.25, "grad_norm": 0.006815328832923949, "kl": 0.0006771365588065237, "learning_rate": 4.995478229809444e-05, "loss": 0.02746357023715973, "num_tokens": 5625440.0, "reward": 1.0703125, "reward_std": 0.7924103736877441, "rewards/reward_func/mean": 0.1189236111111111, "rewards/reward_func/std": 0.12947271267573038, "sampling/importance_sampling_ratio/max": 2.9944005012512207, "sampling/importance_sampling_ratio/mean": 0.9546407461166382, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.414692878723145, "sampling/sampling_logp_difference/mean": 0.18783044815063477, "step": 37, "step_time": 121.30526466900483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3806.0, "completions/mean_length": 932.015625, "completions/mean_terminated_length": 829.9515991210938, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6244807690382004, "epoch": 0.09359605911330049, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024319878910873303, "kl": 0.0004077902340213768, "learning_rate": 4.9951818926762174e-05, "loss": -0.016925642266869545, "num_tokens": 5757121.0, "reward": 1.09375, "reward_std": 0.43983224034309387, "rewards/reward_func/mean": 0.12152777777777778, "rewards/reward_func/std": 0.0649089366197586, "sampling/importance_sampling_ratio/max": 2.9995179176330566, "sampling/importance_sampling_ratio/mean": 0.9589510560035706, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.803277969360352, "sampling/sampling_logp_difference/mean": 0.1713067591190338, "step": 38, "step_time": 124.23265223298222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3735.0, "completions/mean_length": 1043.359375, "completions/mean_terminated_length": 1010.5573120117188, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6702292114496231, "epoch": 0.0960591133004926, "frac_reward_zero_std": 0.0, "grad_norm": 0.004304848427086413, "kl": 0.0006781471893191338, "learning_rate": 4.99487616029773e-05, "loss": 0.005671734921634197, "num_tokens": 5905640.0, "reward": 1.0, "reward_std": 0.4225771427154541, "rewards/reward_func/mean": 0.1111111111111111, "rewards/reward_func/std": 0.06295079986254375, "sampling/importance_sampling_ratio/max": 2.9920156002044678, "sampling/importance_sampling_ratio/mean": 0.9502699375152588, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.812431335449219, "sampling/sampling_logp_difference/mean": 0.19391661882400513, "step": 39, "step_time": 127.96421558922157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3826.0, "completions/mean_length": 984.703125, "completions/mean_terminated_length": 944.1935424804688, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6731563657522202, "epoch": 0.09852216748768473, "frac_reward_zero_std": 0.25, "grad_norm": 0.0039037333681927427, "kl": 0.0008755343878874555, "learning_rate": 4.994561033825174e-05, "loss": -0.016288451850414276, "num_tokens": 6054021.0, "reward": 0.98046875, "reward_std": 0.41857820749282837, "rewards/reward_func/mean": 0.10894097222222222, "rewards/reward_func/std": 0.06274111072222392, "sampling/importance_sampling_ratio/max": 2.9966559410095215, "sampling/importance_sampling_ratio/mean": 0.9564224481582642, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.249920845031738, "sampling/sampling_logp_difference/mean": 0.18356505036354065, "step": 40, "step_time": 129.37665040860884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2444.0, "completions/mean_length": 969.96875, "completions/mean_terminated_length": 715.7368774414062, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6139142364263535, "epoch": 0.10098522167487685, "frac_reward_zero_std": 0.0, "grad_norm": 0.0060334001765760745, "kl": 0.0013773875834885985, "learning_rate": 4.99423651444511e-05, "loss": -0.00031678611412644386, "num_tokens": 6202659.0, "reward": 0.8203125, "reward_std": 0.5933974385261536, "rewards/reward_func/mean": 0.09114583333333333, "rewards/reward_func/std": 0.11807411743534936, "sampling/importance_sampling_ratio/max": 2.996901750564575, "sampling/importance_sampling_ratio/mean": 0.9532800912857056, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.062097549438477, "sampling/sampling_logp_difference/mean": 0.18368680775165558, "step": 41, "step_time": 181.77138043870218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 826.328125, "completions/mean_terminated_length": 720.8547973632812, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6861467808485031, "epoch": 0.10344827586206896, "frac_reward_zero_std": 0.0, "grad_norm": 0.0033062963877428346, "kl": 0.001169464725535363, "learning_rate": 4.993902603379471e-05, "loss": -0.009817395359277725, "num_tokens": 6335368.0, "reward": 0.96484375, "reward_std": 0.28821834921836853, "rewards/reward_func/mean": 0.1072048611111111, "rewards/reward_func/std": 0.04385380778047773, "sampling/importance_sampling_ratio/max": 2.986184597015381, "sampling/importance_sampling_ratio/mean": 0.9563000202178955, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.047112464904785, "sampling/sampling_logp_difference/mean": 0.18140462040901184, "step": 42, "step_time": 122.47313178796321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3712.0, "completions/mean_length": 1089.484375, "completions/mean_terminated_length": 964.0508422851562, "completions/min_length": 165.0, "completions/min_terminated_length": 238.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6908983737230301, "epoch": 0.10591133004926108, "frac_reward_zero_std": 0.0, "grad_norm": 0.0045084411161715136, "kl": 0.001085053852875717, "learning_rate": 4.99355930188555e-05, "loss": -0.020499780774116516, "num_tokens": 6493527.0, "reward": 1.0, "reward_std": 0.5527707934379578, "rewards/reward_func/mean": 0.1111111111111111, "rewards/reward_func/std": 0.07947573396894667, "sampling/importance_sampling_ratio/max": 2.9986228942871094, "sampling/importance_sampling_ratio/mean": 0.9507853984832764, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.087060928344727, "sampling/sampling_logp_difference/mean": 0.196205735206604, "step": 43, "step_time": 129.15310677397065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3619.0, "completions/max_terminated_length": 3619.0, "completions/mean_length": 804.078125, "completions/mean_terminated_length": 782.5873413085938, "completions/min_length": 15.0, "completions/min_terminated_length": 15.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6834833323955536, "epoch": 0.10837438423645321, "frac_reward_zero_std": 0.0, "grad_norm": 0.00655169426595512, "kl": 0.0013544214016292244, "learning_rate": 4.9932066112559975e-05, "loss": -0.037918414920568466, "num_tokens": 6624492.0, "reward": 0.89453125, "reward_std": 0.5152528285980225, "rewards/reward_func/mean": 0.0993923611111111, "rewards/reward_func/std": 0.074398891793357, "sampling/importance_sampling_ratio/max": 2.996908664703369, "sampling/importance_sampling_ratio/mean": 0.9543678760528564, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.936127662658691, "sampling/sampling_logp_difference/mean": 0.19161191582679749, "step": 44, "step_time": 102.16571011929773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3917.0, "completions/mean_length": 931.40625, "completions/mean_terminated_length": 775.7704467773438, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7641775757074356, "epoch": 0.11083743842364532, "frac_reward_zero_std": 0.0, "grad_norm": 0.003791496409445276, "kl": 0.0008305757219204679, "learning_rate": 4.992844532818821e-05, "loss": -0.03192742541432381, "num_tokens": 6776582.0, "reward": 0.9765625, "reward_std": 0.47709178924560547, "rewards/reward_func/mean": 0.10850694444444445, "rewards/reward_func/std": 0.06993871264987522, "sampling/importance_sampling_ratio/max": 2.9978528022766113, "sampling/importance_sampling_ratio/mean": 0.9510887265205383, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.725985527038574, "sampling/sampling_logp_difference/mean": 0.20753449201583862, "step": 45, "step_time": 130.1271507178899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3633.0, "completions/mean_length": 1147.84375, "completions/mean_terminated_length": 842.862060546875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7140185534954071, "epoch": 0.11330049261083744, "frac_reward_zero_std": 0.0, "grad_norm": 0.004826171583942262, "kl": 0.0006912277458468452, "learning_rate": 4.9924730679373735e-05, "loss": -0.011146273463964462, "num_tokens": 6928796.0, "reward": 0.94140625, "reward_std": 0.5132030248641968, "rewards/reward_func/mean": 0.10460069444444445, "rewards/reward_func/std": 0.07456411255730523, "sampling/importance_sampling_ratio/max": 2.998485565185547, "sampling/importance_sampling_ratio/mean": 0.9537663459777832, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.836477279663086, "sampling/sampling_logp_difference/mean": 0.19005626440048218, "step": 46, "step_time": 124.87270360905677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3396.0, "completions/mean_length": 1380.703125, "completions/mean_terminated_length": 1224.4482421875, "completions/min_length": 1.0, "completions/min_terminated_length": 470.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6310307085514069, "epoch": 0.11576354679802955, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038622981497168454, "kl": 0.0006945063942112029, "learning_rate": 4.992092218010351e-05, "loss": -0.02481062337756157, "num_tokens": 7112345.0, "reward": 0.921875, "reward_std": 0.4957658052444458, "rewards/reward_func/mean": 0.10243055555555555, "rewards/reward_func/std": 0.07219833135604858, "sampling/importance_sampling_ratio/max": 2.982144832611084, "sampling/importance_sampling_ratio/mean": 0.9450221061706543, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.828449249267578, "sampling/sampling_logp_difference/mean": 0.20433643460273743, "step": 47, "step_time": 124.20643052412197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3883.0, "completions/mean_length": 1118.46875, "completions/mean_terminated_length": 1071.2064208984375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "degenerate_groups_filtered": 0.0, "entropy": 0.712666392326355, "epoch": 0.11822660098522167, "frac_reward_zero_std": 0.0, "grad_norm": 0.005585593408220866, "kl": 0.0008703803177922964, "learning_rate": 4.991701984471789e-05, "loss": -0.04101687669754028, "num_tokens": 7272519.0, "reward": 1.0234375, "reward_std": 0.6279197931289673, "rewards/reward_func/mean": 0.11371527777777778, "rewards/reward_func/std": 0.13131697310341728, "sampling/importance_sampling_ratio/max": 2.996182918548584, "sampling/importance_sampling_ratio/mean": 0.9473646879196167, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.505918502807617, "sampling/sampling_logp_difference/mean": 0.20310138165950775, "step": 48, "step_time": 126.8569445500616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3621.0, "completions/mean_length": 1250.921875, "completions/mean_terminated_length": 1211.5322265625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6713459491729736, "epoch": 0.1206896551724138, "frac_reward_zero_std": 0.0, "grad_norm": 0.006099868278130744, "kl": 0.0007151865720516071, "learning_rate": 4.9913023687910575e-05, "loss": 0.005726509727537632, "num_tokens": 7449970.0, "reward": 0.92578125, "reward_std": 0.5557771921157837, "rewards/reward_func/mean": 0.10286458333333333, "rewards/reward_func/std": 0.07966403497589959, "sampling/importance_sampling_ratio/max": 2.997781991958618, "sampling/importance_sampling_ratio/mean": 0.9481871128082275, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.811446189880371, "sampling/sampling_logp_difference/mean": 0.1957038938999176, "step": 49, "step_time": 133.4852599161677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2899.0, "completions/mean_length": 1217.28125, "completions/mean_terminated_length": 930.8947143554688, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7344631999731064, "epoch": 0.12315270935960591, "frac_reward_zero_std": 0.0, "grad_norm": 0.004284861401893878, "kl": 0.0008682821207912639, "learning_rate": 4.990893372472849e-05, "loss": -0.028305571526288986, "num_tokens": 7622628.0, "reward": 0.88671875, "reward_std": 0.5212349891662598, "rewards/reward_func/mean": 0.09852430555555555, "rewards/reward_func/std": 0.10778504444493188, "sampling/importance_sampling_ratio/max": 2.996345043182373, "sampling/importance_sampling_ratio/mean": 0.9431067705154419, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.860669136047363, "sampling/sampling_logp_difference/mean": 0.21784313023090363, "step": 50, "step_time": 127.83933217800222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3282.0, "completions/mean_length": 1225.671875, "completions/mean_terminated_length": 1127.1966552734375, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "degenerate_groups_filtered": 1.0, "entropy": 0.661736324429512, "epoch": 0.12561576354679804, "frac_reward_zero_std": 0.25, "grad_norm": 0.004477337769392876, "kl": 0.000599712846451439, "learning_rate": 4.99047499705718e-05, "loss": -0.04254208877682686, "num_tokens": 7783983.0, "reward": 1.00390625, "reward_std": 0.5572255849838257, "rewards/reward_func/mean": 0.1115451388888889, "rewards/reward_func/std": 0.11869255536132389, "sampling/importance_sampling_ratio/max": 2.990603446960449, "sampling/importance_sampling_ratio/mean": 0.9500396251678467, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.309992790222168, "sampling/sampling_logp_difference/mean": 0.19599781930446625, "step": 51, "step_time": 118.34297248488292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3453.0, "completions/max_terminated_length": 3453.0, "completions/mean_length": 809.3125, "completions/mean_terminated_length": 804.888916015625, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6810555011034012, "epoch": 0.12807881773399016, "frac_reward_zero_std": 0.0, "grad_norm": 0.004634171127115492, "kl": 0.0006402786093531176, "learning_rate": 4.990047244119383e-05, "loss": -0.020025035366415977, "num_tokens": 7918355.0, "reward": 1.10546875, "reward_std": 0.6248883605003357, "rewards/reward_func/mean": 0.1228298611111111, "rewards/reward_func/std": 0.13279999958144295, "sampling/importance_sampling_ratio/max": 2.998283863067627, "sampling/importance_sampling_ratio/mean": 0.9604792594909668, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.809757232666016, "sampling/sampling_logp_difference/mean": 0.17685744166374207, "step": 52, "step_time": 107.15592404198833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2729.0, "completions/mean_length": 1059.71875, "completions/mean_terminated_length": 904.1333618164062, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6661587357521057, "epoch": 0.13054187192118227, "frac_reward_zero_std": 0.0, "grad_norm": 0.009039182746563882, "kl": 0.0006583353388123214, "learning_rate": 4.9896101152701e-05, "loss": -0.04837590828537941, "num_tokens": 8069409.0, "reward": 0.83984375, "reward_std": 0.5480253100395203, "rewards/reward_func/mean": 0.09331597222222222, "rewards/reward_func/std": 0.07786636220084296, "sampling/importance_sampling_ratio/max": 2.998404026031494, "sampling/importance_sampling_ratio/mean": 0.956752598285675, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.024923324584961, "sampling/sampling_logp_difference/mean": 0.18003256618976593, "step": 53, "step_time": 121.01628102129325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3495.0, "completions/mean_length": 1147.21875, "completions/mean_terminated_length": 1035.49169921875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6769838631153107, "epoch": 0.1330049261083744, "frac_reward_zero_std": 0.0, "grad_norm": 0.005072658713012938, "kl": 0.0007076838955981657, "learning_rate": 4.9891636121552745e-05, "loss": 0.008838340640068054, "num_tokens": 8231519.0, "reward": 0.859375, "reward_std": 0.5115239024162292, "rewards/reward_func/mean": 0.0954861111111111, "rewards/reward_func/std": 0.07473522424697876, "sampling/importance_sampling_ratio/max": 2.9958250522613525, "sampling/importance_sampling_ratio/mean": 0.9562733173370361, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.137588500976562, "sampling/sampling_logp_difference/mean": 0.18809491395950317, "step": 54, "step_time": 126.67665300960653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3799.0, "completions/mean_length": 822.03125, "completions/mean_terminated_length": 782.3709716796875, "completions/min_length": 7.0, "completions/min_terminated_length": 239.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6964642554521561, "epoch": 0.1354679802955665, "frac_reward_zero_std": 0.25, "grad_norm": 0.004336077774979495, "kl": 0.00077366731420625, "learning_rate": 4.988707736456151e-05, "loss": 0.024997062981128693, "num_tokens": 8369409.0, "reward": 1.04296875, "reward_std": 0.4466690719127655, "rewards/reward_func/mean": 0.11588541666666667, "rewards/reward_func/std": 0.06591926680670844, "sampling/importance_sampling_ratio/max": 2.9957802295684814, "sampling/importance_sampling_ratio/mean": 0.9594486951828003, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.226276397705078, "sampling/sampling_logp_difference/mean": 0.17705968022346497, "step": 55, "step_time": 120.81106252782047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3210.0, "completions/mean_length": 1145.0, "completions/mean_terminated_length": 948.2667236328125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "degenerate_groups_filtered": 0.0, "entropy": 0.924326628446579, "epoch": 0.13793103448275862, "frac_reward_zero_std": 0.0, "grad_norm": 0.004602114073004957, "kl": 0.0009272525494452566, "learning_rate": 4.9882424898892635e-05, "loss": 0.007547194603830576, "num_tokens": 8527825.0, "reward": 0.921875, "reward_std": 0.46049273014068604, "rewards/reward_func/mean": 0.10243055555555555, "rewards/reward_func/std": 0.06886764367421468, "sampling/importance_sampling_ratio/max": 2.9957659244537354, "sampling/importance_sampling_ratio/mean": 0.9489148259162903, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.856990814208984, "sampling/sampling_logp_difference/mean": 0.2066124975681305, "step": 56, "step_time": 132.56589686009102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3108.0, "completions/mean_length": 817.078125, "completions/mean_terminated_length": 765.0317993164062, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7100279629230499, "epoch": 0.14039408866995073, "frac_reward_zero_std": 0.0, "grad_norm": 0.0066965015160019205, "kl": 0.0014742588391527534, "learning_rate": 4.987767874206428e-05, "loss": -0.033622272312641144, "num_tokens": 8673046.0, "reward": 0.890625, "reward_std": 0.5134597420692444, "rewards/reward_func/mean": 0.09895833333333333, "rewards/reward_func/std": 0.07518212000528972, "sampling/importance_sampling_ratio/max": 2.9987456798553467, "sampling/importance_sampling_ratio/mean": 0.9521975517272949, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.29785442352295, "sampling/sampling_logp_difference/mean": 0.19809746742248535, "step": 57, "step_time": 119.24784157378599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3990.0, "completions/mean_length": 1423.890625, "completions/mean_terminated_length": 1225.3729248046875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6659361720085144, "epoch": 0.14285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.004336898403114928, "kl": 0.0008447931904811412, "learning_rate": 4.987283891194743e-05, "loss": 0.0002889493480324745, "num_tokens": 8861519.0, "reward": 0.875, "reward_std": 0.5509731769561768, "rewards/reward_func/mean": 0.09722222222222222, "rewards/reward_func/std": 0.07834745115704006, "sampling/importance_sampling_ratio/max": 2.9992592334747314, "sampling/importance_sampling_ratio/mean": 0.9453926086425781, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.000340461730957, "sampling/sampling_logp_difference/mean": 0.2015022337436676, "step": 58, "step_time": 131.44689005077817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3963.0, "completions/mean_length": 1537.296875, "completions/mean_terminated_length": 1390.3792724609375, "completions/min_length": 13.0, "completions/min_terminated_length": 318.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7286873459815979, "epoch": 0.14532019704433496, "frac_reward_zero_std": 0.0, "grad_norm": 0.0036566899418926735, "kl": 0.0008002854156075045, "learning_rate": 4.986790542676576e-05, "loss": -0.03378288820385933, "num_tokens": 9050914.0, "reward": 0.95703125, "reward_std": 0.47257041931152344, "rewards/reward_func/mean": 0.10633680555555555, "rewards/reward_func/std": 0.06965653763877021, "sampling/importance_sampling_ratio/max": 2.997560501098633, "sampling/importance_sampling_ratio/mean": 0.9403575658798218, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.749882698059082, "sampling/sampling_logp_difference/mean": 0.2278328537940979, "step": 59, "step_time": 137.2060365586076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 988.796875, "completions/mean_terminated_length": 888.5645141601562, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "degenerate_groups_filtered": 0.0, "entropy": 0.71505106985569, "epoch": 0.1477832512315271, "frac_reward_zero_std": 0.0, "grad_norm": 0.005548976099805918, "kl": 0.001017661954392679, "learning_rate": 4.986287830509558e-05, "loss": 0.008952794596552849, "num_tokens": 9206245.0, "reward": 0.8984375, "reward_std": 0.4623069167137146, "rewards/reward_func/mean": 0.0998263888888889, "rewards/reward_func/std": 0.06768603954050276, "sampling/importance_sampling_ratio/max": 2.9915103912353516, "sampling/importance_sampling_ratio/mean": 0.9487502574920654, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.22498893737793, "sampling/sampling_logp_difference/mean": 0.20820589363574982, "step": 60, "step_time": 124.10505168675445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 1294.859375, "completions/mean_terminated_length": 1108.11669921875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7779558748006821, "epoch": 0.15024630541871922, "frac_reward_zero_std": 0.0, "grad_norm": 0.004369721876072753, "kl": 0.0011822088854387403, "learning_rate": 4.985775756586581e-05, "loss": -0.016528453677892685, "num_tokens": 9385436.0, "reward": 0.99609375, "reward_std": 0.5643021464347839, "rewards/reward_func/mean": 0.11067708333333333, "rewards/reward_func/std": 0.08128498329056634, "sampling/importance_sampling_ratio/max": 2.9969987869262695, "sampling/importance_sampling_ratio/mean": 0.9418332576751709, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.551093101501465, "sampling/sampling_logp_difference/mean": 0.2260904610157013, "step": 61, "step_time": 127.24007483315654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2781.0, "completions/mean_length": 654.90625, "completions/mean_terminated_length": 543.9031982421875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5641279071569443, "epoch": 0.15270935960591134, "frac_reward_zero_std": 0.0, "grad_norm": 0.010293875912976197, "kl": 0.0012169272813480347, "learning_rate": 4.9852543228357835e-05, "loss": 0.004283260554075241, "num_tokens": 9507270.0, "reward": 1.2265625, "reward_std": 0.8860904574394226, "rewards/reward_func/mean": 0.1362847222222222, "rewards/reward_func/std": 0.17288169264793396, "sampling/importance_sampling_ratio/max": 2.997990846633911, "sampling/importance_sampling_ratio/mean": 0.9731359481811523, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.856095314025879, "sampling/sampling_logp_difference/mean": 0.14170248806476593, "step": 62, "step_time": 131.49094270309433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3911.0, "completions/max_terminated_length": 3911.0, "completions/mean_length": 1028.59375, "completions/mean_terminated_length": 1028.59375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7280330806970596, "epoch": 0.15517241379310345, "frac_reward_zero_std": 0.0, "grad_norm": 0.005106972664854136, "kl": 0.001104547263821587, "learning_rate": 4.9847235312205484e-05, "loss": -0.028649836778640747, "num_tokens": 9644188.0, "reward": 0.9921875, "reward_std": 0.618768572807312, "rewards/reward_func/mean": 0.11024305555555555, "rewards/reward_func/std": 0.12877601716253492, "sampling/importance_sampling_ratio/max": 2.9986512660980225, "sampling/importance_sampling_ratio/mean": 0.9545284509658813, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.098888397216797, "sampling/sampling_logp_difference/mean": 0.19114258885383606, "step": 63, "step_time": 106.71372815198265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3139.0, "completions/mean_length": 1194.53125, "completions/mean_terminated_length": 991.5084838867188, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "degenerate_groups_filtered": 0.0, "entropy": 0.688900962471962, "epoch": 0.15763546798029557, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037696714939516734, "kl": 0.0010499156342120841, "learning_rate": 4.984183383739496e-05, "loss": -0.012204478494822979, "num_tokens": 9815470.0, "reward": 1.04296875, "reward_std": 0.507003903388977, "rewards/reward_func/mean": 0.11588541666666667, "rewards/reward_func/std": 0.0734660890367296, "sampling/importance_sampling_ratio/max": 2.999692440032959, "sampling/importance_sampling_ratio/mean": 0.9450367093086243, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.381852149963379, "sampling/sampling_logp_difference/mean": 0.20991432666778564, "step": 64, "step_time": 137.48744579590857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3326.0, "completions/max_terminated_length": 3326.0, "completions/mean_length": 801.546875, "completions/mean_terminated_length": 801.546875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6562062352895737, "epoch": 0.16009852216748768, "frac_reward_zero_std": 0.5, "grad_norm": 0.004939764234466071, "kl": 0.0008211284002754837, "learning_rate": 4.983633882426471e-05, "loss": 0.03823887184262276, "num_tokens": 9946017.0, "reward": 1.03125, "reward_std": 0.306995153427124, "rewards/reward_func/mean": 0.11458333333333333, "rewards/reward_func/std": 0.04659368097782135, "sampling/importance_sampling_ratio/max": 2.995638608932495, "sampling/importance_sampling_ratio/mean": 0.9580010175704956, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.03026008605957, "sampling/sampling_logp_difference/mean": 0.17757360637187958, "step": 65, "step_time": 92.6048651910387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3525.0, "completions/mean_length": 1163.859375, "completions/mean_terminated_length": 1019.6557006835938, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7296376675367355, "epoch": 0.1625615763546798, "frac_reward_zero_std": 0.0, "grad_norm": 0.003653082653066837, "kl": 0.0007672712963540107, "learning_rate": 4.983075029350542e-05, "loss": -0.035717546939849854, "num_tokens": 10100424.0, "reward": 0.94921875, "reward_std": 0.40868473052978516, "rewards/reward_func/mean": 0.10546875, "rewards/reward_func/std": 0.06101351810826196, "sampling/importance_sampling_ratio/max": 2.9951772689819336, "sampling/importance_sampling_ratio/mean": 0.9508060812950134, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.666208267211914, "sampling/sampling_logp_difference/mean": 0.20355165004730225, "step": 66, "step_time": 147.45380871812813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3794.0, "completions/mean_length": 1260.1875, "completions/mean_terminated_length": 1120.72119140625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6744803041219711, "epoch": 0.16502463054187191, "frac_reward_zero_std": 0.0, "grad_norm": 0.007845307744964685, "kl": 0.0007850325491745025, "learning_rate": 4.9825068266159894e-05, "loss": 0.07681119441986084, "num_tokens": 10266036.0, "reward": 1.16796875, "reward_std": 0.8165630102157593, "rewards/reward_func/mean": 0.12977430555555555, "rewards/reward_func/std": 0.13601407905419668, "sampling/importance_sampling_ratio/max": 2.9994595050811768, "sampling/importance_sampling_ratio/mean": 0.9440293312072754, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.185983657836914, "sampling/sampling_logp_difference/mean": 0.21098700165748596, "step": 67, "step_time": 127.28534332639538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 866.609375, "completions/mean_terminated_length": 722.8135375976562, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7389185130596161, "epoch": 0.16748768472906403, "frac_reward_zero_std": 0.0, "grad_norm": 0.005251337630497652, "kl": 0.000870908988872543, "learning_rate": 4.981929276362298e-05, "loss": 0.030912771821022034, "num_tokens": 10399355.0, "reward": 0.89453125, "reward_std": 0.4053332209587097, "rewards/reward_func/mean": 0.0993923611111111, "rewards/reward_func/std": 0.08497350083457099, "sampling/importance_sampling_ratio/max": 2.9986069202423096, "sampling/importance_sampling_ratio/mean": 0.9526036977767944, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.291192054748535, "sampling/sampling_logp_difference/mean": 0.19692301750183105, "step": 68, "step_time": 131.86357428110205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3656.0, "completions/mean_length": 1072.796875, "completions/mean_terminated_length": 936.4833984375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7273115962743759, "epoch": 0.16995073891625614, "frac_reward_zero_std": 0.0, "grad_norm": 0.003302635291869116, "kl": 0.0010801725584315136, "learning_rate": 4.981342380764149e-05, "loss": -0.024595141410827637, "num_tokens": 10566158.0, "reward": 0.94921875, "reward_std": 0.34840819239616394, "rewards/reward_func/mean": 0.10546875, "rewards/reward_func/std": 0.054026698072751365, "sampling/importance_sampling_ratio/max": 2.998894453048706, "sampling/importance_sampling_ratio/mean": 0.9404545426368713, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.374916076660156, "sampling/sampling_logp_difference/mean": 0.22012995183467865, "step": 69, "step_time": 117.17583943810314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4056.0, "completions/mean_length": 1020.984375, "completions/mean_terminated_length": 921.790283203125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7125604450702667, "epoch": 0.1724137931034483, "frac_reward_zero_std": 0.25, "grad_norm": 0.012211166320763836, "kl": 0.0008212087268475443, "learning_rate": 4.980746142031414e-05, "loss": -0.03852664679288864, "num_tokens": 10718861.0, "reward": 1.0546875, "reward_std": 0.6835201978683472, "rewards/reward_func/mean": 0.1171875, "rewards/reward_func/std": 0.12549426820543078, "sampling/importance_sampling_ratio/max": 2.9977447986602783, "sampling/importance_sampling_ratio/mean": 0.9521719217300415, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.026714324951172, "sampling/sampling_logp_difference/mean": 0.19808495044708252, "step": 70, "step_time": 132.49208030593581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 1077.109375, "completions/mean_terminated_length": 995.3933715820312, "completions/min_length": 24.0, "completions/min_terminated_length": 261.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5986730307340622, "epoch": 0.1748768472906404, "frac_reward_zero_std": 0.0, "grad_norm": 0.007337189387788424, "kl": 0.0006397934484994039, "learning_rate": 4.980140562409141e-05, "loss": -0.03012331947684288, "num_tokens": 10857156.0, "reward": 0.93359375, "reward_std": 0.41605237126350403, "rewards/reward_func/mean": 0.1037326388888889, "rewards/reward_func/std": 0.06313699980576833, "sampling/importance_sampling_ratio/max": 2.9992947578430176, "sampling/importance_sampling_ratio/mean": 0.953962504863739, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.96396541595459, "sampling/sampling_logp_difference/mean": 0.1823788285255432, "step": 71, "step_time": 162.1669030210469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3893.0, "completions/mean_length": 1130.734375, "completions/mean_terminated_length": 1090.51611328125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "degenerate_groups_filtered": 0.0, "entropy": 0.658635139465332, "epoch": 0.17733990147783252, "frac_reward_zero_std": 0.0, "grad_norm": 0.0077832425322214745, "kl": 0.0007886915263952687, "learning_rate": 4.979525644177554e-05, "loss": 0.03975846618413925, "num_tokens": 11016371.0, "reward": 1.14453125, "reward_std": 0.7802175283432007, "rewards/reward_func/mean": 0.1271701388888889, "rewards/reward_func/std": 0.1387238320377138, "sampling/importance_sampling_ratio/max": 2.998619794845581, "sampling/importance_sampling_ratio/mean": 0.9450385570526123, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.331713676452637, "sampling/sampling_logp_difference/mean": 0.19837921857833862, "step": 72, "step_time": 125.53122378420085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3461.0, "completions/mean_length": 878.890625, "completions/mean_terminated_length": 806.786865234375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7126602828502655, "epoch": 0.17980295566502463, "frac_reward_zero_std": 0.0, "grad_norm": 0.009708859119011157, "kl": 0.000986951738013886, "learning_rate": 4.978901389652039e-05, "loss": 0.045913953334093094, "num_tokens": 11164348.0, "reward": 1.01953125, "reward_std": 1.0849847793579102, "rewards/reward_func/mean": 0.11328125, "rewards/reward_func/std": 0.16929766370190513, "sampling/importance_sampling_ratio/max": 2.9910213947296143, "sampling/importance_sampling_ratio/mean": 0.9483171701431274, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.62489128112793, "sampling/sampling_logp_difference/mean": 0.20636305212974548, "step": 73, "step_time": 129.03768724016845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3590.0, "completions/mean_length": 1483.046875, "completions/mean_terminated_length": 1275.086181640625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "degenerate_groups_filtered": 0.0, "entropy": 0.67229163646698, "epoch": 0.18226600985221675, "frac_reward_zero_std": 0.0, "grad_norm": 0.01704922977426874, "kl": 0.0007648792961845174, "learning_rate": 4.978267801183133e-05, "loss": 0.011364220641553402, "num_tokens": 11349727.0, "reward": 1.2109375, "reward_std": 0.6519487500190735, "rewards/reward_func/mean": 0.1345486111111111, "rewards/reward_func/std": 0.12985923224025303, "sampling/importance_sampling_ratio/max": 2.9996538162231445, "sampling/importance_sampling_ratio/mean": 0.9436983466148376, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.866939544677734, "sampling/sampling_logp_difference/mean": 0.20118892192840576, "step": 74, "step_time": 137.89118567178957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3703.0, "completions/mean_length": 1226.40625, "completions/mean_terminated_length": 1085.2786865234375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7562486529350281, "epoch": 0.18472906403940886, "frac_reward_zero_std": 0.0, "grad_norm": 0.0047505098072179695, "kl": 0.0008432121394434944, "learning_rate": 4.977624881156524e-05, "loss": -0.00926197599619627, "num_tokens": 11511657.0, "reward": 1.0078125, "reward_std": 0.4628430902957916, "rewards/reward_func/mean": 0.11197916666666667, "rewards/reward_func/std": 0.0687158273326026, "sampling/importance_sampling_ratio/max": 2.996015787124634, "sampling/importance_sampling_ratio/mean": 0.9437671899795532, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.199777603149414, "sampling/sampling_logp_difference/mean": 0.21507258713245392, "step": 75, "step_time": 119.91632703808136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3823.0, "completions/mean_length": 1037.953125, "completions/mean_terminated_length": 990.3386840820312, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6312072277069092, "epoch": 0.18719211822660098, "frac_reward_zero_std": 0.0, "grad_norm": 0.006440735421087148, "kl": 0.0009448294003959745, "learning_rate": 4.976972631993033e-05, "loss": -0.025233589112758636, "num_tokens": 11665798.0, "reward": 0.86328125, "reward_std": 0.5135653614997864, "rewards/reward_func/mean": 0.0959201388888889, "rewards/reward_func/std": 0.07400760385725233, "sampling/importance_sampling_ratio/max": 2.9997024536132812, "sampling/importance_sampling_ratio/mean": 0.9479377269744873, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.914000511169434, "sampling/sampling_logp_difference/mean": 0.1916263997554779, "step": 76, "step_time": 126.66293425206095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2478.0, "completions/mean_length": 808.015625, "completions/mean_terminated_length": 737.475341796875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "degenerate_groups_filtered": 0.0, "entropy": 0.75564044713974, "epoch": 0.1896551724137931, "frac_reward_zero_std": 0.0, "grad_norm": 0.006801059909191471, "kl": 0.0012827419268433005, "learning_rate": 4.976311056148609e-05, "loss": -0.026884566992521286, "num_tokens": 11790983.0, "reward": 0.9375, "reward_std": 0.5137012004852295, "rewards/reward_func/mean": 0.10416666666666667, "rewards/reward_func/std": 0.07450851135783726, "sampling/importance_sampling_ratio/max": 2.9969658851623535, "sampling/importance_sampling_ratio/mean": 0.9571335315704346, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.874902725219727, "sampling/sampling_logp_difference/mean": 0.18946215510368347, "step": 77, "step_time": 120.88332917005755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3836.0, "completions/mean_length": 1026.78125, "completions/mean_terminated_length": 927.774169921875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6949244886636734, "epoch": 0.1921182266009852, "frac_reward_zero_std": 0.0, "grad_norm": 0.009162823895835595, "kl": 0.0008584359893575311, "learning_rate": 4.975640156114322e-05, "loss": 0.004000354558229446, "num_tokens": 11933161.0, "reward": 1.04296875, "reward_std": 0.7367273569107056, "rewards/reward_func/mean": 0.11588541666666667, "rewards/reward_func/std": 0.12690814170572493, "sampling/importance_sampling_ratio/max": 2.9982340335845947, "sampling/importance_sampling_ratio/mean": 0.9532779455184937, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.17520523071289, "sampling/sampling_logp_difference/mean": 0.19447797536849976, "step": 78, "step_time": 125.43066487205215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2179.0, "completions/max_terminated_length": 2179.0, "completions/mean_length": 624.765625, "completions/mean_terminated_length": 631.4515991210938, "completions/min_length": 51.0, "completions/min_terminated_length": 125.0, "degenerate_groups_filtered": 0.0, "entropy": 0.709212139248848, "epoch": 0.19458128078817735, "frac_reward_zero_std": 0.0, "grad_norm": 0.0063953100167225795, "kl": 0.0009358888928545639, "learning_rate": 4.974959934416346e-05, "loss": 0.03648173436522484, "num_tokens": 12061082.0, "reward": 0.94921875, "reward_std": 0.4299771189689636, "rewards/reward_func/mean": 0.10546875, "rewards/reward_func/std": 0.06504838996463352, "sampling/importance_sampling_ratio/max": 2.9928903579711914, "sampling/importance_sampling_ratio/mean": 0.9550417065620422, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.374837875366211, "sampling/sampling_logp_difference/mean": 0.19429925084114075, "step": 79, "step_time": 75.71170671563596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2969.0, "completions/mean_length": 786.9375, "completions/mean_terminated_length": 721.5, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6359275728464127, "epoch": 0.19704433497536947, "frac_reward_zero_std": 0.0, "grad_norm": 0.005510590683439618, "kl": 0.0013039757614023983, "learning_rate": 4.9742703936159586e-05, "loss": -0.01529807597398758, "num_tokens": 12196694.0, "reward": 0.91796875, "reward_std": 0.35370680689811707, "rewards/reward_func/mean": 0.10199652777777778, "rewards/reward_func/std": 0.05228892962137858, "sampling/importance_sampling_ratio/max": 2.9959685802459717, "sampling/importance_sampling_ratio/mean": 0.9564290642738342, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.529621124267578, "sampling/sampling_logp_difference/mean": 0.18233630061149597, "step": 80, "step_time": 143.74423436028883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3342.0, "completions/mean_length": 1421.53125, "completions/mean_terminated_length": 1329.7703857421875, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6616629511117935, "epoch": 0.19950738916256158, "frac_reward_zero_std": 0.0, "grad_norm": 0.005486852713694094, "kl": 0.000617337238509208, "learning_rate": 4.973571536309525e-05, "loss": 0.036342501640319824, "num_tokens": 12369864.0, "reward": 1.0078125, "reward_std": 0.6267337203025818, "rewards/reward_func/mean": 0.11197916666666667, "rewards/reward_func/std": 0.11080888079272376, "sampling/importance_sampling_ratio/max": 2.9995808601379395, "sampling/importance_sampling_ratio/mean": 0.9480350613594055, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.685368537902832, "sampling/sampling_logp_difference/mean": 0.19612553715705872, "step": 81, "step_time": 127.6738231680356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3796.0, "completions/mean_length": 1419.578125, "completions/mean_terminated_length": 1254.0169677734375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8103909641504288, "epoch": 0.2019704433497537, "frac_reward_zero_std": 0.0, "grad_norm": 0.004435665783129135, "kl": 0.0008063071873039007, "learning_rate": 4.9728633651284914e-05, "loss": -0.0042844414710998535, "num_tokens": 12548685.0, "reward": 1.1171875, "reward_std": 0.7439821362495422, "rewards/reward_func/mean": 0.12413194444444445, "rewards/reward_func/std": 0.11853209965758854, "sampling/importance_sampling_ratio/max": 2.999321460723877, "sampling/importance_sampling_ratio/mean": 0.9406133890151978, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.160935401916504, "sampling/sampling_logp_difference/mean": 0.22386467456817627, "step": 82, "step_time": 136.5100540383719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3796.0, "completions/max_terminated_length": 3796.0, "completions/mean_length": 941.171875, "completions/mean_terminated_length": 946.01611328125, "completions/min_length": 24.0, "completions/min_terminated_length": 225.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6475118100643158, "epoch": 0.2044334975369458, "frac_reward_zero_std": 0.0, "grad_norm": 0.005268346256156498, "kl": 0.0009101934847421944, "learning_rate": 4.972145882739374e-05, "loss": -0.02767626754939556, "num_tokens": 12698024.0, "reward": 1.0234375, "reward_std": 0.43121910095214844, "rewards/reward_func/mean": 0.11371527777777778, "rewards/reward_func/std": 0.06390465299288432, "sampling/importance_sampling_ratio/max": 2.9928174018859863, "sampling/importance_sampling_ratio/mean": 0.9558981657028198, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.236062049865723, "sampling/sampling_logp_difference/mean": 0.1847381889820099, "step": 83, "step_time": 112.00504079344682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2627.0, "completions/max_terminated_length": 2627.0, "completions/mean_length": 1038.828125, "completions/mean_terminated_length": 1018.4425659179688, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6957644075155258, "epoch": 0.20689655172413793, "frac_reward_zero_std": 0.0, "grad_norm": 0.004293307295253765, "kl": 0.001004656864097342, "learning_rate": 4.971419091843748e-05, "loss": -0.024233508855104446, "num_tokens": 12854877.0, "reward": 1.2109375, "reward_std": 0.5777260065078735, "rewards/reward_func/mean": 0.1345486111111111, "rewards/reward_func/std": 0.08230482538541158, "sampling/importance_sampling_ratio/max": 2.998769998550415, "sampling/importance_sampling_ratio/mean": 0.9453675150871277, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.116597175598145, "sampling/sampling_logp_difference/mean": 0.21188262104988098, "step": 84, "step_time": 84.51651998935267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2771.0, "completions/mean_length": 715.484375, "completions/mean_terminated_length": 661.825439453125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7366594225168228, "epoch": 0.20935960591133004, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037554999149393494, "kl": 0.001163585257017985, "learning_rate": 4.970682995178238e-05, "loss": -0.002081345533952117, "num_tokens": 12984828.0, "reward": 1.07421875, "reward_std": 0.3900541663169861, "rewards/reward_func/mean": 0.1193576388888889, "rewards/reward_func/std": 0.055674018131362066, "sampling/importance_sampling_ratio/max": 2.998495578765869, "sampling/importance_sampling_ratio/mean": 0.9560437202453613, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.31248664855957, "sampling/sampling_logp_difference/mean": 0.1962520182132721, "step": 85, "step_time": 139.01101576304063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 1098.484375, "completions/mean_terminated_length": 960.86669921875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6849155128002167, "epoch": 0.21182266009852216, "frac_reward_zero_std": 0.0, "grad_norm": 0.005711301934492953, "kl": 0.0016330961079802364, "learning_rate": 4.9699375955145114e-05, "loss": -0.031073298305273056, "num_tokens": 13144571.0, "reward": 1.05859375, "reward_std": 0.6751574873924255, "rewards/reward_func/mean": 0.11762152777777778, "rewards/reward_func/std": 0.09190892179807027, "sampling/importance_sampling_ratio/max": 2.9948723316192627, "sampling/importance_sampling_ratio/mean": 0.9550687074661255, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.623679161071777, "sampling/sampling_logp_difference/mean": 0.18543776869773865, "step": 86, "step_time": 132.33628671360202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3577.0, "completions/mean_length": 1125.25, "completions/mean_terminated_length": 989.9491577148438, "completions/min_length": 47.0, "completions/min_terminated_length": 241.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7316817492246628, "epoch": 0.21428571428571427, "frac_reward_zero_std": 0.0, "grad_norm": 0.01102958555478108, "kl": 0.0011147368932142854, "learning_rate": 4.96918289565926e-05, "loss": -0.069314144551754, "num_tokens": 13300539.0, "reward": 0.9453125, "reward_std": 0.7656680345535278, "rewards/reward_func/mean": 0.10503472222222222, "rewards/reward_func/std": 0.12950749198595682, "sampling/importance_sampling_ratio/max": 2.9998490810394287, "sampling/importance_sampling_ratio/mean": 0.9460821151733398, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.81115436553955, "sampling/sampling_logp_difference/mean": 0.20998844504356384, "step": 87, "step_time": 122.99315962707624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3992.0, "completions/mean_length": 1200.171875, "completions/mean_terminated_length": 954.7626953125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "degenerate_groups_filtered": 1.0, "entropy": 0.5681051015853882, "epoch": 0.21674876847290642, "frac_reward_zero_std": 0.25, "grad_norm": 0.001914027338477591, "kl": 0.0006996607990004122, "learning_rate": 4.968418898454199e-05, "loss": -0.011678352952003479, "num_tokens": 13454422.0, "reward": 1.0703125, "reward_std": 0.37392371892929077, "rewards/reward_func/mean": 0.1189236111111111, "rewards/reward_func/std": 0.05488494038581848, "sampling/importance_sampling_ratio/max": 2.999253511428833, "sampling/importance_sampling_ratio/mean": 0.9604488611221313, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.74988842010498, "sampling/sampling_logp_difference/mean": 0.1624874323606491, "step": 88, "step_time": 130.86213548691012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2659.0, "completions/max_terminated_length": 2659.0, "completions/mean_length": 979.9375, "completions/mean_terminated_length": 971.825439453125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7630282044410706, "epoch": 0.21921182266009853, "frac_reward_zero_std": 0.0, "grad_norm": 0.003157074850135714, "kl": 0.0007990372541826218, "learning_rate": 4.967645606776047e-05, "loss": -0.017293542623519897, "num_tokens": 13601730.0, "reward": 1.0078125, "reward_std": 0.36451956629753113, "rewards/reward_func/mean": 0.11197916666666667, "rewards/reward_func/std": 0.05453648335403866, "sampling/importance_sampling_ratio/max": 2.999340057373047, "sampling/importance_sampling_ratio/mean": 0.9504854679107666, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.437399864196777, "sampling/sampling_logp_difference/mean": 0.20192095637321472, "step": 89, "step_time": 83.2202613428235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3322.0, "completions/mean_length": 1119.65625, "completions/mean_terminated_length": 973.2786254882812, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "degenerate_groups_filtered": 0.0, "entropy": 0.664212629199028, "epoch": 0.22167487684729065, "frac_reward_zero_std": 0.25, "grad_norm": 0.006293474969162976, "kl": 0.0009798562823561952, "learning_rate": 4.966863023536523e-05, "loss": 0.059450846165418625, "num_tokens": 13754572.0, "reward": 1.0390625, "reward_std": 0.7627471685409546, "rewards/reward_func/mean": 0.1154513888888889, "rewards/reward_func/std": 0.14111600981818306, "sampling/importance_sampling_ratio/max": 2.9967024326324463, "sampling/importance_sampling_ratio/mean": 0.9532588720321655, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.812085151672363, "sampling/sampling_logp_difference/mean": 0.19800463318824768, "step": 90, "step_time": 198.11143169808201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3196.0, "completions/mean_length": 1016.890625, "completions/mean_terminated_length": 917.5645141601562, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6830228269100189, "epoch": 0.22413793103448276, "frac_reward_zero_std": 0.0, "grad_norm": 0.005031840669452568, "kl": 0.0010694598604459316, "learning_rate": 4.96607115168233e-05, "loss": 0.0734919086098671, "num_tokens": 13895253.0, "reward": 0.96484375, "reward_std": 0.40792542695999146, "rewards/reward_func/mean": 0.1072048611111111, "rewards/reward_func/std": 0.06053559978802999, "sampling/importance_sampling_ratio/max": 2.9970076084136963, "sampling/importance_sampling_ratio/mean": 0.9569849967956543, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.747546195983887, "sampling/sampling_logp_difference/mean": 0.17965635657310486, "step": 91, "step_time": 118.30531685194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4063.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 1106.8125, "completions/mean_terminated_length": 1109.920654296875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7050769776105881, "epoch": 0.22660098522167488, "frac_reward_zero_std": 0.0, "grad_norm": 0.004941963891636721, "kl": 0.0008765287348069251, "learning_rate": 4.965269994195146e-05, "loss": -0.013432206586003304, "num_tokens": 14055561.0, "reward": 0.9453125, "reward_std": 0.4069552719593048, "rewards/reward_func/mean": 0.10503472222222222, "rewards/reward_func/std": 0.062094110581609935, "sampling/importance_sampling_ratio/max": 2.9970788955688477, "sampling/importance_sampling_ratio/mean": 0.9504680633544922, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.066850662231445, "sampling/sampling_logp_difference/mean": 0.19342385232448578, "step": 92, "step_time": 124.41021094494499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2304.0, "completions/mean_length": 958.171875, "completions/mean_terminated_length": 869.1638793945312, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6579289436340332, "epoch": 0.229064039408867, "frac_reward_zero_std": 0.25, "grad_norm": 0.005850634507404222, "kl": 0.0009105180070037022, "learning_rate": 4.964459554091615e-05, "loss": 0.0003941170871257782, "num_tokens": 14196180.0, "reward": 1.15625, "reward_std": 0.6431877613067627, "rewards/reward_func/mean": 0.1284722222222222, "rewards/reward_func/std": 0.14959995283020866, "sampling/importance_sampling_ratio/max": 2.998795747756958, "sampling/importance_sampling_ratio/mean": 0.9569849967956543, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.074917793273926, "sampling/sampling_logp_difference/mean": 0.18225592374801636, "step": 93, "step_time": 128.9647894760128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3898.0, "completions/mean_length": 1015.65625, "completions/mean_terminated_length": 926.7868041992188, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6771551072597504, "epoch": 0.2315270935960591, "frac_reward_zero_std": 0.0, "grad_norm": 0.006173881670119516, "kl": 0.0010466481326147914, "learning_rate": 4.9636398344233294e-05, "loss": 0.06990549713373184, "num_tokens": 14340030.0, "reward": 1.05078125, "reward_std": 0.7348734736442566, "rewards/reward_func/mean": 0.11675347222222222, "rewards/reward_func/std": 0.1158642934428321, "sampling/importance_sampling_ratio/max": 2.9963386058807373, "sampling/importance_sampling_ratio/mean": 0.9552966356277466, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.304615020751953, "sampling/sampling_logp_difference/mean": 0.18898534774780273, "step": 94, "step_time": 120.63474641623907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3279.0, "completions/mean_length": 670.046875, "completions/mean_terminated_length": 615.6666870117188, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7438563853502274, "epoch": 0.23399014778325122, "frac_reward_zero_std": 0.0, "grad_norm": 0.007035370635253527, "kl": 0.0012569701357278973, "learning_rate": 4.9628108382768255e-05, "loss": -0.003529743291437626, "num_tokens": 14478481.0, "reward": 0.953125, "reward_std": 0.4794900715351105, "rewards/reward_func/mean": 0.10590277777777778, "rewards/reward_func/std": 0.07061862614419726, "sampling/importance_sampling_ratio/max": 2.9977564811706543, "sampling/importance_sampling_ratio/mean": 0.9566473960876465, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.311806678771973, "sampling/sampling_logp_difference/mean": 0.18983232975006104, "step": 95, "step_time": 136.43330346024595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3520.0, "completions/mean_length": 1320.671875, "completions/mean_terminated_length": 1151.413818359375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7065405398607254, "epoch": 0.23645320197044334, "frac_reward_zero_std": 0.0, "grad_norm": 0.008568856880559883, "kl": 0.0011838628561235964, "learning_rate": 4.9619725687735686e-05, "loss": -0.011811807751655579, "num_tokens": 14657916.0, "reward": 1.00390625, "reward_std": 0.8539034724235535, "rewards/reward_func/mean": 0.1115451388888889, "rewards/reward_func/std": 0.16876447200775146, "sampling/importance_sampling_ratio/max": 2.9951701164245605, "sampling/importance_sampling_ratio/mean": 0.9421722888946533, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.175261497497559, "sampling/sampling_logp_difference/mean": 0.2128932625055313, "step": 96, "step_time": 121.57325341110118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3340.0, "completions/mean_length": 1076.6875, "completions/mean_terminated_length": 995.9835205078125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6008987575769424, "epoch": 0.23891625615763548, "frac_reward_zero_std": 0.0, "grad_norm": 0.006735602377105294, "kl": 0.001147125702118501, "learning_rate": 4.96112502906994e-05, "loss": -0.014930861070752144, "num_tokens": 14817160.0, "reward": 0.97265625, "reward_std": 0.6931025981903076, "rewards/reward_func/mean": 0.10807291666666667, "rewards/reward_func/std": 0.12522114316622415, "sampling/importance_sampling_ratio/max": 2.9980320930480957, "sampling/importance_sampling_ratio/mean": 0.9482098817825317, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.042277336120605, "sampling/sampling_logp_difference/mean": 0.1913895308971405, "step": 97, "step_time": 121.60319680999964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2776.0, "completions/mean_length": 930.390625, "completions/mean_terminated_length": 891.1966552734375, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7253329902887344, "epoch": 0.2413793103448276, "frac_reward_zero_std": 0.0, "grad_norm": 0.0057644207764060195, "kl": 0.0011922722042072564, "learning_rate": 4.960268222357227e-05, "loss": -0.0594397634267807, "num_tokens": 14962129.0, "reward": 0.91796875, "reward_std": 0.5383224487304688, "rewards/reward_func/mean": 0.10199652777777778, "rewards/reward_func/std": 0.07774124874009027, "sampling/importance_sampling_ratio/max": 2.9961249828338623, "sampling/importance_sampling_ratio/mean": 0.9478025436401367, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.086261749267578, "sampling/sampling_logp_difference/mean": 0.20908650755882263, "step": 98, "step_time": 144.51559121510945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3467.0, "completions/mean_length": 836.453125, "completions/mean_terminated_length": 784.7540283203125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7885630130767822, "epoch": 0.2438423645320197, "frac_reward_zero_std": 0.0, "grad_norm": 0.005133721823283165, "kl": 0.0010307101329090074, "learning_rate": 4.959402151861613e-05, "loss": -0.026363378390669823, "num_tokens": 15104718.0, "reward": 0.91796875, "reward_std": 0.36202332377433777, "rewards/reward_func/mean": 0.10199652777777778, "rewards/reward_func/std": 0.05339052610927158, "sampling/importance_sampling_ratio/max": 2.9989402294158936, "sampling/importance_sampling_ratio/mean": 0.9489303827285767, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.430939674377441, "sampling/sampling_logp_difference/mean": 0.21791726350784302, "step": 99, "step_time": 128.17962785507552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2746.0, "completions/mean_length": 817.984375, "completions/mean_terminated_length": 769.3547973632812, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "degenerate_groups_filtered": 0.0, "entropy": 0.668343186378479, "epoch": 0.24630541871921183, "frac_reward_zero_std": 0.25, "grad_norm": 0.004204010827777613, "kl": 0.0011549554765224457, "learning_rate": 4.958526820844158e-05, "loss": -0.028108961880207062, "num_tokens": 15247933.0, "reward": 0.9375, "reward_std": 0.3857583701610565, "rewards/reward_func/mean": 0.10416666666666667, "rewards/reward_func/std": 0.05755675666862064, "sampling/importance_sampling_ratio/max": 2.9985768795013428, "sampling/importance_sampling_ratio/mean": 0.9531916975975037, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.425323486328125, "sampling/sampling_logp_difference/mean": 0.1958668977022171, "step": 100, "step_time": 111.34808640205301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2958.0, "completions/mean_length": 859.625, "completions/mean_terminated_length": 741.4667358398438, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7310220897197723, "epoch": 0.24876847290640394, "frac_reward_zero_std": 0.0, "grad_norm": 0.006198253337158266, "kl": 0.00121245181071572, "learning_rate": 4.957642232600797e-05, "loss": -0.02447034977376461, "num_tokens": 15383989.0, "reward": 1.01171875, "reward_std": 0.5729166865348816, "rewards/reward_func/mean": 0.11241319444444445, "rewards/reward_func/std": 0.08195814821455213, "sampling/importance_sampling_ratio/max": 2.996098518371582, "sampling/importance_sampling_ratio/mean": 0.9529974460601807, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.62481689453125, "sampling/sampling_logp_difference/mean": 0.20314571261405945, "step": 101, "step_time": 125.92396034114063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3345.0, "completions/mean_length": 1127.828125, "completions/mean_terminated_length": 1048.49169921875, "completions/min_length": 31.0, "completions/min_terminated_length": 249.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6528142094612122, "epoch": 0.2512315270935961, "frac_reward_zero_std": 0.0, "grad_norm": 0.0034161702757803136, "kl": 0.0009927775390679017, "learning_rate": 4.956748390462316e-05, "loss": -0.04475435987114906, "num_tokens": 15537162.0, "reward": 0.9609375, "reward_std": 0.3914227783679962, "rewards/reward_func/mean": 0.10677083333333333, "rewards/reward_func/std": 0.05869032442569733, "sampling/importance_sampling_ratio/max": 2.9991648197174072, "sampling/importance_sampling_ratio/mean": 0.9523589015007019, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.799592971801758, "sampling/sampling_logp_difference/mean": 0.1954439878463745, "step": 102, "step_time": 122.85386259504594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3015.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 729.453125, "completions/mean_terminated_length": 742.2257690429688, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "degenerate_groups_filtered": 0.0, "entropy": 0.616577073931694, "epoch": 0.2536945812807882, "frac_reward_zero_std": 0.5, "grad_norm": 0.005423463270168187, "kl": 0.0010207885934505612, "learning_rate": 4.955845297794348e-05, "loss": 0.0010399030288681388, "num_tokens": 15676775.0, "reward": 1.015625, "reward_std": 0.4473799467086792, "rewards/reward_func/mean": 0.11284722222222222, "rewards/reward_func/std": 0.06644997994105022, "sampling/importance_sampling_ratio/max": 2.9987363815307617, "sampling/importance_sampling_ratio/mean": 0.9537383317947388, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.999309539794922, "sampling/sampling_logp_difference/mean": 0.18572832643985748, "step": 103, "step_time": 91.29225635388866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 596.328125, "completions/mean_terminated_length": 540.77783203125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7377785593271255, "epoch": 0.2561576354679803, "frac_reward_zero_std": 0.0, "grad_norm": 0.0057071374591335855, "kl": 0.0016648432065267116, "learning_rate": 4.954932957997359e-05, "loss": -0.011287668719887733, "num_tokens": 15805308.0, "reward": 0.91015625, "reward_std": 0.4115571975708008, "rewards/reward_func/mean": 0.10112847222222222, "rewards/reward_func/std": 0.060685116383764476, "sampling/importance_sampling_ratio/max": 2.9926609992980957, "sampling/importance_sampling_ratio/mean": 0.9554417133331299, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.185643196105957, "sampling/sampling_logp_difference/mean": 0.1904008984565735, "step": 104, "step_time": 142.36614196095616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3505.0, "completions/max_terminated_length": 3505.0, "completions/mean_length": 803.0, "completions/mean_terminated_length": 803.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6270923018455505, "epoch": 0.25862068965517243, "frac_reward_zero_std": 0.25, "grad_norm": 0.0038009793433285183, "kl": 0.001456006633816287, "learning_rate": 4.954011374506632e-05, "loss": 0.003311900421977043, "num_tokens": 15948636.0, "reward": 0.87890625, "reward_std": 0.3507140874862671, "rewards/reward_func/mean": 0.09765625, "rewards/reward_func/std": 0.05090266797277662, "sampling/importance_sampling_ratio/max": 2.9969429969787598, "sampling/importance_sampling_ratio/mean": 0.9581711292266846, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.685257911682129, "sampling/sampling_logp_difference/mean": 0.17381532490253448, "step": 105, "step_time": 110.55873239389621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3710.0, "completions/mean_length": 1168.421875, "completions/mean_terminated_length": 1033.2000732421875, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5577425956726074, "epoch": 0.26108374384236455, "frac_reward_zero_std": 0.0, "grad_norm": 0.004559121558826178, "kl": 0.001145464033470489, "learning_rate": 4.953080550792254e-05, "loss": -0.02663501352071762, "num_tokens": 16098423.0, "reward": 0.9921875, "reward_std": 0.49794965982437134, "rewards/reward_func/mean": 0.11024305555555555, "rewards/reward_func/std": 0.0740637414985233, "sampling/importance_sampling_ratio/max": 2.994739055633545, "sampling/importance_sampling_ratio/mean": 0.9601489305496216, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.123044967651367, "sampling/sampling_logp_difference/mean": 0.16482222080230713, "step": 106, "step_time": 127.26529746688902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3451.0, "completions/max_terminated_length": 3451.0, "completions/mean_length": 908.515625, "completions/mean_terminated_length": 908.515625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7097785025835037, "epoch": 0.26354679802955666, "frac_reward_zero_std": 0.25, "grad_norm": 0.006498712261232801, "kl": 0.001028874219628051, "learning_rate": 4.952140490359108e-05, "loss": 0.01925332471728325, "num_tokens": 16239080.0, "reward": 1.00390625, "reward_std": 0.4969992935657501, "rewards/reward_func/mean": 0.1115451388888889, "rewards/reward_func/std": 0.07306570145818922, "sampling/importance_sampling_ratio/max": 2.996018648147583, "sampling/importance_sampling_ratio/mean": 0.9512661695480347, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 19.228954315185547, "sampling/sampling_logp_difference/mean": 0.20210078358650208, "step": 107, "step_time": 111.68506760639139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2785.0, "completions/max_terminated_length": 2785.0, "completions/mean_length": 654.171875, "completions/mean_terminated_length": 654.171875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6160021126270294, "epoch": 0.2660098522167488, "frac_reward_zero_std": 0.25, "grad_norm": 0.003462118141168412, "kl": 0.0017048600275302306, "learning_rate": 4.951191196746855e-05, "loss": -0.002917511621490121, "num_tokens": 16370051.0, "reward": 0.96484375, "reward_std": 0.28821834921836853, "rewards/reward_func/mean": 0.1072048611111111, "rewards/reward_func/std": 0.04385380778047773, "sampling/importance_sampling_ratio/max": 2.9970760345458984, "sampling/importance_sampling_ratio/mean": 0.9633020758628845, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.874786376953125, "sampling/sampling_logp_difference/mean": 0.16299639642238617, "step": 108, "step_time": 80.04466179292649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 1216.625, "completions/mean_terminated_length": 1011.6551513671875, "completions/min_length": 21.0, "completions/min_terminated_length": 318.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6527849286794662, "epoch": 0.2684729064039409, "frac_reward_zero_std": 0.0, "grad_norm": 0.0024810418704967474, "kl": 0.0012183433573227376, "learning_rate": 4.950232673529922e-05, "loss": -0.02829243801534176, "num_tokens": 16532315.0, "reward": 0.98046875, "reward_std": 0.3708224594593048, "rewards/reward_func/mean": 0.10894097222222222, "rewards/reward_func/std": 0.05599056515428755, "sampling/importance_sampling_ratio/max": 2.9940707683563232, "sampling/importance_sampling_ratio/mean": 0.9564605355262756, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.400651931762695, "sampling/sampling_logp_difference/mean": 0.1813342571258545, "step": 109, "step_time": 152.9524313802831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2671.0, "completions/max_terminated_length": 2671.0, "completions/mean_length": 894.421875, "completions/mean_terminated_length": 926.7212524414062, "completions/min_length": 46.0, "completions/min_terminated_length": 127.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7501505464315414, "epoch": 0.270935960591133, "frac_reward_zero_std": 0.25, "grad_norm": 0.004039668650228865, "kl": 0.0010712686198530719, "learning_rate": 4.9492649243174894e-05, "loss": -0.03601264953613281, "num_tokens": 16674150.0, "reward": 1.00390625, "reward_std": 0.421383261680603, "rewards/reward_func/mean": 0.1115451388888889, "rewards/reward_func/std": 0.0628970828321245, "sampling/importance_sampling_ratio/max": 2.9961328506469727, "sampling/importance_sampling_ratio/mean": 0.9449620842933655, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.124836921691895, "sampling/sampling_logp_difference/mean": 0.2186630368232727, "step": 110, "step_time": 88.27275016298518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3556.0, "completions/mean_length": 1397.828125, "completions/mean_terminated_length": 1248.5167236328125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "degenerate_groups_filtered": 0.0, "entropy": 0.728972002863884, "epoch": 0.2733990147783251, "frac_reward_zero_std": 0.25, "grad_norm": 0.003406792708563326, "kl": 0.0008771911088842899, "learning_rate": 4.948287952753475e-05, "loss": -0.017109831795096397, "num_tokens": 16856187.0, "reward": 0.94140625, "reward_std": 0.407621294260025, "rewards/reward_func/mean": 0.10460069444444445, "rewards/reward_func/std": 0.06210231284300486, "sampling/importance_sampling_ratio/max": 2.9997353553771973, "sampling/importance_sampling_ratio/mean": 0.946989893913269, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.114840507507324, "sampling/sampling_logp_difference/mean": 0.20500458776950836, "step": 111, "step_time": 154.77054870105349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3783.0, "completions/mean_length": 1305.265625, "completions/mean_terminated_length": 1062.2930908203125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6425753086805344, "epoch": 0.27586206896551724, "frac_reward_zero_std": 0.0, "grad_norm": 0.006779910373498959, "kl": 0.0009470495278947055, "learning_rate": 4.947301762516526e-05, "loss": -0.021853812038898468, "num_tokens": 17023532.0, "reward": 1.04296875, "reward_std": 0.5935410857200623, "rewards/reward_func/mean": 0.11588541666666667, "rewards/reward_func/std": 0.10359535614649455, "sampling/importance_sampling_ratio/max": 2.996713161468506, "sampling/importance_sampling_ratio/mean": 0.9557031989097595, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.808320999145508, "sampling/sampling_logp_difference/mean": 0.1847507208585739, "step": 112, "step_time": 125.85401763604023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3423.0, "completions/max_terminated_length": 3423.0, "completions/mean_length": 829.296875, "completions/mean_terminated_length": 829.296875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "degenerate_groups_filtered": 0.0, "entropy": 0.665191113948822, "epoch": 0.27832512315270935, "frac_reward_zero_std": 0.0, "grad_norm": 0.0049727812963450396, "kl": 0.0011728792305802926, "learning_rate": 4.946306357319997e-05, "loss": 0.0018310556188225746, "num_tokens": 17174735.0, "reward": 0.9296875, "reward_std": 0.352059543132782, "rewards/reward_func/mean": 0.1032986111111111, "rewards/reward_func/std": 0.052210075987709895, "sampling/importance_sampling_ratio/max": 2.999053955078125, "sampling/importance_sampling_ratio/mean": 0.9518330693244934, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.943872451782227, "sampling/sampling_logp_difference/mean": 0.19780156016349792, "step": 113, "step_time": 104.48315672390163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3505.0, "completions/max_terminated_length": 3505.0, "completions/mean_length": 872.828125, "completions/mean_terminated_length": 872.828125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6913283914327621, "epoch": 0.28078817733990147, "frac_reward_zero_std": 0.5, "grad_norm": 0.0064203528593588695, "kl": 0.0009301105164922774, "learning_rate": 4.9453017409119416e-05, "loss": 0.06472717970609665, "num_tokens": 17311444.0, "reward": 1.11328125, "reward_std": 0.727751612663269, "rewards/reward_func/mean": 0.12369791666666667, "rewards/reward_func/std": 0.11462434629599254, "sampling/importance_sampling_ratio/max": 2.9989895820617676, "sampling/importance_sampling_ratio/mean": 0.9566206932067871, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.315587043762207, "sampling/sampling_logp_difference/mean": 0.1795286387205124, "step": 114, "step_time": 114.97647374519147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 2426.0, "completions/max_terminated_length": 2426.0, "completions/mean_length": 776.515625, "completions/mean_terminated_length": 780.5423583984375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7222933769226074, "epoch": 0.2832512315270936, "frac_reward_zero_std": 0.0, "grad_norm": 0.004808982424799973, "kl": 0.0010381730098742992, "learning_rate": 4.9442879170750976e-05, "loss": -0.04378526657819748, "num_tokens": 17443477.0, "reward": 1.01953125, "reward_std": 0.5238454341888428, "rewards/reward_func/mean": 0.11328125, "rewards/reward_func/std": 0.10823717713356018, "sampling/importance_sampling_ratio/max": 2.997987747192383, "sampling/importance_sampling_ratio/mean": 0.9515382051467896, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.357341766357422, "sampling/sampling_logp_difference/mean": 0.19661840796470642, "step": 115, "step_time": 80.79798079002649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3933.0, "completions/mean_length": 708.015625, "completions/mean_terminated_length": 654.2381591796875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6727284938097, "epoch": 0.2857142857142857, "frac_reward_zero_std": 0.5, "grad_norm": 0.0020403117969845282, "kl": 0.001005473401164636, "learning_rate": 4.943264889626871e-05, "loss": -0.0025690970942378044, "num_tokens": 17572774.0, "reward": 1.02734375, "reward_std": 0.2524372637271881, "rewards/reward_func/mean": 0.11414930555555555, "rewards/reward_func/std": 0.03793410791291131, "sampling/importance_sampling_ratio/max": 2.9996719360351562, "sampling/importance_sampling_ratio/mean": 0.9610835313796997, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.65058708190918, "sampling/sampling_logp_difference/mean": 0.1769585758447647, "step": 116, "step_time": 140.72442844510078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3501.0, "completions/mean_length": 1290.484375, "completions/mean_terminated_length": 1152.5081787109375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6598100662231445, "epoch": 0.2881773399014778, "frac_reward_zero_std": 0.25, "grad_norm": 0.005709928483840225, "kl": 0.0009271236776839942, "learning_rate": 4.942232662419324e-05, "loss": 0.02777143567800522, "num_tokens": 17757765.0, "reward": 1.01171875, "reward_std": 0.6507469415664673, "rewards/reward_func/mean": 0.11241319444444445, "rewards/reward_func/std": 0.10562626189655727, "sampling/importance_sampling_ratio/max": 2.9967916011810303, "sampling/importance_sampling_ratio/mean": 0.9467035531997681, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.749887466430664, "sampling/sampling_logp_difference/mean": 0.20036232471466064, "step": 117, "step_time": 137.15964733506553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2417.0, "completions/mean_length": 863.890625, "completions/mean_terminated_length": 828.34423828125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6135236918926239, "epoch": 0.29064039408866993, "frac_reward_zero_std": 0.25, "grad_norm": 0.005015909482580426, "kl": 0.0009673014137661085, "learning_rate": 4.941191239339158e-05, "loss": -0.03219921514391899, "num_tokens": 17892414.0, "reward": 0.921875, "reward_std": 0.39559829235076904, "rewards/reward_func/mean": 0.10243055555555555, "rewards/reward_func/std": 0.0586680488453971, "sampling/importance_sampling_ratio/max": 2.992807388305664, "sampling/importance_sampling_ratio/mean": 0.9618625640869141, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.102208137512207, "sampling/sampling_logp_difference/mean": 0.169651061296463, "step": 118, "step_time": 116.0549735748209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2490.0, "completions/max_terminated_length": 2490.0, "completions/mean_length": 547.140625, "completions/mean_terminated_length": 544.8870849609375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "degenerate_groups_filtered": 1.0, "entropy": 0.5702391713857651, "epoch": 0.29310344827586204, "frac_reward_zero_std": 0.25, "grad_norm": 0.005584388944937819, "kl": 0.0014928276068530977, "learning_rate": 4.9401406243077e-05, "loss": 0.0032053731847554445, "num_tokens": 18006743.0, "reward": 1.0234375, "reward_std": 0.4380665123462677, "rewards/reward_func/mean": 0.11371527777777778, "rewards/reward_func/std": 0.09896917310025957, "sampling/importance_sampling_ratio/max": 2.988774299621582, "sampling/importance_sampling_ratio/mean": 0.9679361581802368, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.677656173706055, "sampling/sampling_logp_difference/mean": 0.1493430733680725, "step": 119, "step_time": 75.99547935905866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2589.0, "completions/mean_length": 1029.71875, "completions/mean_terminated_length": 995.8524169921875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7400919646024704, "epoch": 0.2955665024630542, "frac_reward_zero_std": 0.0, "grad_norm": 0.006443843541173749, "kl": 0.001080518588423729, "learning_rate": 4.939080821280889e-05, "loss": 0.002451773267239332, "num_tokens": 18154629.0, "reward": 1.03515625, "reward_std": 0.7208173871040344, "rewards/reward_func/mean": 0.1150173611111111, "rewards/reward_func/std": 0.1304852200878991, "sampling/importance_sampling_ratio/max": 2.9992618560791016, "sampling/importance_sampling_ratio/mean": 0.9474197030067444, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 21.595125198364258, "sampling/sampling_logp_difference/mean": 0.21167021989822388, "step": 120, "step_time": 143.630502771819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3303.0, "completions/mean_length": 1061.234375, "completions/mean_terminated_length": 1023.9677124023438, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7333497107028961, "epoch": 0.29802955665024633, "frac_reward_zero_std": 0.0, "grad_norm": 0.00924462717955816, "kl": 0.0010052941361209378, "learning_rate": 4.9380118342492596e-05, "loss": 0.05594870075583458, "num_tokens": 18319604.0, "reward": 1.08203125, "reward_std": 0.8487323522567749, "rewards/reward_func/mean": 0.12022569444444445, "rewards/reward_func/std": 0.13110164470142788, "sampling/importance_sampling_ratio/max": 2.99625825881958, "sampling/importance_sampling_ratio/mean": 0.9394384622573853, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.43202018737793, "sampling/sampling_logp_difference/mean": 0.22769640386104584, "step": 121, "step_time": 144.13920049113221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3333.0, "completions/max_terminated_length": 3333.0, "completions/mean_length": 1141.796875, "completions/mean_terminated_length": 1141.796875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6576246023178101, "epoch": 0.30049261083743845, "frac_reward_zero_std": 0.25, "grad_norm": 0.004997437097530932, "kl": 0.001344296833849512, "learning_rate": 4.936933667237926e-05, "loss": -3.2415613532066345e-05, "num_tokens": 18492823.0, "reward": 0.984375, "reward_std": 0.4876958429813385, "rewards/reward_func/mean": 0.109375, "rewards/reward_func/std": 0.0719899766974979, "sampling/importance_sampling_ratio/max": 2.9998884201049805, "sampling/importance_sampling_ratio/mean": 0.9471359252929688, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.76190185546875, "sampling/sampling_logp_difference/mean": 0.19633889198303223, "step": 122, "step_time": 107.010423976928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2052.0, "completions/mean_length": 939.9375, "completions/mean_terminated_length": 846.3933715820312, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7716793268918991, "epoch": 0.30295566502463056, "frac_reward_zero_std": 0.0, "grad_norm": 0.005142065502705506, "kl": 0.0012641588837141171, "learning_rate": 4.935846324306571e-05, "loss": -0.04138103872537613, "num_tokens": 18636195.0, "reward": 0.875, "reward_std": 0.3700064420700073, "rewards/reward_func/mean": 0.09722222222222222, "rewards/reward_func/std": 0.05186475647820367, "sampling/importance_sampling_ratio/max": 2.9990382194519043, "sampling/importance_sampling_ratio/mean": 0.9490943551063538, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.95599365234375, "sampling/sampling_logp_difference/mean": 0.21140214800834656, "step": 123, "step_time": 158.32471957895905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2607.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 784.765625, "completions/mean_terminated_length": 784.765625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6249927580356598, "epoch": 0.3054187192118227, "frac_reward_zero_std": 0.0, "grad_norm": 0.0321336624260346, "kl": 0.0011989549384452403, "learning_rate": 4.934749809549427e-05, "loss": -0.01217272412031889, "num_tokens": 18771556.0, "reward": 0.95703125, "reward_std": 0.6386286616325378, "rewards/reward_func/mean": 0.10633680555555555, "rewards/reward_func/std": 0.11104831099510193, "sampling/importance_sampling_ratio/max": 2.99676775932312, "sampling/importance_sampling_ratio/mean": 0.9598660469055176, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.24995231628418, "sampling/sampling_logp_difference/mean": 0.17498475313186646, "step": 124, "step_time": 77.80229415814392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3424.0, "completions/mean_length": 895.421875, "completions/mean_terminated_length": 800.7069091796875, "completions/min_length": 7.0, "completions/min_terminated_length": 81.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6748597323894501, "epoch": 0.3078817733990148, "frac_reward_zero_std": 0.25, "grad_norm": 0.006046409516086105, "kl": 0.0010081128857564181, "learning_rate": 4.9336441270952595e-05, "loss": 0.09001608937978745, "num_tokens": 18905151.0, "reward": 1.01953125, "reward_std": 0.7896963953971863, "rewards/reward_func/mean": 0.11328125, "rewards/reward_func/std": 0.1456383185254203, "sampling/importance_sampling_ratio/max": 2.9961564540863037, "sampling/importance_sampling_ratio/mean": 0.955715537071228, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.451773643493652, "sampling/sampling_logp_difference/mean": 0.19001328945159912, "step": 125, "step_time": 143.38146781618707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2875.0, "completions/mean_length": 1264.296875, "completions/mean_terminated_length": 1024.322021484375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7080798596143723, "epoch": 0.3103448275862069, "frac_reward_zero_std": 0.25, "grad_norm": 0.008165326775365848, "kl": 0.0014844062679912895, "learning_rate": 4.932529281107355e-05, "loss": 0.006085427477955818, "num_tokens": 19074754.0, "reward": 1.1796875, "reward_std": 1.0345890522003174, "rewards/reward_func/mean": 0.1310763888888889, "rewards/reward_func/std": 0.17465003662639195, "sampling/importance_sampling_ratio/max": 2.999812602996826, "sampling/importance_sampling_ratio/mean": 0.9442422389984131, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.812485694885254, "sampling/sampling_logp_difference/mean": 0.2114749252796173, "step": 126, "step_time": 136.04279370303266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2906.0, "completions/max_terminated_length": 2906.0, "completions/mean_length": 730.3125, "completions/mean_terminated_length": 729.5322265625, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8340341448783875, "epoch": 0.312807881773399, "frac_reward_zero_std": 0.0, "grad_norm": 0.01630669112288163, "kl": 0.002556009974796325, "learning_rate": 4.931405275783507e-05, "loss": -0.009080227464437485, "num_tokens": 19230038.0, "reward": 0.92578125, "reward_std": 0.9581311345100403, "rewards/reward_func/mean": 0.10286458333333333, "rewards/reward_func/std": 0.15839683678415087, "sampling/importance_sampling_ratio/max": 2.9990289211273193, "sampling/importance_sampling_ratio/mean": 0.9346531629562378, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.898031234741211, "sampling/sampling_logp_difference/mean": 0.24698056280612946, "step": 127, "step_time": 100.47396804229356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3481.0, "completions/mean_length": 988.28125, "completions/mean_terminated_length": 888.0322265625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "degenerate_groups_filtered": 0.0, "entropy": 0.718447744846344, "epoch": 0.31527093596059114, "frac_reward_zero_std": 0.0, "grad_norm": 0.006766736760887471, "kl": 0.0016968499112408608, "learning_rate": 4.930272115355992e-05, "loss": -0.017601270228624344, "num_tokens": 19374488.0, "reward": 1.0078125, "reward_std": 0.5992202758789062, "rewards/reward_func/mean": 0.11197916666666667, "rewards/reward_func/std": 0.10523506999015808, "sampling/importance_sampling_ratio/max": 2.9975998401641846, "sampling/importance_sampling_ratio/mean": 0.9509903192520142, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.434347152709961, "sampling/sampling_logp_difference/mean": 0.20046307146549225, "step": 128, "step_time": 131.1536720271688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 879.46875, "completions/mean_terminated_length": 775.2000122070312, "completions/min_length": 15.0, "completions/min_terminated_length": 137.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7035564482212067, "epoch": 0.31773399014778325, "frac_reward_zero_std": 0.0, "grad_norm": 0.008916657281684769, "kl": 0.0014704163477290422, "learning_rate": 4.929129804091562e-05, "loss": -0.0401824489235878, "num_tokens": 19516246.0, "reward": 1.0859375, "reward_std": 0.6580073833465576, "rewards/reward_func/mean": 0.12065972222222222, "rewards/reward_func/std": 0.11946020854843988, "sampling/importance_sampling_ratio/max": 2.984513998031616, "sampling/importance_sampling_ratio/mean": 0.9554660320281982, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.787663459777832, "sampling/sampling_logp_difference/mean": 0.1841893047094345, "step": 129, "step_time": 133.5201920508407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3938.0, "completions/max_terminated_length": 3938.0, "completions/mean_length": 829.765625, "completions/mean_terminated_length": 829.765625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7256664037704468, "epoch": 0.32019704433497537, "frac_reward_zero_std": 0.25, "grad_norm": 0.0036159006370788662, "kl": 0.0014483562554232776, "learning_rate": 4.927978346291424e-05, "loss": -0.00416420167312026, "num_tokens": 19648583.0, "reward": 0.98046875, "reward_std": 0.2827889919281006, "rewards/reward_func/mean": 0.10894097222222222, "rewards/reward_func/std": 0.043313807911343045, "sampling/importance_sampling_ratio/max": 2.996049404144287, "sampling/importance_sampling_ratio/mean": 0.9527163505554199, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.433141708374023, "sampling/sampling_logp_difference/mean": 0.19274213910102844, "step": 130, "step_time": 108.99631480290554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3723.0, "completions/mean_length": 989.65625, "completions/mean_terminated_length": 903.7930908203125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "degenerate_groups_filtered": 0.0, "entropy": 0.812716618180275, "epoch": 0.3226600985221675, "frac_reward_zero_std": 0.0, "grad_norm": 0.013116335520916896, "kl": 0.0016703194414731115, "learning_rate": 4.9268177462912255e-05, "loss": -0.05333615094423294, "num_tokens": 19797857.0, "reward": 1.0703125, "reward_std": 1.1246416568756104, "rewards/reward_func/mean": 0.1189236111111111, "rewards/reward_func/std": 0.19068154527081382, "sampling/importance_sampling_ratio/max": 2.9927353858947754, "sampling/importance_sampling_ratio/mean": 0.9444986581802368, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.257890701293945, "sampling/sampling_logp_difference/mean": 0.21850484609603882, "step": 131, "step_time": 118.88544421107508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3857.0, "completions/max_terminated_length": 3857.0, "completions/mean_length": 816.703125, "completions/mean_terminated_length": 817.84130859375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6633237600326538, "epoch": 0.3251231527093596, "frac_reward_zero_std": 0.25, "grad_norm": 0.0031921393393205934, "kl": 0.0013503513764590025, "learning_rate": 4.9256480084610376e-05, "loss": -0.00991955865174532, "num_tokens": 19926286.0, "reward": 1.0625, "reward_std": 0.40089187026023865, "rewards/reward_func/mean": 0.11805555555555555, "rewards/reward_func/std": 0.05971749954753452, "sampling/importance_sampling_ratio/max": 2.9954562187194824, "sampling/importance_sampling_ratio/mean": 0.9606219530105591, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.18454647064209, "sampling/sampling_logp_difference/mean": 0.17874056100845337, "step": 132, "step_time": 110.05460423021577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3840.0, "completions/mean_length": 1232.65625, "completions/mean_terminated_length": 1169.283447265625, "completions/min_length": 1.0, "completions/min_terminated_length": 131.0, "degenerate_groups_filtered": 0.0, "entropy": 0.737604409456253, "epoch": 0.3275862068965517, "frac_reward_zero_std": 0.0, "grad_norm": 0.005410025456930972, "kl": 0.0011540141276782379, "learning_rate": 4.9244691372053376e-05, "loss": 0.0003698645159602165, "num_tokens": 20093112.0, "reward": 1.0234375, "reward_std": 0.6824308037757874, "rewards/reward_func/mean": 0.11371527777777778, "rewards/reward_func/std": 0.1231840882036421, "sampling/importance_sampling_ratio/max": 2.99904465675354, "sampling/importance_sampling_ratio/mean": 0.9456319212913513, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.690722465515137, "sampling/sampling_logp_difference/mean": 0.2077077031135559, "step": 133, "step_time": 136.14795652101748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3575.0, "completions/mean_length": 1119.234375, "completions/mean_terminated_length": 982.9491577148438, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8452684134244919, "epoch": 0.33004926108374383, "frac_reward_zero_std": 0.0, "grad_norm": 0.009834879984087863, "kl": 0.0014841850788798183, "learning_rate": 4.9232811369629936e-05, "loss": -0.09593109786510468, "num_tokens": 20252807.0, "reward": 1.08203125, "reward_std": 0.8177770376205444, "rewards/reward_func/mean": 0.12022569444444445, "rewards/reward_func/std": 0.16325969994068146, "sampling/importance_sampling_ratio/max": 2.999234914779663, "sampling/importance_sampling_ratio/mean": 0.9442738890647888, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.05754280090332, "sampling/sampling_logp_difference/mean": 0.22508864104747772, "step": 134, "step_time": 130.63481510290876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3492.0, "completions/mean_length": 780.5, "completions/mean_terminated_length": 668.9500122070312, "completions/min_length": 22.0, "completions/min_terminated_length": 182.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7695089131593704, "epoch": 0.33251231527093594, "frac_reward_zero_std": 0.0, "grad_norm": 0.0050137511344250825, "kl": 0.001503633859101683, "learning_rate": 4.9220840122072495e-05, "loss": -0.044059619307518005, "num_tokens": 20393879.0, "reward": 1.04296875, "reward_std": 0.5713995099067688, "rewards/reward_func/mean": 0.11588541666666667, "rewards/reward_func/std": 0.12210349573029412, "sampling/importance_sampling_ratio/max": 2.998356819152832, "sampling/importance_sampling_ratio/mean": 0.9478631615638733, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.743224143981934, "sampling/sampling_logp_difference/mean": 0.21353685855865479, "step": 135, "step_time": 128.1372858658433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2537.0, "completions/mean_length": 827.328125, "completions/mean_terminated_length": 756.8275756835938, "completions/min_length": 125.0, "completions/min_terminated_length": 185.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7343933880329132, "epoch": 0.33497536945812806, "frac_reward_zero_std": 0.0, "grad_norm": 0.009246592184747104, "kl": 0.0019113961607217789, "learning_rate": 4.920877767445705e-05, "loss": -0.002634907141327858, "num_tokens": 20544364.0, "reward": 0.9296875, "reward_std": 0.7063610553741455, "rewards/reward_func/mean": 0.1032986111111111, "rewards/reward_func/std": 0.12537386185593075, "sampling/importance_sampling_ratio/max": 2.9994935989379883, "sampling/importance_sampling_ratio/mean": 0.9448176026344299, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.562491416931152, "sampling/sampling_logp_difference/mean": 0.21808388829231262, "step": 136, "step_time": 122.9346935309004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2280.0, "completions/mean_length": 826.921875, "completions/mean_terminated_length": 623.5614013671875, "completions/min_length": 75.0, "completions/min_terminated_length": 85.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6908468455076218, "epoch": 0.3374384236453202, "frac_reward_zero_std": 0.0, "grad_norm": 0.005994547580292389, "kl": 0.0016178832156583667, "learning_rate": 4.919662407220299e-05, "loss": 0.017274843528866768, "num_tokens": 20670935.0, "reward": 0.84375, "reward_std": 0.42374932765960693, "rewards/reward_func/mean": 0.09375, "rewards/reward_func/std": 0.06059717138608297, "sampling/importance_sampling_ratio/max": 2.9996566772460938, "sampling/importance_sampling_ratio/mean": 0.9579602479934692, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.689961433410645, "sampling/sampling_logp_difference/mean": 0.18010303378105164, "step": 137, "step_time": 111.31847057538107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3975.0, "completions/mean_length": 1044.9375, "completions/mean_terminated_length": 996.5079956054688, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7295639663934708, "epoch": 0.3399014778325123, "frac_reward_zero_std": 0.0, "grad_norm": 0.014028663492014952, "kl": 0.0016514419403392822, "learning_rate": 4.918437936107293e-05, "loss": -0.002566501498222351, "num_tokens": 20827923.0, "reward": 1.171875, "reward_std": 1.023411512374878, "rewards/reward_func/mean": 0.13020833333333334, "rewards/reward_func/std": 0.173271753721767, "sampling/importance_sampling_ratio/max": 2.998654365539551, "sampling/importance_sampling_ratio/mean": 0.9453516006469727, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.36987018585205, "sampling/sampling_logp_difference/mean": 0.21218480169773102, "step": 138, "step_time": 127.0127622236032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3486.0, "completions/mean_length": 969.0, "completions/mean_terminated_length": 928.57373046875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7234471142292023, "epoch": 0.34236453201970446, "frac_reward_zero_std": 0.0, "grad_norm": 0.005342188170689295, "kl": 0.0016450581315439194, "learning_rate": 4.9172043587172564e-05, "loss": -0.045484960079193115, "num_tokens": 21004531.0, "reward": 1.00390625, "reward_std": 0.6477864980697632, "rewards/reward_func/mean": 0.1115451388888889, "rewards/reward_func/std": 0.1517499718401167, "sampling/importance_sampling_ratio/max": 2.9931271076202393, "sampling/importance_sampling_ratio/mean": 0.9399889707565308, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.982999801635742, "sampling/sampling_logp_difference/mean": 0.22505322098731995, "step": 139, "step_time": 145.2830160120502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3734.0, "completions/mean_length": 1036.578125, "completions/mean_terminated_length": 883.137939453125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "degenerate_groups_filtered": 0.0, "entropy": 1.0906709432601929, "epoch": 0.3448275862068966, "frac_reward_zero_std": 0.0, "grad_norm": 0.006354945561563261, "kl": 0.0026675731933210045, "learning_rate": 4.915961679695046e-05, "loss": 0.002837133128196001, "num_tokens": 21157112.0, "reward": 1.1953125, "reward_std": 0.8035989999771118, "rewards/reward_func/mean": 0.1328125, "rewards/reward_func/std": 0.18738024102316964, "sampling/importance_sampling_ratio/max": 2.9998462200164795, "sampling/importance_sampling_ratio/mean": 0.9394787549972534, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.561127662658691, "sampling/sampling_logp_difference/mean": 0.23769940435886383, "step": 140, "step_time": 117.01026117592119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3236.0, "completions/mean_length": 1042.078125, "completions/mean_terminated_length": 988.9031982421875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7437245845794678, "epoch": 0.3472906403940887, "frac_reward_zero_std": 0.0, "grad_norm": 0.013162011043027224, "kl": 0.002167004276998341, "learning_rate": 4.914709903719788e-05, "loss": -0.0010647885501384735, "num_tokens": 21309389.0, "reward": 1.0078125, "reward_std": 0.8392276763916016, "rewards/reward_func/mean": 0.11197916666666667, "rewards/reward_func/std": 0.14033709466457367, "sampling/importance_sampling_ratio/max": 2.9996509552001953, "sampling/importance_sampling_ratio/mean": 0.9429901838302612, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.79409122467041, "sampling/sampling_logp_difference/mean": 0.22013148665428162, "step": 141, "step_time": 114.42773477407172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3518.0, "completions/max_terminated_length": 3464.0, "completions/mean_length": 945.75, "completions/mean_terminated_length": 904.9207153320312, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "degenerate_groups_filtered": 0.0, "entropy": 0.821587085723877, "epoch": 0.3497536945812808, "frac_reward_zero_std": 0.0, "grad_norm": 0.011903686956581254, "kl": 0.0028415362467058003, "learning_rate": 4.913449035504865e-05, "loss": 0.01681530475616455, "num_tokens": 21445789.0, "reward": 1.08203125, "reward_std": 0.9407598972320557, "rewards/reward_func/mean": 0.12022569444444445, "rewards/reward_func/std": 0.16332321696811253, "sampling/importance_sampling_ratio/max": 2.9959158897399902, "sampling/importance_sampling_ratio/mean": 0.9527857303619385, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.999815940856934, "sampling/sampling_logp_difference/mean": 0.20200073719024658, "step": 142, "step_time": 101.91747950715944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2125.0, "completions/mean_length": 741.765625, "completions/mean_terminated_length": 688.5238647460938, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7046773880720139, "epoch": 0.3522167487684729, "frac_reward_zero_std": 0.5, "grad_norm": 0.010794463183350216, "kl": 0.0018471547809895128, "learning_rate": 4.912179079797892e-05, "loss": -0.030231602489948273, "num_tokens": 21573950.0, "reward": 1.12109375, "reward_std": 0.6516039371490479, "rewards/reward_func/mean": 0.12456597222222222, "rewards/reward_func/std": 0.11842490235964458, "sampling/importance_sampling_ratio/max": 2.997932195663452, "sampling/importance_sampling_ratio/mean": 0.9545925855636597, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.498371124267578, "sampling/sampling_logp_difference/mean": 0.19408845901489258, "step": 143, "step_time": 138.89513726904988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4020.0, "completions/mean_length": 825.359375, "completions/mean_terminated_length": 719.8547973632812, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6682114452123642, "epoch": 0.35467980295566504, "frac_reward_zero_std": 0.25, "grad_norm": 0.009881096093500623, "kl": 0.0020337939204182476, "learning_rate": 4.910900041380703e-05, "loss": -0.003868427127599716, "num_tokens": 21710037.0, "reward": 0.98046875, "reward_std": 0.6256816983222961, "rewards/reward_func/mean": 0.10894097222222222, "rewards/reward_func/std": 0.10894608166482714, "sampling/importance_sampling_ratio/max": 2.9985740184783936, "sampling/importance_sampling_ratio/mean": 0.9586971402168274, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.846827507019043, "sampling/sampling_logp_difference/mean": 0.17317776381969452, "step": 144, "step_time": 119.17530405288562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3935.0, "completions/mean_length": 1660.03125, "completions/mean_terminated_length": 1629.4576416015625, "completions/min_length": 111.0, "completions/min_terminated_length": 159.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5887557715177536, "epoch": 0.35714285714285715, "frac_reward_zero_std": 0.0, "grad_norm": 0.005607486150752285, "kl": 0.001655459520407021, "learning_rate": 4.909611925069332e-05, "loss": 0.037647098302841187, "num_tokens": 21913143.0, "reward": 1.08203125, "reward_std": 0.8141295313835144, "rewards/reward_func/mean": 0.12022569444444445, "rewards/reward_func/std": 0.1332318385442098, "sampling/importance_sampling_ratio/max": 2.9985463619232178, "sampling/importance_sampling_ratio/mean": 0.9469821453094482, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.999975204467773, "sampling/sampling_logp_difference/mean": 0.1916055977344513, "step": 145, "step_time": 134.20607422688045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3889.0, "completions/mean_length": 896.34375, "completions/mean_terminated_length": 754.322021484375, "completions/min_length": 22.0, "completions/min_terminated_length": 101.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7062448561191559, "epoch": 0.35960591133004927, "frac_reward_zero_std": 0.25, "grad_norm": 0.0031982862153996645, "kl": 0.0019544282113201916, "learning_rate": 4.9083147357139936e-05, "loss": -0.02738259732723236, "num_tokens": 22054397.0, "reward": 1.11328125, "reward_std": 0.5544368028640747, "rewards/reward_func/mean": 0.12369791666666667, "rewards/reward_func/std": 0.12000629636976454, "sampling/importance_sampling_ratio/max": 2.9959394931793213, "sampling/importance_sampling_ratio/mean": 0.9592767357826233, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.54010581970215, "sampling/sampling_logp_difference/mean": 0.18619805574417114, "step": 146, "step_time": 121.56179263815284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3576.0, "completions/mean_length": 991.390625, "completions/mean_terminated_length": 879.7000732421875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7062280178070068, "epoch": 0.3620689655172414, "frac_reward_zero_std": 0.0, "grad_norm": 0.008489732725130188, "kl": 0.002335024008061737, "learning_rate": 4.9070084781990655e-05, "loss": 0.10014194250106812, "num_tokens": 22192422.0, "reward": 1.27734375, "reward_std": 0.9042636752128601, "rewards/reward_func/mean": 0.14192708333333334, "rewards/reward_func/std": 0.14803790383868748, "sampling/importance_sampling_ratio/max": 2.998023748397827, "sampling/importance_sampling_ratio/mean": 0.953277587890625, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.429122924804688, "sampling/sampling_logp_difference/mean": 0.1882876753807068, "step": 147, "step_time": 120.45774253108539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3667.0, "completions/mean_length": 1032.34375, "completions/mean_terminated_length": 983.71435546875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7528516948223114, "epoch": 0.3645320197044335, "frac_reward_zero_std": 0.25, "grad_norm": 0.009651669871581751, "kl": 0.0020805590029340237, "learning_rate": 4.905693157443072e-05, "loss": 0.05875544250011444, "num_tokens": 22341868.0, "reward": 1.203125, "reward_std": 0.8496906757354736, "rewards/reward_func/mean": 0.13368055555555555, "rewards/reward_func/std": 0.13039267228709328, "sampling/importance_sampling_ratio/max": 2.9999241828918457, "sampling/importance_sampling_ratio/mean": 0.9552860856056213, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.058977127075195, "sampling/sampling_logp_difference/mean": 0.18934276700019836, "step": 148, "step_time": 132.01033597579226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3052.0, "completions/mean_length": 1092.125, "completions/mean_terminated_length": 1031.04833984375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6683467030525208, "epoch": 0.3669950738916256, "frac_reward_zero_std": 0.25, "grad_norm": 0.006696869355189105, "kl": 0.002086179330945015, "learning_rate": 4.904368778398662e-05, "loss": 0.08531684428453445, "num_tokens": 22501812.0, "reward": 1.109375, "reward_std": 0.7713559865951538, "rewards/reward_func/mean": 0.1232638888888889, "rewards/reward_func/std": 0.12479497989018758, "sampling/importance_sampling_ratio/max": 2.9943675994873047, "sampling/importance_sampling_ratio/mean": 0.9468928575515747, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.937206268310547, "sampling/sampling_logp_difference/mean": 0.20146730542182922, "step": 149, "step_time": 116.60984491393901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 1074.625, "completions/mean_terminated_length": 923.2069091796875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7355668991804123, "epoch": 0.3694581280788177, "frac_reward_zero_std": 0.25, "grad_norm": 0.006190045620303314, "kl": 0.001839842356275767, "learning_rate": 4.903035346052593e-05, "loss": -0.04892116039991379, "num_tokens": 22656732.0, "reward": 1.0, "reward_std": 0.6267831921577454, "rewards/reward_func/mean": 0.1111111111111111, "rewards/reward_func/std": 0.1120100501510832, "sampling/importance_sampling_ratio/max": 2.99916672706604, "sampling/importance_sampling_ratio/mean": 0.9446475505828857, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.62105655670166, "sampling/sampling_logp_difference/mean": 0.21084731817245483, "step": 150, "step_time": 134.81604075315408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2905.0, "completions/max_terminated_length": 2905.0, "completions/mean_length": 933.0, "completions/mean_terminated_length": 929.2857666015625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6313983350992203, "epoch": 0.37192118226600984, "frac_reward_zero_std": 0.25, "grad_norm": 0.004448968547173259, "kl": 0.001612977299373597, "learning_rate": 4.9016928654257096e-05, "loss": 0.009620461612939835, "num_tokens": 22809196.0, "reward": 1.1171875, "reward_std": 0.4225037693977356, "rewards/reward_func/mean": 0.12413194444444445, "rewards/reward_func/std": 0.060594505733913846, "sampling/importance_sampling_ratio/max": 2.997783899307251, "sampling/importance_sampling_ratio/mean": 0.9538916945457458, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.496200561523438, "sampling/sampling_logp_difference/mean": 0.18262873589992523, "step": 151, "step_time": 89.70710955327377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3902.0, "completions/mean_length": 1262.1875, "completions/mean_terminated_length": 1127.300048828125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8315682709217072, "epoch": 0.37438423645320196, "frac_reward_zero_std": 0.0, "grad_norm": 0.011580758122952332, "kl": 0.0028582040686160326, "learning_rate": 4.9003413415729295e-05, "loss": -0.015760261565446854, "num_tokens": 22980728.0, "reward": 1.03515625, "reward_std": 0.6984494924545288, "rewards/reward_func/mean": 0.1150173611111111, "rewards/reward_func/std": 0.12651590506235758, "sampling/importance_sampling_ratio/max": 2.9961771965026855, "sampling/importance_sampling_ratio/mean": 0.928211510181427, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.83804702758789, "sampling/sampling_logp_difference/mean": 0.26203879714012146, "step": 152, "step_time": 137.6754217641428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 832.359375, "completions/mean_terminated_length": 730.8643798828125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7106350511312485, "epoch": 0.3768472906403941, "frac_reward_zero_std": 0.0, "grad_norm": 0.01284240306421068, "kl": 0.002594699908513576, "learning_rate": 4.898980779583218e-05, "loss": 0.03899255394935608, "num_tokens": 23108255.0, "reward": 1.0859375, "reward_std": 0.7483032941818237, "rewards/reward_func/mean": 0.12065972222222222, "rewards/reward_func/std": 0.14227375719282362, "sampling/importance_sampling_ratio/max": 2.999643087387085, "sampling/importance_sampling_ratio/mean": 0.9569455981254578, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.268088340759277, "sampling/sampling_logp_difference/mean": 0.18348127603530884, "step": 153, "step_time": 114.66229340620339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3680.0, "completions/max_terminated_length": 3680.0, "completions/mean_length": 886.609375, "completions/mean_terminated_length": 865.2222900390625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6759176105260849, "epoch": 0.3793103448275862, "frac_reward_zero_std": 0.5, "grad_norm": 0.009819383443114064, "kl": 0.0015356106159742922, "learning_rate": 4.897611184579575e-05, "loss": 0.04377323016524315, "num_tokens": 23256022.0, "reward": 1.125, "reward_std": 0.5721721649169922, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.0965376885400878, "sampling/importance_sampling_ratio/max": 2.991487741470337, "sampling/importance_sampling_ratio/mean": 0.9485093951225281, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.249042510986328, "sampling/sampling_logp_difference/mean": 0.1910654902458191, "step": 154, "step_time": 107.22631569998339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 1154.984375, "completions/mean_terminated_length": 958.9166870117188, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6965581178665161, "epoch": 0.3817733990147783, "frac_reward_zero_std": 0.0, "grad_norm": 0.009454443083222703, "kl": 0.0019190027960576117, "learning_rate": 4.896232561719011e-05, "loss": -0.031025201082229614, "num_tokens": 23414421.0, "reward": 1.1484375, "reward_std": 0.8576439023017883, "rewards/reward_func/mean": 0.12760416666666666, "rewards/reward_func/std": 0.16676429907480875, "sampling/importance_sampling_ratio/max": 2.9999499320983887, "sampling/importance_sampling_ratio/mean": 0.9523643851280212, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.081517219543457, "sampling/sampling_logp_difference/mean": 0.1926102340221405, "step": 155, "step_time": 137.01870591682382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2384.0, "completions/max_terminated_length": 2384.0, "completions/mean_length": 650.34375, "completions/mean_terminated_length": 646.9677124023438, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6660396009683609, "epoch": 0.3842364532019704, "frac_reward_zero_std": 0.0, "grad_norm": 0.012736599143800455, "kl": 0.0020014344772789627, "learning_rate": 4.8948449161925304e-05, "loss": -0.031402587890625, "num_tokens": 23538395.0, "reward": 1.09375, "reward_std": 0.75, "rewards/reward_func/mean": 0.12152777777777778, "rewards/reward_func/std": 0.12056290772226122, "sampling/importance_sampling_ratio/max": 2.99924635887146, "sampling/importance_sampling_ratio/mean": 0.9617007374763489, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.430522918701172, "sampling/sampling_logp_difference/mean": 0.16973595321178436, "step": 156, "step_time": 74.7309046103619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3604.0, "completions/mean_length": 832.75, "completions/mean_terminated_length": 672.2622680664062, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6982234865427017, "epoch": 0.3866995073891626, "frac_reward_zero_std": 0.25, "grad_norm": 0.01619216229721582, "kl": 0.002053438453003764, "learning_rate": 4.893448253225111e-05, "loss": -0.09056590497493744, "num_tokens": 23664155.0, "reward": 1.2421875, "reward_std": 0.9041008353233337, "rewards/reward_func/mean": 0.13802083333333334, "rewards/reward_func/std": 0.1453225099378162, "sampling/importance_sampling_ratio/max": 2.998002290725708, "sampling/importance_sampling_ratio/mean": 0.9607492089271545, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.44827938079834, "sampling/sampling_logp_difference/mean": 0.17780989408493042, "step": 157, "step_time": 145.718919953797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2861.0, "completions/mean_length": 953.4375, "completions/mean_terminated_length": 873.1034545898438, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7421602457761765, "epoch": 0.3891625615763547, "frac_reward_zero_std": 0.0, "grad_norm": 0.01641507777336797, "kl": 0.0034244628623127937, "learning_rate": 4.892042578075685e-05, "loss": -0.14567188918590546, "num_tokens": 23815959.0, "reward": 1.02734375, "reward_std": 0.7044602632522583, "rewards/reward_func/mean": 0.11414930555555555, "rewards/reward_func/std": 0.1274808877044254, "sampling/importance_sampling_ratio/max": 2.9998927116394043, "sampling/importance_sampling_ratio/mean": 0.9466952085494995, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.74369239807129, "sampling/sampling_logp_difference/mean": 0.21085965633392334, "step": 158, "step_time": 156.47580430214293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 1197.21875, "completions/mean_terminated_length": 1047.27587890625, "completions/min_length": 23.0, "completions/min_terminated_length": 101.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6783465445041656, "epoch": 0.3916256157635468, "frac_reward_zero_std": 0.0, "grad_norm": 0.009919495589656527, "kl": 0.0024570394889451563, "learning_rate": 4.8906278960371176e-05, "loss": -0.07994688302278519, "num_tokens": 23980373.0, "reward": 1.05859375, "reward_std": 0.783943772315979, "rewards/reward_func/mean": 0.11762152777777778, "rewards/reward_func/std": 0.15668830606672499, "sampling/importance_sampling_ratio/max": 2.998675584793091, "sampling/importance_sampling_ratio/mean": 0.9550249576568604, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.630230903625488, "sampling/sampling_logp_difference/mean": 0.18413835763931274, "step": 159, "step_time": 127.59175217780285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4006.0, "completions/mean_length": 1115.921875, "completions/mean_terminated_length": 1032.8834228515625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6052042990922928, "epoch": 0.39408866995073893, "frac_reward_zero_std": 0.0, "grad_norm": 0.008819283784252606, "kl": 0.0022560503566637635, "learning_rate": 4.889204212436189e-05, "loss": 0.017924707382917404, "num_tokens": 24143456.0, "reward": 1.08203125, "reward_std": 0.8104656338691711, "rewards/reward_func/mean": 0.12022569444444445, "rewards/reward_func/std": 0.13666501144568124, "sampling/importance_sampling_ratio/max": 2.9991002082824707, "sampling/importance_sampling_ratio/mean": 0.9468730688095093, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.852204322814941, "sampling/sampling_logp_difference/mean": 0.19665727019309998, "step": 160, "step_time": 138.74941563908942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3037.0, "completions/mean_length": 1053.40625, "completions/mean_terminated_length": 1011.666748046875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6857775151729584, "epoch": 0.39655172413793105, "frac_reward_zero_std": 0.25, "grad_norm": 0.011535143162337612, "kl": 0.0021167479280848056, "learning_rate": 4.8877715326335735e-05, "loss": 0.00892894808202982, "num_tokens": 24309706.0, "reward": 0.9375, "reward_std": 0.6591842174530029, "rewards/reward_func/mean": 0.10416666666666667, "rewards/reward_func/std": 0.11478952235645717, "sampling/importance_sampling_ratio/max": 2.9970932006835938, "sampling/importance_sampling_ratio/mean": 0.9446910619735718, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.179174423217773, "sampling/sampling_logp_difference/mean": 0.21110796928405762, "step": 161, "step_time": 121.95381436008029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 797.953125, "completions/mean_terminated_length": 697.1000366210938, "completions/min_length": 4.0, "completions/min_terminated_length": 254.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6954528391361237, "epoch": 0.39901477832512317, "frac_reward_zero_std": 0.0, "grad_norm": 0.003989546208282082, "kl": 0.002669573645107448, "learning_rate": 4.886329862023818e-05, "loss": -0.021821074187755585, "num_tokens": 24441415.0, "reward": 1.04296875, "reward_std": 0.4466690719127655, "rewards/reward_func/mean": 0.11588541666666667, "rewards/reward_func/std": 0.06591926680670844, "sampling/importance_sampling_ratio/max": 2.9920241832733154, "sampling/importance_sampling_ratio/mean": 0.9608190059661865, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.830057144165039, "sampling/sampling_logp_difference/mean": 0.17658987641334534, "step": 162, "step_time": 123.86876834277064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3498.0, "completions/mean_length": 1097.328125, "completions/mean_terminated_length": 980.2105102539062, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6942517310380936, "epoch": 0.4014778325123153, "frac_reward_zero_std": 0.25, "grad_norm": 0.012712878344654979, "kl": 0.002415290189674124, "learning_rate": 4.884879206035324e-05, "loss": -0.01825246773660183, "num_tokens": 24603996.0, "reward": 1.29296875, "reward_std": 1.1176388263702393, "rewards/reward_func/mean": 0.14366319444444445, "rewards/reward_func/std": 0.1814569118950102, "sampling/importance_sampling_ratio/max": 2.9947588443756104, "sampling/importance_sampling_ratio/mean": 0.9565131068229675, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.88363265991211, "sampling/sampling_logp_difference/mean": 0.18692165613174438, "step": 163, "step_time": 127.92860764102079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3682.0, "completions/mean_length": 1054.828125, "completions/mean_terminated_length": 940.3167114257812, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7391787171363831, "epoch": 0.4039408866995074, "frac_reward_zero_std": 0.0, "grad_norm": 0.009065769445882707, "kl": 0.0027781289536505938, "learning_rate": 4.883419570130327e-05, "loss": -0.10061228275299072, "num_tokens": 24766177.0, "reward": 1.01171875, "reward_std": 0.7746347188949585, "rewards/reward_func/mean": 0.11241319444444445, "rewards/reward_func/std": 0.1506018704838223, "sampling/importance_sampling_ratio/max": 2.998229742050171, "sampling/importance_sampling_ratio/mean": 0.9475909471511841, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.99682903289795, "sampling/sampling_logp_difference/mean": 0.20837771892547607, "step": 164, "step_time": 125.48444975796156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3837.0, "completions/max_terminated_length": 3837.0, "completions/mean_length": 570.28125, "completions/mean_terminated_length": 571.3809814453125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7563356757164001, "epoch": 0.4064039408866995, "frac_reward_zero_std": 0.0, "grad_norm": 0.02071127784172916, "kl": 0.004994380171410739, "learning_rate": 4.881950959804874e-05, "loss": 0.019023362547159195, "num_tokens": 24885267.0, "reward": 1.1328125, "reward_std": 0.9728137254714966, "rewards/reward_func/mean": 0.12586805555555555, "rewards/reward_func/std": 0.1605023874176873, "sampling/importance_sampling_ratio/max": 2.9999263286590576, "sampling/importance_sampling_ratio/mean": 0.9563567638397217, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.745625495910645, "sampling/sampling_logp_difference/mean": 0.19455267488956451, "step": 165, "step_time": 131.7907678987831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 844.078125, "completions/mean_terminated_length": 792.4603881835938, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6852337867021561, "epoch": 0.4088669950738916, "frac_reward_zero_std": 0.5, "grad_norm": 0.02057318733791968, "kl": 0.002958475728519261, "learning_rate": 4.8804733805888024e-05, "loss": -0.11288006603717804, "num_tokens": 25024104.0, "reward": 1.21484375, "reward_std": 0.9320859313011169, "rewards/reward_func/mean": 0.1349826388888889, "rewards/reward_func/std": 0.1531604164176517, "sampling/importance_sampling_ratio/max": 2.9926633834838867, "sampling/importance_sampling_ratio/mean": 0.9570341110229492, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.6238431930542, "sampling/sampling_logp_difference/mean": 0.19065096974372864, "step": 166, "step_time": 130.00207916204818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3418.0, "completions/mean_length": 1142.28125, "completions/mean_terminated_length": 1058.458984375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7697890400886536, "epoch": 0.41133004926108374, "frac_reward_zero_std": 0.0, "grad_norm": 0.02092271355309941, "kl": 0.004163852776400745, "learning_rate": 4.8789868380457246e-05, "loss": -0.1456824094057083, "num_tokens": 25180410.0, "reward": 1.30078125, "reward_std": 1.069211721420288, "rewards/reward_func/mean": 0.14453125, "rewards/reward_func/std": 0.17918562557962206, "sampling/importance_sampling_ratio/max": 2.9986627101898193, "sampling/importance_sampling_ratio/mean": 0.9531635046005249, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.748177528381348, "sampling/sampling_logp_difference/mean": 0.1931055784225464, "step": 167, "step_time": 123.96054100710899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2924.0, "completions/mean_length": 1033.203125, "completions/mean_terminated_length": 886.9661254882812, "completions/min_length": 98.0, "completions/min_terminated_length": 118.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6940324157476425, "epoch": 0.41379310344827586, "frac_reward_zero_std": 0.0, "grad_norm": 0.00879309156772693, "kl": 0.003143699432257563, "learning_rate": 4.8774913377729994e-05, "loss": 0.009899081662297249, "num_tokens": 25344935.0, "reward": 1.08984375, "reward_std": 0.9983788132667542, "rewards/reward_func/mean": 0.12109375, "rewards/reward_func/std": 0.1805433001783159, "sampling/importance_sampling_ratio/max": 2.9867444038391113, "sampling/importance_sampling_ratio/mean": 0.9476098418235779, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 21.72199058532715, "sampling/sampling_logp_difference/mean": 0.20125585794448853, "step": 168, "step_time": 133.38061838923022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3877.0, "completions/max_terminated_length": 3877.0, "completions/mean_length": 994.78125, "completions/mean_terminated_length": 994.78125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7939935922622681, "epoch": 0.41625615763546797, "frac_reward_zero_std": 0.25, "grad_norm": 0.01507058902435564, "kl": 0.00311278022127226, "learning_rate": 4.875986885401717e-05, "loss": 0.033397216349840164, "num_tokens": 25496649.0, "reward": 1.08984375, "reward_std": 0.9045379161834717, "rewards/reward_func/mean": 0.12109375, "rewards/reward_func/std": 0.15391069816218483, "sampling/importance_sampling_ratio/max": 2.9987685680389404, "sampling/importance_sampling_ratio/mean": 0.9507143497467041, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.1064453125, "sampling/sampling_logp_difference/mean": 0.21293172240257263, "step": 169, "step_time": 109.43428984982893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 1882.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 777.046875, "completions/mean_terminated_length": 784.7704467773438, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6964544653892517, "epoch": 0.4187192118226601, "frac_reward_zero_std": 0.0, "grad_norm": 0.018781297681167743, "kl": 0.003129941411316395, "learning_rate": 4.874473486596672e-05, "loss": -0.03748438134789467, "num_tokens": 25655852.0, "reward": 1.1796875, "reward_std": 0.9692379236221313, "rewards/reward_func/mean": 0.1310763888888889, "rewards/reward_func/std": 0.16774308350351122, "sampling/importance_sampling_ratio/max": 2.998063564300537, "sampling/importance_sampling_ratio/mean": 0.9458487629890442, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.899086952209473, "sampling/sampling_logp_difference/mean": 0.2181737720966339, "step": 170, "step_time": 72.11048253579065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3049.0, "completions/mean_length": 987.90625, "completions/mean_terminated_length": 896.5409545898438, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7009437531232834, "epoch": 0.4211822660098522, "frac_reward_zero_std": 0.0, "grad_norm": 0.0162183247531834, "kl": 0.003935195156373084, "learning_rate": 4.8729511470563514e-05, "loss": 0.026689358055591583, "num_tokens": 25804854.0, "reward": 1.34375, "reward_std": 1.4103500843048096, "rewards/reward_func/mean": 0.14930555555555555, "rewards/reward_func/std": 0.20477482179800668, "sampling/importance_sampling_ratio/max": 2.9969730377197266, "sampling/importance_sampling_ratio/mean": 0.95501309633255, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.797805786132812, "sampling/sampling_logp_difference/mean": 0.18509003520011902, "step": 171, "step_time": 154.1194182871841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3533.0, "completions/max_terminated_length": 3533.0, "completions/mean_length": 767.1875, "completions/mean_terminated_length": 738.4762573242188, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6789942979812622, "epoch": 0.4236453201970443, "frac_reward_zero_std": 0.25, "grad_norm": 0.02484970387824421, "kl": 0.0039458731771446764, "learning_rate": 4.871419872512901e-05, "loss": -0.0457112193107605, "num_tokens": 25940050.0, "reward": 1.359375, "reward_std": 1.1179230213165283, "rewards/reward_func/mean": 0.15104166666666666, "rewards/reward_func/std": 0.1697574125395881, "sampling/importance_sampling_ratio/max": 2.999643087387085, "sampling/importance_sampling_ratio/mean": 0.9572591185569763, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.05351734161377, "sampling/sampling_logp_difference/mean": 0.18504132330417633, "step": 172, "step_time": 104.09425508067943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2393.0, "completions/mean_length": 739.171875, "completions/mean_terminated_length": 673.1638793945312, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6542257070541382, "epoch": 0.42610837438423643, "frac_reward_zero_std": 0.0, "grad_norm": 0.022307858591524333, "kl": 0.0056029813713394105, "learning_rate": 4.869879668732115e-05, "loss": -0.04914276301860809, "num_tokens": 26065581.0, "reward": 1.3828125, "reward_std": 1.1529541015625, "rewards/reward_func/mean": 0.15364583333333334, "rewards/reward_func/std": 0.18794474667972988, "sampling/importance_sampling_ratio/max": 2.9967164993286133, "sampling/importance_sampling_ratio/mean": 0.9573915004730225, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.686840057373047, "sampling/sampling_logp_difference/mean": 0.1825268715620041, "step": 173, "step_time": 119.28080543805845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 1752.0, "completions/mean_length": 713.21875, "completions/mean_terminated_length": 601.1802978515625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6988493353128433, "epoch": 0.42857142857142855, "frac_reward_zero_std": 0.25, "grad_norm": 0.018969040626113362, "kl": 0.004114088660571724, "learning_rate": 4.868330541513405e-05, "loss": -0.03230910003185272, "num_tokens": 26194555.0, "reward": 1.28125, "reward_std": 1.0249855518341064, "rewards/reward_func/mean": 0.1423611111111111, "rewards/reward_func/std": 0.16953420970175, "sampling/importance_sampling_ratio/max": 2.997673511505127, "sampling/importance_sampling_ratio/mean": 0.954637885093689, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.62595272064209, "sampling/sampling_logp_difference/mean": 0.19148147106170654, "step": 174, "step_time": 124.16180285089649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4022.0, "completions/max_terminated_length": 4022.0, "completions/mean_length": 1148.6875, "completions/mean_terminated_length": 1197.5423583984375, "completions/min_length": 149.0, "completions/min_terminated_length": 352.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6463498473167419, "epoch": 0.43103448275862066, "frac_reward_zero_std": 0.0, "grad_norm": 0.015462004127244699, "kl": 0.003992202051449567, "learning_rate": 4.866772496689787e-05, "loss": -0.02261742576956749, "num_tokens": 26363159.0, "reward": 1.3203125, "reward_std": 1.0796353816986084, "rewards/reward_func/mean": 0.1467013888888889, "rewards/reward_func/std": 0.19009446766641405, "sampling/importance_sampling_ratio/max": 2.9993505477905273, "sampling/importance_sampling_ratio/mean": 0.9478614926338196, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.764888763427734, "sampling/sampling_logp_difference/mean": 0.1974748969078064, "step": 175, "step_time": 127.3127924175933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4002.0, "completions/mean_length": 1293.46875, "completions/mean_terminated_length": 1174.0, "completions/min_length": 8.0, "completions/min_terminated_length": 287.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6410723924636841, "epoch": 0.43349753694581283, "frac_reward_zero_std": 0.0, "grad_norm": 0.015568149632948742, "kl": 0.005730267730541527, "learning_rate": 4.865205540127851e-05, "loss": 0.02572108432650566, "num_tokens": 26549781.0, "reward": 1.625, "reward_std": 1.3153549432754517, "rewards/reward_func/mean": 0.18055555555555555, "rewards/reward_func/std": 0.20540823373529646, "sampling/importance_sampling_ratio/max": 2.9966816902160645, "sampling/importance_sampling_ratio/mean": 0.9409160017967224, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.302165985107422, "sampling/sampling_logp_difference/mean": 0.20651187002658844, "step": 176, "step_time": 149.82990049594082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2861.0, "completions/mean_length": 1170.84375, "completions/mean_terminated_length": 1078.704833984375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7377604842185974, "epoch": 0.43596059113300495, "frac_reward_zero_std": 0.0, "grad_norm": 0.022361029088014617, "kl": 0.004049715644214302, "learning_rate": 4.863629677727745e-05, "loss": -0.07867275178432465, "num_tokens": 26716651.0, "reward": 1.78515625, "reward_std": 1.599768877029419, "rewards/reward_func/mean": 0.19835069444444445, "rewards/reward_func/std": 0.23245189090569815, "sampling/importance_sampling_ratio/max": 2.999277114868164, "sampling/importance_sampling_ratio/mean": 0.9434698224067688, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.033159255981445, "sampling/sampling_logp_difference/mean": 0.20777705311775208, "step": 177, "step_time": 124.1001110309735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3152.0, "completions/max_terminated_length": 3152.0, "completions/mean_length": 830.140625, "completions/mean_terminated_length": 833.5238647460938, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6503562480211258, "epoch": 0.43842364532019706, "frac_reward_zero_std": 0.0, "grad_norm": 0.023577480739962978, "kl": 0.006311332923360169, "learning_rate": 4.862044915423149e-05, "loss": -0.0761241614818573, "num_tokens": 26848820.0, "reward": 1.609375, "reward_std": 1.490628719329834, "rewards/reward_func/mean": 0.17881944444444445, "rewards/reward_func/std": 0.24757508436838785, "sampling/importance_sampling_ratio/max": 2.9963910579681396, "sampling/importance_sampling_ratio/mean": 0.9584037065505981, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.9031982421875, "sampling/sampling_logp_difference/mean": 0.16852861642837524, "step": 178, "step_time": 88.4958158947993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 3069.0, "completions/max_terminated_length": 3069.0, "completions/mean_length": 822.015625, "completions/mean_terminated_length": 837.4745483398438, "completions/min_length": 19.0, "completions/min_terminated_length": 227.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7814570516347885, "epoch": 0.4408866995073892, "frac_reward_zero_std": 0.0, "grad_norm": 0.029061251701778403, "kl": 0.0067477618576958776, "learning_rate": 4.860451259181259e-05, "loss": -0.05121511220932007, "num_tokens": 26985429.0, "reward": 1.609375, "reward_std": 1.5228908061981201, "rewards/reward_func/mean": 0.17881944444444445, "rewards/reward_func/std": 0.22272776729530758, "sampling/importance_sampling_ratio/max": 2.9979407787323, "sampling/importance_sampling_ratio/mean": 0.9471230506896973, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.142129898071289, "sampling/sampling_logp_difference/mean": 0.2147315889596939, "step": 179, "step_time": 145.50826694909483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3252.0, "completions/mean_length": 1040.078125, "completions/mean_terminated_length": 991.5714721679688, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6426051408052444, "epoch": 0.4433497536945813, "frac_reward_zero_std": 0.0, "grad_norm": 0.02264846469626092, "kl": 0.008284276816993952, "learning_rate": 4.8588487150027514e-05, "loss": -0.011647295206785202, "num_tokens": 27128874.0, "reward": 1.59765625, "reward_std": 1.513574242591858, "rewards/reward_func/mean": 0.1775173611111111, "rewards/reward_func/std": 0.2384296092722151, "sampling/importance_sampling_ratio/max": 2.9980435371398926, "sampling/importance_sampling_ratio/mean": 0.954474687576294, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.294371604919434, "sampling/sampling_logp_difference/mean": 0.1824147254228592, "step": 180, "step_time": 139.90463946410455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3485.0, "completions/mean_length": 946.453125, "completions/mean_terminated_length": 816.4385986328125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6580641269683838, "epoch": 0.4458128078817734, "frac_reward_zero_std": 0.0, "grad_norm": 0.01917106050891827, "kl": 0.00607175484765321, "learning_rate": 4.8572372889217776e-05, "loss": 0.13771361112594604, "num_tokens": 27269127.0, "reward": 1.734375, "reward_std": 1.7643263339996338, "rewards/reward_func/mean": 0.19270833333333334, "rewards/reward_func/std": 0.2612364457713233, "sampling/importance_sampling_ratio/max": 2.999809741973877, "sampling/importance_sampling_ratio/mean": 0.9558506608009338, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.685833930969238, "sampling/sampling_logp_difference/mean": 0.18209989368915558, "step": 181, "step_time": 122.68162017525174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3650.0, "completions/mean_length": 1262.5625, "completions/mean_terminated_length": 1193.482177734375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6180784702301025, "epoch": 0.4482758620689655, "frac_reward_zero_std": 0.0, "grad_norm": 0.014420698262174875, "kl": 0.004996940493583679, "learning_rate": 4.855616987005926e-05, "loss": 0.07041345536708832, "num_tokens": 27439019.0, "reward": 1.34765625, "reward_std": 1.241332769393921, "rewards/reward_func/mean": 0.14973958333333334, "rewards/reward_func/std": 0.1866938124100367, "sampling/importance_sampling_ratio/max": 2.9979941844940186, "sampling/importance_sampling_ratio/mean": 0.9477935433387756, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.707728385925293, "sampling/sampling_logp_difference/mean": 0.1886201798915863, "step": 182, "step_time": 124.8088491272647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3871.0, "completions/mean_length": 1215.671875, "completions/mean_terminated_length": 1141.0509033203125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "degenerate_groups_filtered": 0.0, "entropy": 0.757892832159996, "epoch": 0.45073891625615764, "frac_reward_zero_std": 0.0, "grad_norm": 0.023065960992914185, "kl": 0.0071741442661732435, "learning_rate": 4.853987815356211e-05, "loss": -0.025667782872915268, "num_tokens": 27599814.0, "reward": 1.859375, "reward_std": 1.7388391494750977, "rewards/reward_func/mean": 0.2065972222222222, "rewards/reward_func/std": 0.2955647044711643, "sampling/importance_sampling_ratio/max": 2.9989538192749023, "sampling/importance_sampling_ratio/mean": 0.9460509419441223, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.060905456542969, "sampling/sampling_logp_difference/mean": 0.2087656557559967, "step": 183, "step_time": 126.53920693183318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3071.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 872.046875, "completions/mean_terminated_length": 890.5322265625, "completions/min_length": 44.0, "completions/min_terminated_length": 164.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7055705040693283, "epoch": 0.45320197044334976, "frac_reward_zero_std": 0.0, "grad_norm": 0.028719236256871318, "kl": 0.006613429985009134, "learning_rate": 4.8523497801070394e-05, "loss": -0.09070068597793579, "num_tokens": 27741817.0, "reward": 1.37890625, "reward_std": 1.2809596061706543, "rewards/reward_func/mean": 0.15321180555555555, "rewards/reward_func/std": 0.19664881295628017, "sampling/importance_sampling_ratio/max": 2.988349676132202, "sampling/importance_sampling_ratio/mean": 0.9532193541526794, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.999823570251465, "sampling/sampling_logp_difference/mean": 0.19390341639518738, "step": 184, "step_time": 95.91285739769228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3533.0, "completions/mean_length": 1271.171875, "completions/mean_terminated_length": 1111.0, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6977435797452927, "epoch": 0.45566502463054187, "frac_reward_zero_std": 0.0, "grad_norm": 0.016889524252555597, "kl": 0.007070304825901985, "learning_rate": 4.8507028874261965e-05, "loss": -0.0027643460780382156, "num_tokens": 27911140.0, "reward": 1.4140625, "reward_std": 1.3064391613006592, "rewards/reward_func/mean": 0.15711805555555555, "rewards/reward_func/std": 0.18846042454242706, "sampling/importance_sampling_ratio/max": 2.9996609687805176, "sampling/importance_sampling_ratio/mean": 0.9465343952178955, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.93674087524414, "sampling/sampling_logp_difference/mean": 0.19495807588100433, "step": 185, "step_time": 189.83484322600998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3173.0, "completions/mean_length": 1272.578125, "completions/mean_terminated_length": 1022.8245849609375, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "degenerate_groups_filtered": 0.0, "entropy": 0.647504448890686, "epoch": 0.458128078817734, "frac_reward_zero_std": 0.0, "grad_norm": 0.017992669360486515, "kl": 0.006825581775046885, "learning_rate": 4.8490471435148174e-05, "loss": -0.038769882172346115, "num_tokens": 28083673.0, "reward": 1.40234375, "reward_std": 1.4055002927780151, "rewards/reward_func/mean": 0.1558159722222222, "rewards/reward_func/std": 0.22071651286549038, "sampling/importance_sampling_ratio/max": 2.999582290649414, "sampling/importance_sampling_ratio/mean": 0.9500458240509033, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.749305725097656, "sampling/sampling_logp_difference/mean": 0.18807470798492432, "step": 186, "step_time": 131.8277764460072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3862.0, "completions/mean_length": 1178.859375, "completions/mean_terminated_length": 997.6551513671875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6585134118795395, "epoch": 0.4605911330049261, "frac_reward_zero_std": 0.0, "grad_norm": 0.018530615661706835, "kl": 0.007103452226147056, "learning_rate": 4.8473825546073656e-05, "loss": 0.06069903075695038, "num_tokens": 28230928.0, "reward": 1.61328125, "reward_std": 1.4302082061767578, "rewards/reward_func/mean": 0.1792534722222222, "rewards/reward_func/std": 0.2170476433303621, "sampling/importance_sampling_ratio/max": 2.997567653656006, "sampling/importance_sampling_ratio/mean": 0.9533196687698364, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.04468822479248, "sampling/sampling_logp_difference/mean": 0.17806966602802277, "step": 187, "step_time": 121.11668449593708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 870.734375, "completions/mean_terminated_length": 752.4035034179688, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7276755422353745, "epoch": 0.4630541871921182, "frac_reward_zero_std": 0.0, "grad_norm": 0.017478474676854153, "kl": 0.007267383160069585, "learning_rate": 4.845709126971609e-05, "loss": -0.03535791486501694, "num_tokens": 28369791.0, "reward": 1.40234375, "reward_std": 1.1874151229858398, "rewards/reward_func/mean": 0.1558159722222222, "rewards/reward_func/std": 0.22231183614995745, "sampling/importance_sampling_ratio/max": 2.9939093589782715, "sampling/importance_sampling_ratio/mean": 0.9565584659576416, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.437378883361816, "sampling/sampling_logp_difference/mean": 0.18134385347366333, "step": 188, "step_time": 131.01593620865606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3208.0, "completions/mean_length": 1342.59375, "completions/mean_terminated_length": 1030.732177734375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "degenerate_groups_filtered": 0.0, "entropy": 1.0366933643817902, "epoch": 0.46551724137931033, "frac_reward_zero_std": 0.0, "grad_norm": 0.01591276787266667, "kl": 0.006096535362303257, "learning_rate": 4.844026866908595e-05, "loss": -0.040912821888923645, "num_tokens": 28552389.0, "reward": 1.4609375, "reward_std": 1.260840892791748, "rewards/reward_func/mean": 0.1623263888888889, "rewards/reward_func/std": 0.2152951161066691, "sampling/importance_sampling_ratio/max": 2.9983325004577637, "sampling/importance_sampling_ratio/mean": 0.939391016960144, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.413434028625488, "sampling/sampling_logp_difference/mean": 0.2270013391971588, "step": 189, "step_time": 135.0051975690294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3871.0, "completions/mean_length": 1380.671875, "completions/mean_terminated_length": 1252.2203369140625, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7188487350940704, "epoch": 0.46798029556650245, "frac_reward_zero_std": 0.0, "grad_norm": 0.01357299206191558, "kl": 0.005054897745139897, "learning_rate": 4.8423357807526325e-05, "loss": -0.09534484893083572, "num_tokens": 28729328.0, "reward": 1.35546875, "reward_std": 1.248653769493103, "rewards/reward_func/mean": 0.1506076388888889, "rewards/reward_func/std": 0.20661381714873844, "sampling/importance_sampling_ratio/max": 2.997480630874634, "sampling/importance_sampling_ratio/mean": 0.9424310326576233, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.062469482421875, "sampling/sampling_logp_difference/mean": 0.20740850269794464, "step": 190, "step_time": 130.3345901852008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3946.0, "completions/mean_length": 1181.046875, "completions/mean_terminated_length": 1046.8333740234375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "degenerate_groups_filtered": 0.0, "entropy": 0.9121982902288437, "epoch": 0.47044334975369456, "frac_reward_zero_std": 0.0, "grad_norm": 0.01961347417559075, "kl": 0.006588997668586671, "learning_rate": 4.840635874871259e-05, "loss": -0.0807374119758606, "num_tokens": 28899123.0, "reward": 1.3671875, "reward_std": 1.382263422012329, "rewards/reward_func/mean": 0.1519097222222222, "rewards/reward_func/std": 0.20543800791104636, "sampling/importance_sampling_ratio/max": 2.998202323913574, "sampling/importance_sampling_ratio/mean": 0.9454882144927979, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.819480895996094, "sampling/sampling_logp_difference/mean": 0.21896135807037354, "step": 191, "step_time": 128.1972416790668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3309.0, "completions/mean_length": 920.1875, "completions/mean_terminated_length": 869.3933715820312, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7038512676954269, "epoch": 0.4729064039408867, "frac_reward_zero_std": 0.0, "grad_norm": 0.027061634852367587, "kl": 0.00604074785951525, "learning_rate": 4.838927155665225e-05, "loss": -0.05432787165045738, "num_tokens": 29032895.0, "reward": 1.53515625, "reward_std": 1.5058531761169434, "rewards/reward_func/mean": 0.17057291666666666, "rewards/reward_func/std": 0.2294796390665902, "sampling/importance_sampling_ratio/max": 2.9994328022003174, "sampling/importance_sampling_ratio/mean": 0.9593228101730347, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.059528350830078, "sampling/sampling_logp_difference/mean": 0.17926844954490662, "step": 192, "step_time": 115.40231793024577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3525.0, "completions/mean_length": 781.296875, "completions/mean_terminated_length": 706.6271362304688, "completions/min_length": 5.0, "completions/min_terminated_length": 170.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7098062336444855, "epoch": 0.4753694581280788, "frac_reward_zero_std": 0.0, "grad_norm": 0.021032851847854764, "kl": 0.006787009071558714, "learning_rate": 4.837209629568462e-05, "loss": -0.049884945154190063, "num_tokens": 29161282.0, "reward": 1.4765625, "reward_std": 1.2374118566513062, "rewards/reward_func/mean": 0.1640625, "rewards/reward_func/std": 0.21430290904310015, "sampling/importance_sampling_ratio/max": 2.995041608810425, "sampling/importance_sampling_ratio/mean": 0.9568045735359192, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.902270317077637, "sampling/sampling_logp_difference/mean": 0.1803169846534729, "step": 193, "step_time": 116.49772782274522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3673.0, "completions/mean_length": 1280.890625, "completions/mean_terminated_length": 1118.8035888671875, "completions/min_length": 40.0, "completions/min_terminated_length": 285.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6611451655626297, "epoch": 0.47783251231527096, "frac_reward_zero_std": 0.0, "grad_norm": 0.011058275247570002, "kl": 0.004933293093927205, "learning_rate": 4.8354833030480674e-05, "loss": -0.033044856041669846, "num_tokens": 29337771.0, "reward": 1.2734375, "reward_std": 1.1723660230636597, "rewards/reward_func/mean": 0.14149305555555555, "rewards/reward_func/std": 0.23454364968670738, "sampling/importance_sampling_ratio/max": 2.9890265464782715, "sampling/importance_sampling_ratio/mean": 0.9440937042236328, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.013520240783691, "sampling/sampling_logp_difference/mean": 0.1958637237548828, "step": 194, "step_time": 130.9736714749597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3675.0, "completions/mean_length": 1018.359375, "completions/mean_terminated_length": 917.458984375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7075535953044891, "epoch": 0.4802955665024631, "frac_reward_zero_std": 0.0, "grad_norm": 0.02364851796391656, "kl": 0.005875613307580352, "learning_rate": 4.833748182604273e-05, "loss": -0.031076285988092422, "num_tokens": 29488930.0, "reward": 1.44921875, "reward_std": 1.387429118156433, "rewards/reward_func/mean": 0.16102430555555555, "rewards/reward_func/std": 0.2225587632921007, "sampling/importance_sampling_ratio/max": 2.9986376762390137, "sampling/importance_sampling_ratio/mean": 0.9525994062423706, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.249788284301758, "sampling/sampling_logp_difference/mean": 0.19980862736701965, "step": 195, "step_time": 137.75460224575363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2393.0, "completions/mean_length": 947.40625, "completions/mean_terminated_length": 851.4500732421875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8509411960840225, "epoch": 0.4827586206896552, "frac_reward_zero_std": 0.0, "grad_norm": 0.02013512783074024, "kl": 0.0069408094277605414, "learning_rate": 4.832004274770422e-05, "loss": -0.05644526705145836, "num_tokens": 29629100.0, "reward": 1.24609375, "reward_std": 1.2813467979431152, "rewards/reward_func/mean": 0.1384548611111111, "rewards/reward_func/std": 0.21960993111133575, "sampling/importance_sampling_ratio/max": 2.996084451675415, "sampling/importance_sampling_ratio/mean": 0.9489051103591919, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.714995384216309, "sampling/sampling_logp_difference/mean": 0.21050521731376648, "step": 196, "step_time": 125.77825205796398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3440.0, "completions/mean_length": 1200.953125, "completions/mean_terminated_length": 737.6481323242188, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7732942551374435, "epoch": 0.4852216748768473, "frac_reward_zero_std": 0.0, "grad_norm": 0.012400100349164389, "kl": 0.005751760327257216, "learning_rate": 4.8302515861129474e-05, "loss": -0.024515990167856216, "num_tokens": 29791833.0, "reward": 1.14453125, "reward_std": 1.1741297245025635, "rewards/reward_func/mean": 0.1271701388888889, "rewards/reward_func/std": 0.1953661491473516, "sampling/importance_sampling_ratio/max": 2.997990846633911, "sampling/importance_sampling_ratio/mean": 0.9584062099456787, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.944055557250977, "sampling/sampling_logp_difference/mean": 0.19072258472442627, "step": 197, "step_time": 126.20407959399745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 1286.90625, "completions/mean_terminated_length": 1082.9830322265625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7938503175973892, "epoch": 0.4876847290640394, "frac_reward_zero_std": 0.0, "grad_norm": 0.01640850667993589, "kl": 0.004297417588531971, "learning_rate": 4.828490123231342e-05, "loss": -0.06255729496479034, "num_tokens": 29961379.0, "reward": 1.2421875, "reward_std": 1.15467369556427, "rewards/reward_func/mean": 0.13802083333333334, "rewards/reward_func/std": 0.19956799844900766, "sampling/importance_sampling_ratio/max": 2.997150421142578, "sampling/importance_sampling_ratio/mean": 0.9441516995429993, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.244465827941895, "sampling/sampling_logp_difference/mean": 0.21875452995300293, "step": 198, "step_time": 122.48823585105129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 913.578125, "completions/mean_terminated_length": 778.440673828125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7322440892457962, "epoch": 0.49014778325123154, "frac_reward_zero_std": 0.0, "grad_norm": 0.018865350006102854, "kl": 0.004258246102835983, "learning_rate": 4.8267198927581415e-05, "loss": 0.0023453934118151665, "num_tokens": 30104040.0, "reward": 1.2734375, "reward_std": 0.937235414981842, "rewards/reward_func/mean": 0.14149305555555555, "rewards/reward_func/std": 0.16726858417193094, "sampling/importance_sampling_ratio/max": 2.9988765716552734, "sampling/importance_sampling_ratio/mean": 0.9504024982452393, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.14719009399414, "sampling/sampling_logp_difference/mean": 0.2065386325120926, "step": 199, "step_time": 119.87218592292629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3342.0, "completions/mean_length": 1433.75, "completions/mean_terminated_length": 1164.9454345703125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6855715811252594, "epoch": 0.49261083743842365, "frac_reward_zero_std": 0.25, "grad_norm": 0.010089758857000182, "kl": 0.004258447384927422, "learning_rate": 4.824940901358889e-05, "loss": -0.054729118943214417, "num_tokens": 30266920.0, "reward": 1.203125, "reward_std": 1.0195266008377075, "rewards/reward_func/mean": 0.13368055555555555, "rewards/reward_func/std": 0.1727140380276574, "sampling/importance_sampling_ratio/max": 2.9999141693115234, "sampling/importance_sampling_ratio/mean": 0.950796365737915, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.130544662475586, "sampling/sampling_logp_difference/mean": 0.19564782083034515, "step": 200, "step_time": 124.14298599702306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3757.0, "completions/mean_length": 1323.84375, "completions/mean_terminated_length": 1124.913818359375, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7940724641084671, "epoch": 0.49507389162561577, "frac_reward_zero_std": 0.0, "grad_norm": 0.013795859181479811, "kl": 0.004771111474838108, "learning_rate": 4.82315315573212e-05, "loss": -0.009237069636583328, "num_tokens": 30443342.0, "reward": 1.48828125, "reward_std": 1.2347720861434937, "rewards/reward_func/mean": 0.16536458333333334, "rewards/reward_func/std": 0.22848578625255161, "sampling/importance_sampling_ratio/max": 2.9994006156921387, "sampling/importance_sampling_ratio/mean": 0.9410449266433716, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.181767463684082, "sampling/sampling_logp_difference/mean": 0.22467756271362305, "step": 201, "step_time": 135.13719806890003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2821.0, "completions/mean_length": 786.0625, "completions/mean_terminated_length": 716.9508056640625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6506723165512085, "epoch": 0.4975369458128079, "frac_reward_zero_std": 0.0, "grad_norm": 0.02002007843057624, "kl": 0.0043604791280813515, "learning_rate": 4.8213566626093316e-05, "loss": -0.04014727473258972, "num_tokens": 30573922.0, "reward": 1.4140625, "reward_std": 1.193749189376831, "rewards/reward_func/mean": 0.15711805555555555, "rewards/reward_func/std": 0.18327190147505867, "sampling/importance_sampling_ratio/max": 2.996601104736328, "sampling/importance_sampling_ratio/mean": 0.9635285139083862, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.511571884155273, "sampling/sampling_logp_difference/mean": 0.15732041001319885, "step": 202, "step_time": 115.08086889213882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2581.0, "completions/mean_length": 925.65625, "completions/mean_terminated_length": 805.4667358398438, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8921978324651718, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.016980813259468056, "kl": 0.005670565296895802, "learning_rate": 4.819551428754957e-05, "loss": -0.04834799841046333, "num_tokens": 30718876.0, "reward": 1.2265625, "reward_std": 1.034828782081604, "rewards/reward_func/mean": 0.1362847222222222, "rewards/reward_func/std": 0.18432046307457817, "sampling/importance_sampling_ratio/max": 2.9978744983673096, "sampling/importance_sampling_ratio/mean": 0.951756477355957, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.001660346984863, "sampling/sampling_logp_difference/mean": 0.20985287427902222, "step": 203, "step_time": 125.05054689198732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3314.0, "completions/mean_length": 1142.5, "completions/mean_terminated_length": 1064.34423828125, "completions/min_length": 3.0, "completions/min_terminated_length": 319.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6664412915706635, "epoch": 0.5024630541871922, "frac_reward_zero_std": 0.0, "grad_norm": 0.0167733571502946, "kl": 0.0050976480124518275, "learning_rate": 4.8177374609663415e-05, "loss": 0.05545557290315628, "num_tokens": 30873740.0, "reward": 1.41015625, "reward_std": 1.5264644622802734, "rewards/reward_func/mean": 0.1566840277777778, "rewards/reward_func/std": 0.2437412308322059, "sampling/importance_sampling_ratio/max": 2.992414951324463, "sampling/importance_sampling_ratio/mean": 0.9548521637916565, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.351396560668945, "sampling/sampling_logp_difference/mean": 0.18453365564346313, "step": 204, "step_time": 135.3645177448634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3267.0, "completions/mean_length": 814.234375, "completions/mean_terminated_length": 762.36669921875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "degenerate_groups_filtered": 0.0, "entropy": 0.699128270149231, "epoch": 0.5049261083743842, "frac_reward_zero_std": 0.0, "grad_norm": 0.02369324206135809, "kl": 0.005623727338388562, "learning_rate": 4.815914766073719e-05, "loss": 0.06875115633010864, "num_tokens": 31009723.0, "reward": 1.51953125, "reward_std": 1.43462336063385, "rewards/reward_func/mean": 0.16883680555555555, "rewards/reward_func/std": 0.19979824622472128, "sampling/importance_sampling_ratio/max": 2.9969117641448975, "sampling/importance_sampling_ratio/mean": 0.9596514105796814, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.337469100952148, "sampling/sampling_logp_difference/mean": 0.18011973798274994, "step": 205, "step_time": 116.27848253119737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2744.0, "completions/mean_length": 927.265625, "completions/mean_terminated_length": 865.6557006835938, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6605761498212814, "epoch": 0.5073891625615764, "frac_reward_zero_std": 0.0, "grad_norm": 0.017280983168777832, "kl": 0.005327857914380729, "learning_rate": 4.8140833509401815e-05, "loss": 0.0009279539808630943, "num_tokens": 31147436.0, "reward": 1.265625, "reward_std": 1.1364060640335083, "rewards/reward_func/mean": 0.140625, "rewards/reward_func/std": 0.16583361559444004, "sampling/importance_sampling_ratio/max": 2.998948097229004, "sampling/importance_sampling_ratio/mean": 0.9557917714118958, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.98939323425293, "sampling/sampling_logp_difference/mean": 0.18955284357070923, "step": 206, "step_time": 134.5779933303129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 3640.0, "completions/max_terminated_length": 3640.0, "completions/mean_length": 967.21875, "completions/mean_terminated_length": 956.2542114257812, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "degenerate_groups_filtered": 0.0, "entropy": 0.70358906686306, "epoch": 0.5098522167487685, "frac_reward_zero_std": 0.0, "grad_norm": 0.02323335844168167, "kl": 0.00514031108468771, "learning_rate": 4.812243222461658e-05, "loss": 0.046197690069675446, "num_tokens": 31295338.0, "reward": 1.75, "reward_std": 1.6097421646118164, "rewards/reward_func/mean": 0.19444444444444445, "rewards/reward_func/std": 0.2402564717663659, "sampling/importance_sampling_ratio/max": 2.9992141723632812, "sampling/importance_sampling_ratio/mean": 0.9542554616928101, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.864341735839844, "sampling/sampling_logp_difference/mean": 0.1839657723903656, "step": 207, "step_time": 107.93793587083928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3272.0, "completions/mean_length": 1129.546875, "completions/mean_terminated_length": 1040.2130126953125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6552405655384064, "epoch": 0.5123152709359606, "frac_reward_zero_std": 0.0, "grad_norm": 0.02923078335322443, "kl": 0.004710226668976247, "learning_rate": 4.8103943875668844e-05, "loss": -0.08094117045402527, "num_tokens": 31452973.0, "reward": 2.140625, "reward_std": 1.937451958656311, "rewards/reward_func/mean": 0.2378472222222222, "rewards/reward_func/std": 0.27044207023249733, "sampling/importance_sampling_ratio/max": 2.994187593460083, "sampling/importance_sampling_ratio/mean": 0.9509314298629761, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.53609848022461, "sampling/sampling_logp_difference/mean": 0.18808647990226746, "step": 208, "step_time": 135.48419915605336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2688.0, "completions/mean_length": 996.5625, "completions/mean_terminated_length": 811.385986328125, "completions/min_length": 108.0, "completions/min_terminated_length": 124.0, "degenerate_groups_filtered": 0.0, "entropy": 0.662027508020401, "epoch": 0.5147783251231527, "frac_reward_zero_std": 0.0, "grad_norm": 0.02570822604626865, "kl": 0.008059444488026202, "learning_rate": 4.8085368532173804e-05, "loss": -0.10004392266273499, "num_tokens": 31600961.0, "reward": 2.04296875, "reward_std": 1.8801145553588867, "rewards/reward_func/mean": 0.2269965277777778, "rewards/reward_func/std": 0.26916251911057365, "sampling/importance_sampling_ratio/max": 2.9979264736175537, "sampling/importance_sampling_ratio/mean": 0.9550250172615051, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.761881828308105, "sampling/sampling_logp_difference/mean": 0.18191197514533997, "step": 209, "step_time": 124.38241540174931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2389.0, "completions/mean_length": 986.203125, "completions/mean_terminated_length": 874.8851928710938, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6463757306337357, "epoch": 0.5172413793103449, "frac_reward_zero_std": 0.0, "grad_norm": 0.022735434662613345, "kl": 0.008034229511395097, "learning_rate": 4.806670626407422e-05, "loss": -0.09111690521240234, "num_tokens": 31747678.0, "reward": 1.80078125, "reward_std": 1.6863008737564087, "rewards/reward_func/mean": 0.20008680555555555, "rewards/reward_func/std": 0.27082228660583496, "sampling/importance_sampling_ratio/max": 2.9977269172668457, "sampling/importance_sampling_ratio/mean": 0.950595498085022, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.483198165893555, "sampling/sampling_logp_difference/mean": 0.1938043236732483, "step": 210, "step_time": 162.62786612519994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2066.0, "completions/mean_length": 1204.53125, "completions/mean_terminated_length": 807.370361328125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6006270796060562, "epoch": 0.5197044334975369, "frac_reward_zero_std": 0.0, "grad_norm": 0.018144455499452674, "kl": 0.008116140263155103, "learning_rate": 4.804795714164015e-05, "loss": -0.05340362340211868, "num_tokens": 31909312.0, "reward": 2.09375, "reward_std": 1.914595127105713, "rewards/reward_func/mean": 0.2326388888888889, "rewards/reward_func/std": 0.305544869767295, "sampling/importance_sampling_ratio/max": 2.997669219970703, "sampling/importance_sampling_ratio/mean": 0.9609960317611694, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.310054779052734, "sampling/sampling_logp_difference/mean": 0.156742125749588, "step": 211, "step_time": 120.653579573147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3924.0, "completions/mean_length": 1445.078125, "completions/mean_terminated_length": 1236.310302734375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7738412618637085, "epoch": 0.5221674876847291, "frac_reward_zero_std": 0.0, "grad_norm": 0.02789054476062182, "kl": 0.00692247343249619, "learning_rate": 4.8029121235468696e-05, "loss": -0.20333515107631683, "num_tokens": 32088485.0, "reward": 2.5859375, "reward_std": 2.116741180419922, "rewards/reward_func/mean": 0.2873263888888889, "rewards/reward_func/std": 0.316247637073199, "sampling/importance_sampling_ratio/max": 2.9986884593963623, "sampling/importance_sampling_ratio/mean": 0.9414281249046326, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.835151672363281, "sampling/sampling_logp_difference/mean": 0.21052196621894836, "step": 212, "step_time": 136.5137119400315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3337.0, "completions/mean_length": 1149.40625, "completions/mean_terminated_length": 1064.2950439453125, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8650064170360565, "epoch": 0.5246305418719212, "frac_reward_zero_std": 0.0, "grad_norm": 0.027477140042687382, "kl": 0.008720705634914339, "learning_rate": 4.8010198616483736e-05, "loss": 0.21267648041248322, "num_tokens": 32249679.0, "reward": 2.20703125, "reward_std": 1.8620877265930176, "rewards/reward_func/mean": 0.24522569444444445, "rewards/reward_func/std": 0.25635351406203377, "sampling/importance_sampling_ratio/max": 2.995835781097412, "sampling/importance_sampling_ratio/mean": 0.9504793882369995, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.059192657470703, "sampling/sampling_logp_difference/mean": 0.1958104521036148, "step": 213, "step_time": 123.98496635304764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3253.0, "completions/mean_length": 995.390625, "completions/mean_terminated_length": 842.901611328125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6862529069185257, "epoch": 0.5270935960591133, "frac_reward_zero_std": 0.0, "grad_norm": 0.027503322526612032, "kl": 0.008734274189919233, "learning_rate": 4.799118935593563e-05, "loss": 0.00571461021900177, "num_tokens": 32397592.0, "reward": 2.296875, "reward_std": 2.0537784099578857, "rewards/reward_func/mean": 0.2552083333333333, "rewards/reward_func/std": 0.30399200485812294, "sampling/importance_sampling_ratio/max": 2.970259666442871, "sampling/importance_sampling_ratio/mean": 0.9581179618835449, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.819830894470215, "sampling/sampling_logp_difference/mean": 0.1745845526456833, "step": 214, "step_time": 125.05398750072345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3835.0, "completions/mean_length": 1262.625, "completions/mean_terminated_length": 1118.7166748046875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6201090812683105, "epoch": 0.5295566502463054, "frac_reward_zero_std": 0.0, "grad_norm": 0.029608888003022468, "kl": 0.01200737664476037, "learning_rate": 4.797209352540101e-05, "loss": -0.18912369012832642, "num_tokens": 32558048.0, "reward": 2.640625, "reward_std": 2.0236082077026367, "rewards/reward_func/mean": 0.2934027777777778, "rewards/reward_func/std": 0.2782379339138667, "sampling/importance_sampling_ratio/max": 2.9898691177368164, "sampling/importance_sampling_ratio/mean": 0.9521766901016235, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.811450958251953, "sampling/sampling_logp_difference/mean": 0.18017134070396423, "step": 215, "step_time": 119.14505596109666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3035.0, "completions/mean_length": 1009.796875, "completions/mean_terminated_length": 858.016357421875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7858224660158157, "epoch": 0.5320197044334976, "frac_reward_zero_std": 0.0, "grad_norm": 0.04291691770310537, "kl": 0.011967223603278399, "learning_rate": 4.7952911196782426e-05, "loss": -0.01381763070821762, "num_tokens": 32710515.0, "reward": 2.515625, "reward_std": 2.1830310821533203, "rewards/reward_func/mean": 0.2795138888888889, "rewards/reward_func/std": 0.31012119187249076, "sampling/importance_sampling_ratio/max": 2.9967217445373535, "sampling/importance_sampling_ratio/mean": 0.9497541189193726, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.55856704711914, "sampling/sampling_logp_difference/mean": 0.2038884311914444, "step": 216, "step_time": 132.75489768222906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4016.0, "completions/mean_length": 1193.171875, "completions/mean_terminated_length": 1125.550048828125, "completions/min_length": 19.0, "completions/min_terminated_length": 395.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6148668229579926, "epoch": 0.5344827586206896, "frac_reward_zero_std": 0.0, "grad_norm": 0.03435292652045712, "kl": 0.010951020522043109, "learning_rate": 4.793364244230818e-05, "loss": 0.11087347567081451, "num_tokens": 32875118.0, "reward": 2.84765625, "reward_std": 2.1037652492523193, "rewards/reward_func/mean": 0.31640625, "rewards/reward_func/std": 0.30274029903941685, "sampling/importance_sampling_ratio/max": 2.9930214881896973, "sampling/importance_sampling_ratio/mean": 0.95209139585495, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.35776710510254, "sampling/sampling_logp_difference/mean": 0.1722993552684784, "step": 217, "step_time": 130.21765533811413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2928.0, "completions/mean_length": 1055.609375, "completions/mean_terminated_length": 1007.4500732421875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5769052356481552, "epoch": 0.5369458128078818, "frac_reward_zero_std": 0.0, "grad_norm": 0.04173595623774424, "kl": 0.010210154112428427, "learning_rate": 4.791428733453195e-05, "loss": -0.30730634927749634, "num_tokens": 33032517.0, "reward": 2.87890625, "reward_std": 1.97101628780365, "rewards/reward_func/mean": 0.3198784722222222, "rewards/reward_func/std": 0.29506540298461914, "sampling/importance_sampling_ratio/max": 2.997896432876587, "sampling/importance_sampling_ratio/mean": 0.9560102820396423, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.76587200164795, "sampling/sampling_logp_difference/mean": 0.1718832403421402, "step": 218, "step_time": 116.42301863082685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2737.0, "completions/mean_length": 1503.078125, "completions/mean_terminated_length": 1176.2037353515625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6850746124982834, "epoch": 0.5394088669950738, "frac_reward_zero_std": 0.0, "grad_norm": 0.02444807124741549, "kl": 0.010135965887457132, "learning_rate": 4.78948459463326e-05, "loss": -0.14834490418434143, "num_tokens": 33224890.0, "reward": 2.6640625, "reward_std": 2.139582872390747, "rewards/reward_func/mean": 0.2960069444444444, "rewards/reward_func/std": 0.31274378465281594, "sampling/importance_sampling_ratio/max": 2.9970359802246094, "sampling/importance_sampling_ratio/mean": 0.9442711472511292, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.665057182312012, "sampling/sampling_logp_difference/mean": 0.2032284438610077, "step": 219, "step_time": 145.91501643997617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2792.0, "completions/mean_length": 1046.734375, "completions/mean_terminated_length": 836.5084838867188, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7115042209625244, "epoch": 0.541871921182266, "frac_reward_zero_std": 0.0, "grad_norm": 0.028863511910483478, "kl": 0.013005719054490328, "learning_rate": 4.7875318350913846e-05, "loss": -0.00012151524424552917, "num_tokens": 33371753.0, "reward": 2.609375, "reward_std": 1.992483377456665, "rewards/reward_func/mean": 0.2899305555555556, "rewards/reward_func/std": 0.29294103052881026, "sampling/importance_sampling_ratio/max": 2.9933483600616455, "sampling/importance_sampling_ratio/mean": 0.9574570655822754, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.14802074432373, "sampling/sampling_logp_difference/mean": 0.19140395522117615, "step": 220, "step_time": 116.08205214515328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3865.0, "completions/mean_length": 1232.171875, "completions/mean_terminated_length": 1030.5535888671875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8280133903026581, "epoch": 0.5443349753694581, "frac_reward_zero_std": 0.0, "grad_norm": 0.026199806644726372, "kl": 0.01261559035629034, "learning_rate": 4.785570462180402e-05, "loss": 0.019907645881175995, "num_tokens": 33533060.0, "reward": 2.4609375, "reward_std": 1.963060975074768, "rewards/reward_func/mean": 0.2734375, "rewards/reward_func/std": 0.29242918226453996, "sampling/importance_sampling_ratio/max": 2.998601198196411, "sampling/importance_sampling_ratio/mean": 0.9539086222648621, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.437432289123535, "sampling/sampling_logp_difference/mean": 0.18835866451263428, "step": 221, "step_time": 124.80898142675869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3916.0, "completions/mean_length": 1169.75, "completions/mean_terminated_length": 1083.3792724609375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6079298108816147, "epoch": 0.5467980295566502, "frac_reward_zero_std": 0.0, "grad_norm": 0.03152604195230071, "kl": 0.009750789031386375, "learning_rate": 4.7836004832855776e-05, "loss": -0.014914639294147491, "num_tokens": 33693908.0, "reward": 2.609375, "reward_std": 2.088742971420288, "rewards/reward_func/mean": 0.2899305555555556, "rewards/reward_func/std": 0.29560703535874683, "sampling/importance_sampling_ratio/max": 2.99847412109375, "sampling/importance_sampling_ratio/mean": 0.9530090093612671, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.749363899230957, "sampling/sampling_logp_difference/mean": 0.17468664050102234, "step": 222, "step_time": 124.77503907005303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 1138.390625, "completions/mean_terminated_length": 942.2203369140625, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7041943818330765, "epoch": 0.5492610837438424, "frac_reward_zero_std": 0.0, "grad_norm": 0.04273914892197363, "kl": 0.014788009924814105, "learning_rate": 4.781621905824579e-05, "loss": -0.1250435709953308, "num_tokens": 33847053.0, "reward": 2.29296875, "reward_std": 1.9667649269104004, "rewards/reward_func/mean": 0.2547743055555556, "rewards/reward_func/std": 0.3062853713830312, "sampling/importance_sampling_ratio/max": 2.9978320598602295, "sampling/importance_sampling_ratio/mean": 0.9523090124130249, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.453618049621582, "sampling/sampling_logp_difference/mean": 0.19299328327178955, "step": 223, "step_time": 127.9603762368206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 1403.65625, "completions/mean_terminated_length": 1173.8302001953125, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "degenerate_groups_filtered": 0.0, "entropy": 0.682645320892334, "epoch": 0.5517241379310345, "frac_reward_zero_std": 0.0, "grad_norm": 0.025751043642452183, "kl": 0.010668770410120487, "learning_rate": 4.779634737247455e-05, "loss": -0.0730874314904213, "num_tokens": 34034903.0, "reward": 2.39453125, "reward_std": 2.0147297382354736, "rewards/reward_func/mean": 0.2660590277777778, "rewards/reward_func/std": 0.29088784919844735, "sampling/importance_sampling_ratio/max": 2.996720314025879, "sampling/importance_sampling_ratio/mean": 0.9444730281829834, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.812371253967285, "sampling/sampling_logp_difference/mean": 0.1935410052537918, "step": 224, "step_time": 138.18764766515233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2829.0, "completions/mean_length": 1212.671875, "completions/mean_terminated_length": 929.30908203125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6753417253494263, "epoch": 0.5541871921182266, "frac_reward_zero_std": 0.0, "grad_norm": 0.02639378345648046, "kl": 0.011769623961299658, "learning_rate": 4.777638985036599e-05, "loss": 0.045241162180900574, "num_tokens": 34184738.0, "reward": 2.78125, "reward_std": 2.071873188018799, "rewards/reward_func/mean": 0.3090277777777778, "rewards/reward_func/std": 0.31061913735336727, "sampling/importance_sampling_ratio/max": 2.9998955726623535, "sampling/importance_sampling_ratio/mean": 0.9576824307441711, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.645200729370117, "sampling/sampling_logp_difference/mean": 0.1704041212797165, "step": 225, "step_time": 113.9789727050811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2964.0, "completions/mean_length": 1230.234375, "completions/mean_terminated_length": 908.1154174804688, "completions/min_length": 111.0, "completions/min_terminated_length": 133.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7707178592681885, "epoch": 0.5566502463054187, "frac_reward_zero_std": 0.0, "grad_norm": 0.02290402359133147, "kl": 0.012144361389800906, "learning_rate": 4.7756346567067255e-05, "loss": -0.03128755837678909, "num_tokens": 34340529.0, "reward": 2.12890625, "reward_std": 1.7978118658065796, "rewards/reward_func/mean": 0.2365451388888889, "rewards/reward_func/std": 0.2605728440814548, "sampling/importance_sampling_ratio/max": 2.9998936653137207, "sampling/importance_sampling_ratio/mean": 0.9504464268684387, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.862293243408203, "sampling/sampling_logp_difference/mean": 0.1982928216457367, "step": 226, "step_time": 133.34942565392703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3998.0, "completions/mean_length": 1224.78125, "completions/mean_terminated_length": 885.7169799804688, "completions/min_length": 124.0, "completions/min_terminated_length": 182.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7116331607103348, "epoch": 0.5591133004926109, "frac_reward_zero_std": 0.0, "grad_norm": 0.025649588255769486, "kl": 0.014744432177394629, "learning_rate": 4.773621759804844e-05, "loss": 0.043492548167705536, "num_tokens": 34497827.0, "reward": 2.24609375, "reward_std": 2.065155029296875, "rewards/reward_func/mean": 0.2495659722222222, "rewards/reward_func/std": 0.2995048099093967, "sampling/importance_sampling_ratio/max": 2.9920711517333984, "sampling/importance_sampling_ratio/mean": 0.9514751434326172, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.58505630493164, "sampling/sampling_logp_difference/mean": 0.18859557807445526, "step": 227, "step_time": 121.2616064096801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3420.0, "completions/mean_length": 1085.0625, "completions/mean_terminated_length": 894.586181640625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7214779406785965, "epoch": 0.5615763546798029, "frac_reward_zero_std": 0.0, "grad_norm": 0.026726141471262007, "kl": 0.014612508239224553, "learning_rate": 4.771600301910224e-05, "loss": 0.21741780638694763, "num_tokens": 34648711.0, "reward": 2.16015625, "reward_std": 1.8335320949554443, "rewards/reward_func/mean": 0.2400173611111111, "rewards/reward_func/std": 0.24923836025926802, "sampling/importance_sampling_ratio/max": 2.992070436477661, "sampling/importance_sampling_ratio/mean": 0.9507383108139038, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.363275527954102, "sampling/sampling_logp_difference/mean": 0.1869376301765442, "step": 228, "step_time": 130.2360584451817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3812.0, "completions/mean_length": 1164.859375, "completions/mean_terminated_length": 960.5689697265625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "degenerate_groups_filtered": 0.0, "entropy": 1.0671870857477188, "epoch": 0.5640394088669951, "frac_reward_zero_std": 0.0, "grad_norm": 0.03701890008995095, "kl": 0.0166468839161098, "learning_rate": 4.769570290634373e-05, "loss": -0.009628377854824066, "num_tokens": 34806910.0, "reward": 2.1484375, "reward_std": 1.821236491203308, "rewards/reward_func/mean": 0.2387152777777778, "rewards/reward_func/std": 0.24402027163240644, "sampling/importance_sampling_ratio/max": 2.9982752799987793, "sampling/importance_sampling_ratio/mean": 0.9430174827575684, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.500931739807129, "sampling/sampling_logp_difference/mean": 0.21461455523967743, "step": 229, "step_time": 124.32920650811866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 1480.171875, "completions/mean_terminated_length": 1200.4259033203125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "degenerate_groups_filtered": 0.0, "entropy": 0.9094728976488113, "epoch": 0.5665024630541872, "frac_reward_zero_std": 0.0, "grad_norm": 0.02566864350514655, "kl": 0.014607452787458897, "learning_rate": 4.767531733621004e-05, "loss": -0.08295504748821259, "num_tokens": 34995209.0, "reward": 2.64453125, "reward_std": 2.0225930213928223, "rewards/reward_func/mean": 0.2938368055555556, "rewards/reward_func/std": 0.31203501257631516, "sampling/importance_sampling_ratio/max": 2.9992480278015137, "sampling/importance_sampling_ratio/mean": 0.9394354820251465, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.749757766723633, "sampling/sampling_logp_difference/mean": 0.2041437029838562, "step": 230, "step_time": 138.9344523921609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3585.0, "completions/mean_length": 1350.859375, "completions/mean_terminated_length": 1118.0179443359375, "completions/min_length": 25.0, "completions/min_terminated_length": 175.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7921342998743057, "epoch": 0.5689655172413793, "frac_reward_zero_std": 0.0, "grad_norm": 0.02868100953759027, "kl": 0.01775623788125813, "learning_rate": 4.765484638546005e-05, "loss": 0.0232635997235775, "num_tokens": 35163824.0, "reward": 2.20703125, "reward_std": 1.9047532081604004, "rewards/reward_func/mean": 0.24522569444444445, "rewards/reward_func/std": 0.24406374990940094, "sampling/importance_sampling_ratio/max": 2.998385429382324, "sampling/importance_sampling_ratio/mean": 0.9466791152954102, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.723865509033203, "sampling/sampling_logp_difference/mean": 0.18429672718048096, "step": 231, "step_time": 120.73535452294163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3728.0, "completions/mean_length": 1476.53125, "completions/mean_terminated_length": 1130.7037353515625, "completions/min_length": 5.0, "completions/min_terminated_length": 206.0, "degenerate_groups_filtered": 0.0, "entropy": 0.9682776778936386, "epoch": 0.5714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.03712338663506699, "kl": 0.019314537523314357, "learning_rate": 4.7634290131174184e-05, "loss": -0.14410394430160522, "num_tokens": 35353410.0, "reward": 2.4140625, "reward_std": 1.8463921546936035, "rewards/reward_func/mean": 0.2682291666666667, "rewards/reward_func/std": 0.28911194536421037, "sampling/importance_sampling_ratio/max": 2.9995596408843994, "sampling/importance_sampling_ratio/mean": 0.9306811094284058, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.025599479675293, "sampling/sampling_logp_difference/mean": 0.23162716627120972, "step": 232, "step_time": 133.49474174529314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3336.0, "completions/mean_length": 1594.9375, "completions/mean_terminated_length": 1036.5870361328125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7518228590488434, "epoch": 0.5738916256157636, "frac_reward_zero_std": 0.0, "grad_norm": 0.02178047416680768, "kl": 0.013470216421410441, "learning_rate": 4.761364865075402e-05, "loss": -0.1526617705821991, "num_tokens": 35532574.0, "reward": 2.54296875, "reward_std": 2.0180509090423584, "rewards/reward_func/mean": 0.2825520833333333, "rewards/reward_func/std": 0.30297022809584934, "sampling/importance_sampling_ratio/max": 2.984797954559326, "sampling/importance_sampling_ratio/mean": 0.9513339996337891, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.608736038208008, "sampling/sampling_logp_difference/mean": 0.17860578000545502, "step": 233, "step_time": 122.31598937511444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2172.0, "completions/mean_length": 1319.5625, "completions/mean_terminated_length": 922.3077392578125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "degenerate_groups_filtered": 0.0, "entropy": 1.0021032392978668, "epoch": 0.5763546798029556, "frac_reward_zero_std": 0.25, "grad_norm": 0.03166418186559143, "kl": 0.024833133444190025, "learning_rate": 4.7592922021922056e-05, "loss": -0.15460214018821716, "num_tokens": 35722898.0, "reward": 2.0625, "reward_std": 1.8126540184020996, "rewards/reward_func/mean": 0.22916666666666666, "rewards/reward_func/std": 0.2917405896716648, "sampling/importance_sampling_ratio/max": 2.996793031692505, "sampling/importance_sampling_ratio/mean": 0.9255531430244446, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.874739646911621, "sampling/sampling_logp_difference/mean": 0.23875398933887482, "step": 234, "step_time": 148.48599314992316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 1231.4375, "completions/mean_terminated_length": 811.0980834960938, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7249269187450409, "epoch": 0.5788177339901478, "frac_reward_zero_std": 0.0, "grad_norm": 0.05201097230500609, "kl": 0.014844065997749567, "learning_rate": 4.757211032272141e-05, "loss": -0.2842212915420532, "num_tokens": 35879470.0, "reward": 2.28125, "reward_std": 1.9783049821853638, "rewards/reward_func/mean": 0.2534722222222222, "rewards/reward_func/std": 0.2845391564899021, "sampling/importance_sampling_ratio/max": 2.9987900257110596, "sampling/importance_sampling_ratio/mean": 0.9521608948707581, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.4357271194458, "sampling/sampling_logp_difference/mean": 0.1780111938714981, "step": 235, "step_time": 122.01890759728849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 4096.0, "completions/max_terminated_length": 3650.0, "completions/mean_length": 1525.09375, "completions/mean_terminated_length": 986.375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8893266320228577, "epoch": 0.5812807881773399, "frac_reward_zero_std": 0.0, "grad_norm": 0.02471245037776744, "kl": 0.01480063796043396, "learning_rate": 4.75512136315155e-05, "loss": -0.15336401760578156, "num_tokens": 36060756.0, "reward": 2.05859375, "reward_std": 1.8020832538604736, "rewards/reward_func/mean": 0.2287326388888889, "rewards/reward_func/std": 0.2694326473606957, "sampling/importance_sampling_ratio/max": 2.999886989593506, "sampling/importance_sampling_ratio/mean": 0.9345108270645142, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.334660530090332, "sampling/sampling_logp_difference/mean": 0.22046810388565063, "step": 236, "step_time": 131.76243212609552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2817.0, "completions/mean_length": 1175.203125, "completions/mean_terminated_length": 836.1111450195312, "completions/min_length": 108.0, "completions/min_terminated_length": 245.0, "degenerate_groups_filtered": 0.0, "entropy": 0.784490168094635, "epoch": 0.583743842364532, "frac_reward_zero_std": 0.0, "grad_norm": 0.02574867287046673, "kl": 0.013174011837691069, "learning_rate": 4.7530232026987807e-05, "loss": -0.061145804822444916, "num_tokens": 36226257.0, "reward": 2.12109375, "reward_std": 1.8121023178100586, "rewards/reward_func/mean": 0.23567708333333334, "rewards/reward_func/std": 0.26693976587719387, "sampling/importance_sampling_ratio/max": 2.9982166290283203, "sampling/importance_sampling_ratio/mean": 0.9447356462478638, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.428176879882812, "sampling/sampling_logp_difference/mean": 0.20164385437965393, "step": 237, "step_time": 128.0499583010096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 546.53125, "completions/mean_terminated_length": 437.35003662109375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6676979809999466, "epoch": 0.5862068965517241, "frac_reward_zero_std": 0.0, "grad_norm": 0.0501059794602117, "kl": 0.01879991707392037, "learning_rate": 4.75091655881415e-05, "loss": 0.018436290323734283, "num_tokens": 36337075.0, "reward": 2.546875, "reward_std": 1.9879971742630005, "rewards/reward_func/mean": 0.2829861111111111, "rewards/reward_func/std": 0.29805727965301937, "sampling/importance_sampling_ratio/max": 2.9939706325531006, "sampling/importance_sampling_ratio/mean": 0.9689080715179443, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.713134765625, "sampling/sampling_logp_difference/mean": 0.1616547852754593, "step": 238, "step_time": 106.62005894491449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2370.0, "completions/mean_length": 1046.140625, "completions/mean_terminated_length": 902.3500366210938, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7040528655052185, "epoch": 0.5886699507389163, "frac_reward_zero_std": 0.0, "grad_norm": 0.031163817812319954, "kl": 0.013875756645575166, "learning_rate": 4.7488014394299205e-05, "loss": -0.06191133335232735, "num_tokens": 36488556.0, "reward": 2.3515625, "reward_std": 1.8996233940124512, "rewards/reward_func/mean": 0.2612847222222222, "rewards/reward_func/std": 0.26455983685122597, "sampling/importance_sampling_ratio/max": 2.997973680496216, "sampling/importance_sampling_ratio/mean": 0.9539923667907715, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.80900764465332, "sampling/sampling_logp_difference/mean": 0.18930600583553314, "step": 239, "step_time": 122.02802962902933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3646.0, "completions/mean_length": 1437.96875, "completions/mean_terminated_length": 1307.245849609375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6821627020835876, "epoch": 0.5911330049261084, "frac_reward_zero_std": 0.0, "grad_norm": 0.022492803254900707, "kl": 0.008037951542064548, "learning_rate": 4.746677852510267e-05, "loss": -0.048022910952568054, "num_tokens": 36680714.0, "reward": 1.7890625, "reward_std": 1.6373727321624756, "rewards/reward_func/mean": 0.1987847222222222, "rewards/reward_func/std": 0.22592765589555105, "sampling/importance_sampling_ratio/max": 2.9999842643737793, "sampling/importance_sampling_ratio/mean": 0.9468995332717896, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.066885948181152, "sampling/sampling_logp_difference/mean": 0.1964116394519806, "step": 240, "step_time": 152.4113609350752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3714.0, "completions/mean_length": 1449.25, "completions/mean_terminated_length": 1312.7857666015625, "completions/min_length": 30.0, "completions/min_terminated_length": 420.0, "degenerate_groups_filtered": 0.0, "entropy": 0.9211445450782776, "epoch": 0.5935960591133005, "frac_reward_zero_std": 0.0, "grad_norm": 0.025380578247890306, "kl": 0.01382395182736218, "learning_rate": 4.7445458060512484e-05, "loss": -0.0691765695810318, "num_tokens": 36867578.0, "reward": 2.08984375, "reward_std": 1.7980186939239502, "rewards/reward_func/mean": 0.2322048611111111, "rewards/reward_func/std": 0.2627977761957381, "sampling/importance_sampling_ratio/max": 2.9993677139282227, "sampling/importance_sampling_ratio/mean": 0.9361419081687927, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.936524391174316, "sampling/sampling_logp_difference/mean": 0.22960931062698364, "step": 241, "step_time": 131.3438694481738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3204.0, "completions/mean_length": 1209.828125, "completions/mean_terminated_length": 985.0, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6995112746953964, "epoch": 0.5960591133004927, "frac_reward_zero_std": 0.0, "grad_norm": 0.025394328117354086, "kl": 0.01263234089128673, "learning_rate": 4.742405308080775e-05, "loss": 0.00674794428050518, "num_tokens": 37031359.0, "reward": 2.3984375, "reward_std": 1.963818907737732, "rewards/reward_func/mean": 0.2664930555555556, "rewards/reward_func/std": 0.26679257882965934, "sampling/importance_sampling_ratio/max": 2.996161460876465, "sampling/importance_sampling_ratio/mean": 0.9531453847885132, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.238762855529785, "sampling/sampling_logp_difference/mean": 0.19814015924930573, "step": 242, "step_time": 123.06549433688633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3877.0, "completions/mean_length": 1199.21875, "completions/mean_terminated_length": 1051.3966064453125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6999485194683075, "epoch": 0.5985221674876847, "frac_reward_zero_std": 0.0, "grad_norm": 0.02976865624663422, "kl": 0.01193548133596778, "learning_rate": 4.7402563666585817e-05, "loss": -0.22917920351028442, "num_tokens": 37188493.0, "reward": 2.375, "reward_std": 1.9441609382629395, "rewards/reward_func/mean": 0.2638888888888889, "rewards/reward_func/std": 0.26875759495629203, "sampling/importance_sampling_ratio/max": 2.998791456222534, "sampling/importance_sampling_ratio/mean": 0.9585366249084473, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.101183891296387, "sampling/sampling_logp_difference/mean": 0.1771034300327301, "step": 243, "step_time": 121.21322968695313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3275.0, "completions/mean_length": 1567.84375, "completions/mean_terminated_length": 1362.847412109375, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6863079965114594, "epoch": 0.6009852216748769, "frac_reward_zero_std": 0.0, "grad_norm": 0.030290685076199052, "kl": 0.011284970445558429, "learning_rate": 4.7380989898761957e-05, "loss": -0.19699159264564514, "num_tokens": 37381203.0, "reward": 2.5078125, "reward_std": 1.9256871938705444, "rewards/reward_func/mean": 0.2786458333333333, "rewards/reward_func/std": 0.27923353181944954, "sampling/importance_sampling_ratio/max": 2.999513626098633, "sampling/importance_sampling_ratio/mean": 0.9431895017623901, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.869531631469727, "sampling/sampling_logp_difference/mean": 0.20835289359092712, "step": 244, "step_time": 125.42915512691252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3183.0, "completions/mean_length": 1144.96875, "completions/mean_terminated_length": 1049.774169921875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7077446132898331, "epoch": 0.603448275862069, "frac_reward_zero_std": 0.0, "grad_norm": 0.033618691580459464, "kl": 0.011604496976360679, "learning_rate": 4.735933185856906e-05, "loss": 0.008934096433222294, "num_tokens": 37545697.0, "reward": 2.09765625, "reward_std": 1.843380093574524, "rewards/reward_func/mean": 0.23307291666666666, "rewards/reward_func/std": 0.265490311715338, "sampling/importance_sampling_ratio/max": 2.9983954429626465, "sampling/importance_sampling_ratio/mean": 0.9419179558753967, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.498157501220703, "sampling/sampling_logp_difference/mean": 0.2204725742340088, "step": 245, "step_time": 136.99598171422258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2331.0, "completions/mean_length": 991.5, "completions/mean_terminated_length": 879.0655517578125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5730465948581696, "epoch": 0.6059113300492611, "frac_reward_zero_std": 0.0, "grad_norm": 0.03329989820722053, "kl": 0.013264509849250317, "learning_rate": 4.733758962755734e-05, "loss": -0.11232152581214905, "num_tokens": 37694305.0, "reward": 2.38671875, "reward_std": 1.932352900505066, "rewards/reward_func/mean": 0.2651909722222222, "rewards/reward_func/std": 0.27600304451253677, "sampling/importance_sampling_ratio/max": 2.995966672897339, "sampling/importance_sampling_ratio/mean": 0.9627119898796082, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.523571014404297, "sampling/sampling_logp_difference/mean": 0.16919061541557312, "step": 246, "step_time": 123.56446173996665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 914.9375, "completions/mean_terminated_length": 812.3225708007812, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6465972661972046, "epoch": 0.6083743842364532, "frac_reward_zero_std": 0.0, "grad_norm": 0.034436712311394085, "kl": 0.015001513063907623, "learning_rate": 4.7315763287594e-05, "loss": -0.0005866400897502899, "num_tokens": 37850461.0, "reward": 2.65234375, "reward_std": 2.0161142349243164, "rewards/reward_func/mean": 0.2947048611111111, "rewards/reward_func/std": 0.26663076298104393, "sampling/importance_sampling_ratio/max": 2.998875617980957, "sampling/importance_sampling_ratio/mean": 0.9551164507865906, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.468926429748535, "sampling/sampling_logp_difference/mean": 0.19288510084152222, "step": 247, "step_time": 123.84648331883363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3604.0, "completions/mean_length": 983.296875, "completions/mean_terminated_length": 829.7000732421875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "degenerate_groups_filtered": 0.0, "entropy": 1.0105192512273788, "epoch": 0.6108374384236454, "frac_reward_zero_std": 0.0, "grad_norm": 0.08148400090622203, "kl": 0.037221905775368214, "learning_rate": 4.729385292086297e-05, "loss": 0.051975756883621216, "num_tokens": 37998112.0, "reward": 2.69921875, "reward_std": 1.9821523427963257, "rewards/reward_func/mean": 0.2999131944444444, "rewards/reward_func/std": 0.28208497166633606, "sampling/importance_sampling_ratio/max": 2.9968528747558594, "sampling/importance_sampling_ratio/mean": 0.9450979828834534, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.799301147460938, "sampling/sampling_logp_difference/mean": 0.23306839168071747, "step": 248, "step_time": 129.30707350256853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3543.0, "completions/mean_length": 791.265625, "completions/mean_terminated_length": 638.933349609375, "completions/min_length": 17.0, "completions/min_terminated_length": 62.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5722888112068176, "epoch": 0.6133004926108374, "frac_reward_zero_std": 0.0, "grad_norm": 0.030916969248751317, "kl": 0.015809379750862718, "learning_rate": 4.727185860986454e-05, "loss": 0.04632103815674782, "num_tokens": 38127633.0, "reward": 2.72265625, "reward_std": 2.0442123413085938, "rewards/reward_func/mean": 0.3025173611111111, "rewards/reward_func/std": 0.2441907309823566, "sampling/importance_sampling_ratio/max": 2.998241662979126, "sampling/importance_sampling_ratio/mean": 0.9689180850982666, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.026711463928223, "sampling/sampling_logp_difference/mean": 0.1515887975692749, "step": 249, "step_time": 113.22302321530879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3543.0, "completions/mean_length": 1359.890625, "completions/mean_terminated_length": 1087.5535888671875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8017663061618805, "epoch": 0.6157635467980296, "frac_reward_zero_std": 0.0, "grad_norm": 0.02586699657570665, "kl": 0.017222094582393765, "learning_rate": 4.72497804374151e-05, "loss": -0.10366284102201462, "num_tokens": 38302186.0, "reward": 2.30078125, "reward_std": 2.0010809898376465, "rewards/reward_func/mean": 0.2556423611111111, "rewards/reward_func/std": 0.27661293579472435, "sampling/importance_sampling_ratio/max": 2.9981367588043213, "sampling/importance_sampling_ratio/mean": 0.9457881450653076, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.835844993591309, "sampling/sampling_logp_difference/mean": 0.20591725409030914, "step": 250, "step_time": 135.33297077706084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3808.0, "completions/max_terminated_length": 2813.0, "completions/mean_length": 910.21875, "completions/mean_terminated_length": 848.475341796875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6596356332302094, "epoch": 0.6182266009852216, "frac_reward_zero_std": 0.0, "grad_norm": 0.030518494632083264, "kl": 0.012338304659351707, "learning_rate": 4.722761848664681e-05, "loss": -0.055076971650123596, "num_tokens": 38460520.0, "reward": 2.46875, "reward_std": 2.0263638496398926, "rewards/reward_func/mean": 0.2743055555555556, "rewards/reward_func/std": 0.2768581277794308, "sampling/importance_sampling_ratio/max": 2.9986255168914795, "sampling/importance_sampling_ratio/mean": 0.9542578458786011, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.350125312805176, "sampling/sampling_logp_difference/mean": 0.1824713945388794, "step": 251, "step_time": 139.19111042865552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3175.0, "completions/mean_length": 1223.640625, "completions/mean_terminated_length": 1021.3728637695312, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7649645656347275, "epoch": 0.6206896551724138, "frac_reward_zero_std": 0.0, "grad_norm": 0.02785374223801967, "kl": 0.012772297719493508, "learning_rate": 4.720537284100728e-05, "loss": -0.03364788740873337, "num_tokens": 38633521.0, "reward": 2.03515625, "reward_std": 1.8114862442016602, "rewards/reward_func/mean": 0.2261284722222222, "rewards/reward_func/std": 0.2559473647011651, "sampling/importance_sampling_ratio/max": 2.999694585800171, "sampling/importance_sampling_ratio/mean": 0.9402635097503662, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.124222755432129, "sampling/sampling_logp_difference/mean": 0.22256529331207275, "step": 252, "step_time": 132.60145464190282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2974.0, "completions/mean_length": 1248.203125, "completions/mean_terminated_length": 964.6326293945312, "completions/min_length": 52.0, "completions/min_terminated_length": 300.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7512803375720978, "epoch": 0.6231527093596059, "frac_reward_zero_std": 0.0, "grad_norm": 0.029908418297886107, "kl": 0.01257993234321475, "learning_rate": 4.7183043584259254e-05, "loss": -0.06793683767318726, "num_tokens": 38806558.0, "reward": 2.44140625, "reward_std": 2.0023508071899414, "rewards/reward_func/mean": 0.2712673611111111, "rewards/reward_func/std": 0.30397861699263257, "sampling/importance_sampling_ratio/max": 2.9966394901275635, "sampling/importance_sampling_ratio/mean": 0.9430099129676819, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.804689407348633, "sampling/sampling_logp_difference/mean": 0.21144166588783264, "step": 253, "step_time": 136.72357785212807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 875.71875, "completions/mean_terminated_length": 853.2540283203125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7574752420186996, "epoch": 0.625615763546798, "frac_reward_zero_std": 0.0, "grad_norm": 0.04423207121474008, "kl": 0.014196397736668587, "learning_rate": 4.716063080048031e-05, "loss": 0.006076548248529434, "num_tokens": 38944860.0, "reward": 2.1484375, "reward_std": 1.909520149230957, "rewards/reward_func/mean": 0.2387152777777778, "rewards/reward_func/std": 0.24252891540527344, "sampling/importance_sampling_ratio/max": 2.997382640838623, "sampling/importance_sampling_ratio/mean": 0.9493823051452637, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.605960845947266, "sampling/sampling_logp_difference/mean": 0.20002678036689758, "step": 254, "step_time": 79.58881280198693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3342.0, "completions/mean_length": 1386.921875, "completions/mean_terminated_length": 1090.717041015625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7339225560426712, "epoch": 0.6280788177339901, "frac_reward_zero_std": 0.0, "grad_norm": 0.025666617853648467, "kl": 0.014441218925639987, "learning_rate": 4.713813457406253e-05, "loss": -0.04929598048329353, "num_tokens": 39111399.0, "reward": 2.23046875, "reward_std": 1.8941768407821655, "rewards/reward_func/mean": 0.2478298611111111, "rewards/reward_func/std": 0.27593246433469987, "sampling/importance_sampling_ratio/max": 2.995208740234375, "sampling/importance_sampling_ratio/mean": 0.9497035145759583, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.89775276184082, "sampling/sampling_logp_difference/mean": 0.18674036860466003, "step": 255, "step_time": 151.48046739259735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3144.0, "completions/mean_length": 1153.453125, "completions/mean_terminated_length": 942.5714721679688, "completions/min_length": 41.0, "completions/min_terminated_length": 297.0, "degenerate_groups_filtered": 0.0, "entropy": 0.714541032910347, "epoch": 0.6305418719211823, "frac_reward_zero_std": 0.0, "grad_norm": 0.030321664334556155, "kl": 0.014391326112672687, "learning_rate": 4.7115554989712185e-05, "loss": -0.045883383601903915, "num_tokens": 39269524.0, "reward": 2.265625, "reward_std": 2.0629358291625977, "rewards/reward_func/mean": 0.2517361111111111, "rewards/reward_func/std": 0.28038328223758274, "sampling/importance_sampling_ratio/max": 2.999256134033203, "sampling/importance_sampling_ratio/mean": 0.9502524733543396, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.648883819580078, "sampling/sampling_logp_difference/mean": 0.18797728419303894, "step": 256, "step_time": 131.94062806293368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3354.0, "completions/mean_length": 1080.34375, "completions/mean_terminated_length": 838.2982788085938, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7863501012325287, "epoch": 0.6330049261083743, "frac_reward_zero_std": 0.0, "grad_norm": 0.028309075765895596, "kl": 0.01342243468388915, "learning_rate": 4.709289213244943e-05, "loss": 0.013263358734548092, "num_tokens": 39425706.0, "reward": 2.44921875, "reward_std": 1.9159832000732422, "rewards/reward_func/mean": 0.2721354166666667, "rewards/reward_func/std": 0.2653440617852741, "sampling/importance_sampling_ratio/max": 2.9991848468780518, "sampling/importance_sampling_ratio/mean": 0.9477356672286987, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.094436645507812, "sampling/sampling_logp_difference/mean": 0.20154553651809692, "step": 257, "step_time": 125.2526604111772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3864.0, "completions/mean_length": 1014.109375, "completions/mean_terminated_length": 787.0000610351562, "completions/min_length": 196.0, "completions/min_terminated_length": 300.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6339272111654282, "epoch": 0.6354679802955665, "frac_reward_zero_std": 0.0, "grad_norm": 0.03390349756126262, "kl": 0.011190615594387054, "learning_rate": 4.707014608760797e-05, "loss": -0.3546624779701233, "num_tokens": 39575569.0, "reward": 2.55078125, "reward_std": 2.0683650970458984, "rewards/reward_func/mean": 0.2834201388888889, "rewards/reward_func/std": 0.2763279626766841, "sampling/importance_sampling_ratio/max": 2.999453544616699, "sampling/importance_sampling_ratio/mean": 0.9555752277374268, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.49900245666504, "sampling/sampling_logp_difference/mean": 0.17099082469940186, "step": 258, "step_time": 116.94798772898503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 1224.28125, "completions/mean_terminated_length": 854.0925903320312, "completions/min_length": 172.0, "completions/min_terminated_length": 270.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6828510016202927, "epoch": 0.6379310344827587, "frac_reward_zero_std": 0.0, "grad_norm": 0.03676887179465974, "kl": 0.014392233453691006, "learning_rate": 4.704731694083472e-05, "loss": -0.25384753942489624, "num_tokens": 39742115.0, "reward": 2.53515625, "reward_std": 2.008843183517456, "rewards/reward_func/mean": 0.2816840277777778, "rewards/reward_func/std": 0.2835590210225847, "sampling/importance_sampling_ratio/max": 2.9942209720611572, "sampling/importance_sampling_ratio/mean": 0.9506525993347168, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.997421264648438, "sampling/sampling_logp_difference/mean": 0.18737658858299255, "step": 259, "step_time": 125.44051960692741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 1903.0, "completions/mean_length": 1202.4375, "completions/mean_terminated_length": 796.0, "completions/min_length": 157.0, "completions/min_terminated_length": 268.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5874212980270386, "epoch": 0.6403940886699507, "frac_reward_zero_std": 0.0, "grad_norm": 0.027498195369884865, "kl": 0.012420050567016006, "learning_rate": 4.7024404778089535e-05, "loss": -0.21603678166866302, "num_tokens": 39904079.0, "reward": 2.3828125, "reward_std": 1.9220776557922363, "rewards/reward_func/mean": 0.2647569444444444, "rewards/reward_func/std": 0.28968556804789436, "sampling/importance_sampling_ratio/max": 2.998352527618408, "sampling/importance_sampling_ratio/mean": 0.9588257074356079, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.315893173217773, "sampling/sampling_logp_difference/mean": 0.16551700234413147, "step": 260, "step_time": 125.71782796108164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2444.0, "completions/mean_length": 1213.484375, "completions/mean_terminated_length": 995.732177734375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6492855101823807, "epoch": 0.6428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.030845034510191837, "kl": 0.012909290380775928, "learning_rate": 4.7001409685644824e-05, "loss": -0.0866418108344078, "num_tokens": 40061374.0, "reward": 2.8125, "reward_std": 2.1025304794311523, "rewards/reward_func/mean": 0.3125, "rewards/reward_func/std": 0.2829435136583116, "sampling/importance_sampling_ratio/max": 2.9994871616363525, "sampling/importance_sampling_ratio/mean": 0.9565799236297607, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.684528350830078, "sampling/sampling_logp_difference/mean": 0.1711580604314804, "step": 261, "step_time": 120.31280164211057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 1554.015625, "completions/mean_terminated_length": 1114.9599609375, "completions/min_length": 105.0, "completions/min_terminated_length": 249.0, "degenerate_groups_filtered": 0.0, "entropy": 0.762213408946991, "epoch": 0.645320197044335, "frac_reward_zero_std": 0.0, "grad_norm": 0.02861600425215611, "kl": 0.013627588748931885, "learning_rate": 4.697833175008528e-05, "loss": -0.196628600358963, "num_tokens": 40254783.0, "reward": 2.6953125, "reward_std": 2.161839246749878, "rewards/reward_func/mean": 0.2994791666666667, "rewards/reward_func/std": 0.3182392368714015, "sampling/importance_sampling_ratio/max": 2.998302459716797, "sampling/importance_sampling_ratio/mean": 0.9433404207229614, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.0623197555542, "sampling/sampling_logp_difference/mean": 0.21327118575572968, "step": 262, "step_time": 134.2328903088346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3884.0, "completions/mean_length": 1729.53125, "completions/mean_terminated_length": 1249.3199462890625, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7841713130474091, "epoch": 0.6477832512315271, "frac_reward_zero_std": 0.0, "grad_norm": 0.0196557696508857, "kl": 0.011178731918334961, "learning_rate": 4.695517105830752e-05, "loss": -0.14888839423656464, "num_tokens": 40469041.0, "reward": 1.72265625, "reward_std": 1.6511883735656738, "rewards/reward_func/mean": 0.19140625, "rewards/reward_func/std": 0.23754462434185875, "sampling/importance_sampling_ratio/max": 2.996554136276245, "sampling/importance_sampling_ratio/mean": 0.9346861839294434, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.43683910369873, "sampling/sampling_logp_difference/mean": 0.23255938291549683, "step": 263, "step_time": 136.18409524089657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4028.0, "completions/mean_length": 1203.421875, "completions/mean_terminated_length": 968.5438842773438, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6230374127626419, "epoch": 0.6502463054187192, "frac_reward_zero_std": 0.0, "grad_norm": 0.02448773823354456, "kl": 0.011263687629252672, "learning_rate": 4.6931927697519764e-05, "loss": 0.030414976179599762, "num_tokens": 40630492.0, "reward": 2.77734375, "reward_std": 2.0567915439605713, "rewards/reward_func/mean": 0.30859375, "rewards/reward_func/std": 0.2995274480846193, "sampling/importance_sampling_ratio/max": 2.9931156635284424, "sampling/importance_sampling_ratio/mean": 0.9626650810241699, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.868885040283203, "sampling/sampling_logp_difference/mean": 0.16547000408172607, "step": 264, "step_time": 127.37039630208164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2207.0, "completions/mean_length": 991.0, "completions/mean_terminated_length": 890.8386840820312, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6507675051689148, "epoch": 0.6527093596059114, "frac_reward_zero_std": 0.0, "grad_norm": 0.03418920980780656, "kl": 0.013704233802855015, "learning_rate": 4.690860175524151e-05, "loss": 0.01976613700389862, "num_tokens": 40774092.0, "reward": 2.75390625, "reward_std": 2.040508508682251, "rewards/reward_func/mean": 0.3059895833333333, "rewards/reward_func/std": 0.2572001852095127, "sampling/importance_sampling_ratio/max": 2.9980764389038086, "sampling/importance_sampling_ratio/mean": 0.9588420987129211, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.374872207641602, "sampling/sampling_logp_difference/mean": 0.1754309982061386, "step": 265, "step_time": 109.63442648178898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3475.0, "completions/mean_length": 1531.546875, "completions/mean_terminated_length": 1328.73681640625, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6632710099220276, "epoch": 0.6551724137931034, "frac_reward_zero_std": 0.0, "grad_norm": 0.027348136955904885, "kl": 0.010996296303346753, "learning_rate": 4.688519331930321e-05, "loss": -0.09692012518644333, "num_tokens": 40972591.0, "reward": 2.63671875, "reward_std": 1.9745049476623535, "rewards/reward_func/mean": 0.29296875, "rewards/reward_func/std": 0.3187914424472385, "sampling/importance_sampling_ratio/max": 2.999628782272339, "sampling/importance_sampling_ratio/mean": 0.9419499039649963, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.199172973632812, "sampling/sampling_logp_difference/mean": 0.20728717744350433, "step": 266, "step_time": 144.9105388901662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3919.0, "completions/mean_length": 1135.625, "completions/mean_terminated_length": 990.8448486328125, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6764895170927048, "epoch": 0.6576354679802956, "frac_reward_zero_std": 0.0, "grad_norm": 0.062197272878801405, "kl": 0.0179288643412292, "learning_rate": 4.6861702477845924e-05, "loss": -0.0492466576397419, "num_tokens": 41136183.0, "reward": 3.02734375, "reward_std": 2.0339953899383545, "rewards/reward_func/mean": 0.3363715277777778, "rewards/reward_func/std": 0.30535148746437496, "sampling/importance_sampling_ratio/max": 2.9956681728363037, "sampling/importance_sampling_ratio/mean": 0.9513673186302185, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.910762786865234, "sampling/sampling_logp_difference/mean": 0.19355283677577972, "step": 267, "step_time": 124.74912125384435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2625.0, "completions/mean_length": 1603.984375, "completions/mean_terminated_length": 1274.4814453125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8284541666507721, "epoch": 0.6600985221674877, "frac_reward_zero_std": 0.0, "grad_norm": 0.024391601572451942, "kl": 0.01094577624462545, "learning_rate": 4.683812931932103e-05, "loss": -0.14053800702095032, "num_tokens": 41326118.0, "reward": 2.5390625, "reward_std": 2.0573227405548096, "rewards/reward_func/mean": 0.2821180555555556, "rewards/reward_func/std": 0.3195192499293221, "sampling/importance_sampling_ratio/max": 2.9987871646881104, "sampling/importance_sampling_ratio/mean": 0.9462224245071411, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.359637260437012, "sampling/sampling_logp_difference/mean": 0.2225756049156189, "step": 268, "step_time": 131.93645947403274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3042.0, "completions/mean_length": 1036.296875, "completions/mean_terminated_length": 909.7167358398438, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7021192014217377, "epoch": 0.6625615763546798, "frac_reward_zero_std": 0.0, "grad_norm": 0.03431428627892943, "kl": 0.014673095429316163, "learning_rate": 4.681447393248981e-05, "loss": -0.139969140291214, "num_tokens": 41490665.0, "reward": 2.41796875, "reward_std": 1.9267616271972656, "rewards/reward_func/mean": 0.2686631944444444, "rewards/reward_func/std": 0.26451122760772705, "sampling/importance_sampling_ratio/max": 2.997307777404785, "sampling/importance_sampling_ratio/mean": 0.9461147785186768, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.435528755187988, "sampling/sampling_logp_difference/mean": 0.21243739128112793, "step": 269, "step_time": 117.78038617805578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2876.0, "completions/mean_length": 1038.625, "completions/mean_terminated_length": 1004.258056640625, "completions/min_length": 112.0, "completions/min_terminated_length": 160.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7639897465705872, "epoch": 0.6650246305418719, "frac_reward_zero_std": 0.0, "grad_norm": 0.048844942744014554, "kl": 0.02224674168974161, "learning_rate": 4.679073640642321e-05, "loss": 0.09145447611808777, "num_tokens": 41651041.0, "reward": 2.390625, "reward_std": 1.9121971130371094, "rewards/reward_func/mean": 0.265625, "rewards/reward_func/std": 0.2759961535533269, "sampling/importance_sampling_ratio/max": 2.995248794555664, "sampling/importance_sampling_ratio/mean": 0.9422423243522644, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.115524291992188, "sampling/sampling_logp_difference/mean": 0.22200405597686768, "step": 270, "step_time": 124.16956816823222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4085.0, "completions/mean_length": 1330.640625, "completions/mean_terminated_length": 1158.2105712890625, "completions/min_length": 123.0, "completions/min_terminated_length": 297.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6563108712434769, "epoch": 0.6674876847290641, "frac_reward_zero_std": 0.0, "grad_norm": 0.029884683271604665, "kl": 0.011029968969523907, "learning_rate": 4.676691683050142e-05, "loss": -0.035733554512262344, "num_tokens": 41827914.0, "reward": 2.6953125, "reward_std": 2.044868230819702, "rewards/reward_func/mean": 0.2994791666666667, "rewards/reward_func/std": 0.2951077421506246, "sampling/importance_sampling_ratio/max": 2.99714732170105, "sampling/importance_sampling_ratio/mean": 0.946255624294281, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.342223167419434, "sampling/sampling_logp_difference/mean": 0.20004363358020782, "step": 271, "step_time": 140.53300575097091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3023.0, "completions/max_terminated_length": 3023.0, "completions/mean_length": 1266.453125, "completions/mean_terminated_length": 1266.453125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6599143147468567, "epoch": 0.6699507389162561, "frac_reward_zero_std": 0.0, "grad_norm": 0.034345400977235964, "kl": 0.009904107078909874, "learning_rate": 4.6743015294413606e-05, "loss": 0.11671273410320282, "num_tokens": 41992791.0, "reward": 3.0, "reward_std": 1.9760470390319824, "rewards/reward_func/mean": 0.3333333333333333, "rewards/reward_func/std": 0.26781286464797127, "sampling/importance_sampling_ratio/max": 2.9989171028137207, "sampling/importance_sampling_ratio/mean": 0.9505242109298706, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.894290924072266, "sampling/sampling_logp_difference/mean": 0.19252155721187592, "step": 272, "step_time": 90.05021728761494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3122.0, "completions/mean_length": 1176.484375, "completions/mean_terminated_length": 1135.274169921875, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6994865089654922, "epoch": 0.6724137931034483, "frac_reward_zero_std": 0.0, "grad_norm": 0.03406543051003686, "kl": 0.012832643231377006, "learning_rate": 4.671903188815754e-05, "loss": 0.022422391921281815, "num_tokens": 42152934.0, "reward": 2.3203125, "reward_std": 1.9223357439041138, "rewards/reward_func/mean": 0.2578125, "rewards/reward_func/std": 0.2667807986338933, "sampling/importance_sampling_ratio/max": 2.9991848468780518, "sampling/importance_sampling_ratio/mean": 0.9485845565795898, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.556821823120117, "sampling/sampling_logp_difference/mean": 0.197273850440979, "step": 273, "step_time": 139.07146079605445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3412.0, "completions/mean_length": 1000.625, "completions/mean_terminated_length": 837.5667114257812, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6011870503425598, "epoch": 0.6748768472906403, "frac_reward_zero_std": 0.0, "grad_norm": 0.03347541091546316, "kl": 0.013716504909098148, "learning_rate": 4.6694966702039236e-05, "loss": 0.0027394089847803116, "num_tokens": 42302046.0, "reward": 2.984375, "reward_std": 2.0103280544281006, "rewards/reward_func/mean": 0.3315972222222222, "rewards/reward_func/std": 0.26190561801195145, "sampling/importance_sampling_ratio/max": 2.9948132038116455, "sampling/importance_sampling_ratio/mean": 0.9589996933937073, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.810711860656738, "sampling/sampling_logp_difference/mean": 0.16590197384357452, "step": 274, "step_time": 120.98849517386407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2754.0, "completions/mean_length": 1007.5625, "completions/mean_terminated_length": 969.8225708007812, "completions/min_length": 259.0, "completions/min_terminated_length": 270.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7249537408351898, "epoch": 0.6773399014778325, "frac_reward_zero_std": 0.0, "grad_norm": 0.04148532315358456, "kl": 0.012240090407431126, "learning_rate": 4.667081982667269e-05, "loss": -0.2217123806476593, "num_tokens": 42447090.0, "reward": 2.5390625, "reward_std": 2.0912795066833496, "rewards/reward_func/mean": 0.2821180555555556, "rewards/reward_func/std": 0.2783937735690011, "sampling/importance_sampling_ratio/max": 2.9994893074035645, "sampling/importance_sampling_ratio/mean": 0.9549602270126343, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.244540214538574, "sampling/sampling_logp_difference/mean": 0.18825289607048035, "step": 275, "step_time": 131.25489753391594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3414.0, "completions/max_terminated_length": 3414.0, "completions/mean_length": 910.75, "completions/mean_terminated_length": 846.2786254882812, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5957076847553253, "epoch": 0.6798029556650246, "frac_reward_zero_std": 0.0, "grad_norm": 0.03478113035151386, "kl": 0.011456226231530309, "learning_rate": 4.6646591352979416e-05, "loss": -0.03464243561029434, "num_tokens": 42579618.0, "reward": 2.87890625, "reward_std": 2.045607089996338, "rewards/reward_func/mean": 0.3198784722222222, "rewards/reward_func/std": 0.3012109100818634, "sampling/importance_sampling_ratio/max": 2.996354103088379, "sampling/importance_sampling_ratio/mean": 0.9606156945228577, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.22645378112793, "sampling/sampling_logp_difference/mean": 0.15759938955307007, "step": 276, "step_time": 93.82150396401994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3663.0, "completions/mean_length": 1254.34375, "completions/mean_terminated_length": 955.5178833007812, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7541182041168213, "epoch": 0.6822660098522167, "frac_reward_zero_std": 0.0, "grad_norm": 0.0320535517409538, "kl": 0.014798402087762952, "learning_rate": 4.6622281372188246e-05, "loss": -0.2057761400938034, "num_tokens": 42744888.0, "reward": 2.85546875, "reward_std": 1.9709218740463257, "rewards/reward_func/mean": 0.3172743055555556, "rewards/reward_func/std": 0.3003535072008769, "sampling/importance_sampling_ratio/max": 2.9964985847473145, "sampling/importance_sampling_ratio/mean": 0.9525945782661438, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.844799041748047, "sampling/sampling_logp_difference/mean": 0.19868908822536469, "step": 277, "step_time": 154.67366391490214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3779.0, "completions/mean_length": 1221.078125, "completions/mean_terminated_length": 832.92724609375, "completions/min_length": 137.0, "completions/min_terminated_length": 248.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8754851818084717, "epoch": 0.6847290640394089, "frac_reward_zero_std": 0.0, "grad_norm": 0.025582844382099238, "kl": 0.014622328337281942, "learning_rate": 4.6597889975834884e-05, "loss": 0.0713018923997879, "num_tokens": 42910589.0, "reward": 2.44921875, "reward_std": 1.9916391372680664, "rewards/reward_func/mean": 0.2721354166666667, "rewards/reward_func/std": 0.2889885538154178, "sampling/importance_sampling_ratio/max": 2.9914376735687256, "sampling/importance_sampling_ratio/mean": 0.9409353137016296, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.191876411437988, "sampling/sampling_logp_difference/mean": 0.21127820014953613, "step": 278, "step_time": 181.6098464212846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2695.0, "completions/mean_length": 1036.609375, "completions/mean_terminated_length": 722.22412109375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7221632450819016, "epoch": 0.687192118226601, "frac_reward_zero_std": 0.0, "grad_norm": 0.02575672515313856, "kl": 0.013656545663252473, "learning_rate": 4.657341725576159e-05, "loss": -0.08356674015522003, "num_tokens": 43068612.0, "reward": 2.30078125, "reward_std": 1.8301811218261719, "rewards/reward_func/mean": 0.2556423611111111, "rewards/reward_func/std": 0.2635475728246901, "sampling/importance_sampling_ratio/max": 2.99751353263855, "sampling/importance_sampling_ratio/mean": 0.9555066823959351, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.759946823120117, "sampling/sampling_logp_difference/mean": 0.18397647142410278, "step": 279, "step_time": 131.1261691369582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2070.0, "completions/mean_length": 795.53125, "completions/mean_terminated_length": 683.6551513671875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5730055123567581, "epoch": 0.6896551724137931, "frac_reward_zero_std": 0.0, "grad_norm": 0.032418687973505944, "kl": 0.014504492050036788, "learning_rate": 4.654886330411682e-05, "loss": -0.023671438917517662, "num_tokens": 43204822.0, "reward": 2.47265625, "reward_std": 1.9776114225387573, "rewards/reward_func/mean": 0.2747395833333333, "rewards/reward_func/std": 0.26046788858042824, "sampling/importance_sampling_ratio/max": 2.9993255138397217, "sampling/importance_sampling_ratio/mean": 0.9594089388847351, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.303296089172363, "sampling/sampling_logp_difference/mean": 0.15541130304336548, "step": 280, "step_time": 115.57059779204428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3354.0, "completions/mean_length": 1572.453125, "completions/mean_terminated_length": 1051.179931640625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "degenerate_groups_filtered": 0.0, "entropy": 1.0034317076206207, "epoch": 0.6921182266009852, "frac_reward_zero_std": 0.0, "grad_norm": 0.03455234529746986, "kl": 0.020893360022455454, "learning_rate": 4.6524228213354935e-05, "loss": -0.0927867442369461, "num_tokens": 43388723.0, "reward": 2.41796875, "reward_std": 2.0519323348999023, "rewards/reward_func/mean": 0.2686631944444444, "rewards/reward_func/std": 0.27052539587020874, "sampling/importance_sampling_ratio/max": 2.997314691543579, "sampling/importance_sampling_ratio/mean": 0.9392765760421753, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.711133003234863, "sampling/sampling_logp_difference/mean": 0.2265164703130722, "step": 281, "step_time": 188.2416215117555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3456.0, "completions/mean_length": 1600.859375, "completions/mean_terminated_length": 1088.019287109375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8214207738637924, "epoch": 0.6945812807881774, "frac_reward_zero_std": 0.0, "grad_norm": 0.02786223477474557, "kl": 0.016221630852669477, "learning_rate": 4.649951207623579e-05, "loss": -0.21129314601421356, "num_tokens": 43579530.0, "reward": 2.44921875, "reward_std": 2.133063554763794, "rewards/reward_func/mean": 0.2721354166666667, "rewards/reward_func/std": 0.3153829375902812, "sampling/importance_sampling_ratio/max": 2.9991559982299805, "sampling/importance_sampling_ratio/mean": 0.9338525533676147, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.611785888671875, "sampling/sampling_logp_difference/mean": 0.21409368515014648, "step": 282, "step_time": 134.65818647295237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3185.0, "completions/mean_length": 929.734375, "completions/mean_terminated_length": 777.6896362304688, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7013634294271469, "epoch": 0.6970443349753694, "frac_reward_zero_std": 0.0, "grad_norm": 0.03818994470061745, "kl": 0.016004534903913736, "learning_rate": 4.647471498582441e-05, "loss": -0.08519688248634338, "num_tokens": 43716649.0, "reward": 3.01953125, "reward_std": 2.0559170246124268, "rewards/reward_func/mean": 0.3355034722222222, "rewards/reward_func/std": 0.3121260437700484, "sampling/importance_sampling_ratio/max": 2.998469591140747, "sampling/importance_sampling_ratio/mean": 0.9565503001213074, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.915752410888672, "sampling/sampling_logp_difference/mean": 0.16929464042186737, "step": 283, "step_time": 130.65465216012672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2147.0, "completions/mean_length": 1232.59375, "completions/mean_terminated_length": 858.4181518554688, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6198261976242065, "epoch": 0.6995073891625616, "frac_reward_zero_std": 0.0, "grad_norm": 0.027115989619355605, "kl": 0.011915993178263307, "learning_rate": 4.644983703549063e-05, "loss": -0.18668434023857117, "num_tokens": 43875695.0, "reward": 2.4453125, "reward_std": 2.021446943283081, "rewards/reward_func/mean": 0.2717013888888889, "rewards/reward_func/std": 0.26602285272545284, "sampling/importance_sampling_ratio/max": 2.9976019859313965, "sampling/importance_sampling_ratio/mean": 0.9600894451141357, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.743024826049805, "sampling/sampling_logp_difference/mean": 0.16497497260570526, "step": 284, "step_time": 184.32950897398405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3321.0, "completions/mean_length": 1475.921875, "completions/mean_terminated_length": 1106.818115234375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8161509037017822, "epoch": 0.7019704433497537, "frac_reward_zero_std": 0.0, "grad_norm": 0.03215653834567183, "kl": 0.01845958409830928, "learning_rate": 4.642487831890878e-05, "loss": -0.1152234673500061, "num_tokens": 44054074.0, "reward": 2.1328125, "reward_std": 1.8033086061477661, "rewards/reward_func/mean": 0.23697916666666666, "rewards/reward_func/std": 0.2594776385360294, "sampling/importance_sampling_ratio/max": 2.9978108406066895, "sampling/importance_sampling_ratio/mean": 0.9379119873046875, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.68464469909668, "sampling/sampling_logp_difference/mean": 0.21339459717273712, "step": 285, "step_time": 132.44381054118276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2661.0, "completions/mean_length": 933.375, "completions/mean_terminated_length": 752.75439453125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7555945515632629, "epoch": 0.7044334975369458, "frac_reward_zero_std": 0.0, "grad_norm": 0.035057609426046354, "kl": 0.015247024362906814, "learning_rate": 4.639983893005728e-05, "loss": -0.09580279141664505, "num_tokens": 44201106.0, "reward": 3.03125, "reward_std": 2.050212860107422, "rewards/reward_func/mean": 0.3368055555555556, "rewards/reward_func/std": 0.2989240421189202, "sampling/importance_sampling_ratio/max": 2.9977879524230957, "sampling/importance_sampling_ratio/mean": 0.9536433219909668, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.074197769165039, "sampling/sampling_logp_difference/mean": 0.1975114643573761, "step": 286, "step_time": 117.73847694206052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 1073.0625, "completions/mean_terminated_length": 924.21435546875, "completions/min_length": 25.0, "completions/min_terminated_length": 305.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7307270914316177, "epoch": 0.7068965517241379, "frac_reward_zero_std": 0.0, "grad_norm": 0.03673160946883158, "kl": 0.013983387732878327, "learning_rate": 4.6374718963218306e-05, "loss": -0.0696837306022644, "num_tokens": 44357062.0, "reward": 2.734375, "reward_std": 2.0920650959014893, "rewards/reward_func/mean": 0.3038194444444444, "rewards/reward_func/std": 0.27683138350645703, "sampling/importance_sampling_ratio/max": 2.9985971450805664, "sampling/importance_sampling_ratio/mean": 0.9465474486351013, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.304844856262207, "sampling/sampling_logp_difference/mean": 0.18649157881736755, "step": 287, "step_time": 128.483488796046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 1295.15625, "completions/mean_terminated_length": 933.9608154296875, "completions/min_length": 163.0, "completions/min_terminated_length": 258.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6747958213090897, "epoch": 0.7093596059113301, "frac_reward_zero_std": 0.0, "grad_norm": 0.0476698774247551, "kl": 0.021229542326182127, "learning_rate": 4.6349518512977454e-05, "loss": -0.12250405550003052, "num_tokens": 44519904.0, "reward": 2.24609375, "reward_std": 1.9000272750854492, "rewards/reward_func/mean": 0.2495659722222222, "rewards/reward_func/std": 0.2909482667843501, "sampling/importance_sampling_ratio/max": 2.9965250492095947, "sampling/importance_sampling_ratio/mean": 0.9519683718681335, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.992049217224121, "sampling/sampling_logp_difference/mean": 0.18970781564712524, "step": 288, "step_time": 129.66917791822925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2655.0, "completions/mean_length": 1327.0625, "completions/mean_terminated_length": 979.8654174804688, "completions/min_length": 58.0, "completions/min_terminated_length": 311.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6578145921230316, "epoch": 0.7118226600985221, "frac_reward_zero_std": 0.0, "grad_norm": 0.0252083094255525, "kl": 0.013971152482554317, "learning_rate": 4.632423767422335e-05, "loss": -0.10885612666606903, "num_tokens": 44693508.0, "reward": 2.58203125, "reward_std": 1.9680254459381104, "rewards/reward_func/mean": 0.2868923611111111, "rewards/reward_func/std": 0.2571699288156297, "sampling/importance_sampling_ratio/max": 2.9997310638427734, "sampling/importance_sampling_ratio/mean": 0.9491783976554871, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.832304000854492, "sampling/sampling_logp_difference/mean": 0.19528895616531372, "step": 289, "step_time": 127.48815377894789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3186.0, "completions/mean_length": 1482.15625, "completions/mean_terminated_length": 1008.3018798828125, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7943369448184967, "epoch": 0.7142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.02661233898512643, "kl": 0.018482637824490666, "learning_rate": 4.629887654214735e-05, "loss": -0.022595541551709175, "num_tokens": 44881694.0, "reward": 2.85546875, "reward_std": 2.0806195735931396, "rewards/reward_func/mean": 0.3172743055555556, "rewards/reward_func/std": 0.3102165593041314, "sampling/importance_sampling_ratio/max": 2.9986367225646973, "sampling/importance_sampling_ratio/mean": 0.9434466361999512, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.933614730834961, "sampling/sampling_logp_difference/mean": 0.20633208751678467, "step": 290, "step_time": 132.85539328795858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3560.0, "completions/max_terminated_length": 3560.0, "completions/mean_length": 719.015625, "completions/mean_terminated_length": 679.1935424804688, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6536745727062225, "epoch": 0.7167487684729064, "frac_reward_zero_std": 0.0, "grad_norm": 0.04246414902090446, "kl": 0.017346940003335476, "learning_rate": 4.627343521224308e-05, "loss": -0.13849687576293945, "num_tokens": 45002463.0, "reward": 2.765625, "reward_std": 2.0245885848999023, "rewards/reward_func/mean": 0.3072916666666667, "rewards/reward_func/std": 0.2708511683675978, "sampling/importance_sampling_ratio/max": 2.9997777938842773, "sampling/importance_sampling_ratio/mean": 0.9552593231201172, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.37160873413086, "sampling/sampling_logp_difference/mean": 0.1778235137462616, "step": 291, "step_time": 93.1697849421762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3734.0, "completions/mean_length": 1379.671875, "completions/mean_terminated_length": 1132.877197265625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7533788681030273, "epoch": 0.7192118226600985, "frac_reward_zero_std": 0.0, "grad_norm": 0.18056458605098777, "kl": 0.013372477609664202, "learning_rate": 4.62479137803062e-05, "loss": 0.04155343025922775, "num_tokens": 45180074.0, "reward": 2.390625, "reward_std": 2.0686986446380615, "rewards/reward_func/mean": 0.265625, "rewards/reward_func/std": 0.2828355067306095, "sampling/importance_sampling_ratio/max": 2.9985997676849365, "sampling/importance_sampling_ratio/mean": 0.9440683126449585, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.936153411865234, "sampling/sampling_logp_difference/mean": 0.20370152592658997, "step": 292, "step_time": 142.51414789189585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2894.0, "completions/mean_length": 813.6875, "completions/mean_terminated_length": 767.0655517578125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6714795529842377, "epoch": 0.7216748768472906, "frac_reward_zero_std": 0.0, "grad_norm": 0.041960239215964895, "kl": 0.015974524896591902, "learning_rate": 4.6222312342433946e-05, "loss": 0.15983834862709045, "num_tokens": 45315654.0, "reward": 2.90234375, "reward_std": 2.0762343406677246, "rewards/reward_func/mean": 0.3224826388888889, "rewards/reward_func/std": 0.2825869388050503, "sampling/importance_sampling_ratio/max": 2.99784779548645, "sampling/importance_sampling_ratio/mean": 0.957971453666687, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.999382972717285, "sampling/sampling_logp_difference/mean": 0.17292124032974243, "step": 293, "step_time": 113.661530460231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3627.0, "completions/mean_length": 1062.609375, "completions/mean_terminated_length": 938.2500610351562, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7374127954244614, "epoch": 0.7241379310344828, "frac_reward_zero_std": 0.0, "grad_norm": 0.03368874150557419, "kl": 0.016085440292954445, "learning_rate": 4.6196630995024836e-05, "loss": -0.02485065534710884, "num_tokens": 45483533.0, "reward": 2.12890625, "reward_std": 1.9039392471313477, "rewards/reward_func/mean": 0.2365451388888889, "rewards/reward_func/std": 0.26622918744881946, "sampling/importance_sampling_ratio/max": 2.9999284744262695, "sampling/importance_sampling_ratio/mean": 0.9374723434448242, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.421833992004395, "sampling/sampling_logp_difference/mean": 0.20632421970367432, "step": 294, "step_time": 137.75506053632125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2120.0, "completions/mean_length": 960.90625, "completions/mean_terminated_length": 731.7719116210938, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6728474646806717, "epoch": 0.7266009852216748, "frac_reward_zero_std": 0.0, "grad_norm": 0.0371418483372251, "kl": 0.015450865030288696, "learning_rate": 4.617086983477823e-05, "loss": -0.23130658268928528, "num_tokens": 45622935.0, "reward": 2.84375, "reward_std": 2.0975704193115234, "rewards/reward_func/mean": 0.3159722222222222, "rewards/reward_func/std": 0.3071197188562817, "sampling/importance_sampling_ratio/max": 2.993969202041626, "sampling/importance_sampling_ratio/mean": 0.9563298225402832, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.287585258483887, "sampling/sampling_logp_difference/mean": 0.18286284804344177, "step": 295, "step_time": 115.8733365512453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3397.0, "completions/mean_length": 1296.0625, "completions/mean_terminated_length": 984.129638671875, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7566079348325729, "epoch": 0.729064039408867, "frac_reward_zero_std": 0.0, "grad_norm": 0.03087457463428798, "kl": 0.013222290901467204, "learning_rate": 4.614502895869405e-05, "loss": -0.13276249170303345, "num_tokens": 45794731.0, "reward": 2.55078125, "reward_std": 2.0712409019470215, "rewards/reward_func/mean": 0.2834201388888889, "rewards/reward_func/std": 0.2835977425177892, "sampling/importance_sampling_ratio/max": 2.9988484382629395, "sampling/importance_sampling_ratio/mean": 0.9488980770111084, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.859027862548828, "sampling/sampling_logp_difference/mean": 0.20444755256175995, "step": 296, "step_time": 133.08838602085598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2560.0, "completions/mean_length": 1113.171875, "completions/mean_terminated_length": 809.5614013671875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8098693490028381, "epoch": 0.7315270935960592, "frac_reward_zero_std": 0.0, "grad_norm": 0.031108469717071485, "kl": 0.018688400741666555, "learning_rate": 4.611910846407237e-05, "loss": -0.18503710627555847, "num_tokens": 45944214.0, "reward": 2.7890625, "reward_std": 2.0865302085876465, "rewards/reward_func/mean": 0.3098958333333333, "rewards/reward_func/std": 0.28468377225928837, "sampling/importance_sampling_ratio/max": 2.9885876178741455, "sampling/importance_sampling_ratio/mean": 0.957790195941925, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.835963249206543, "sampling/sampling_logp_difference/mean": 0.18420608341693878, "step": 297, "step_time": 125.53129041637294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2488.0, "completions/mean_length": 1286.109375, "completions/mean_terminated_length": 893.4706420898438, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6500519216060638, "epoch": 0.7339901477832512, "frac_reward_zero_std": 0.0, "grad_norm": 0.027085670223584147, "kl": 0.014615545980632305, "learning_rate": 4.6093108448513035e-05, "loss": -0.17975877225399017, "num_tokens": 46115437.0, "reward": 2.5, "reward_std": 2.040269136428833, "rewards/reward_func/mean": 0.2777777777777778, "rewards/reward_func/std": 0.2975609948237737, "sampling/importance_sampling_ratio/max": 2.9830408096313477, "sampling/importance_sampling_ratio/mean": 0.9469696283340454, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.986211776733398, "sampling/sampling_logp_difference/mean": 0.19814851880073547, "step": 298, "step_time": 131.17711529205553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3694.0, "completions/mean_length": 1236.609375, "completions/mean_terminated_length": 1096.11865234375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8978325873613358, "epoch": 0.7364532019704434, "frac_reward_zero_std": 0.0, "grad_norm": 0.050533787797288364, "kl": 0.015605957480147481, "learning_rate": 4.6067029009915345e-05, "loss": -0.1004580408334732, "num_tokens": 46276308.0, "reward": 2.4609375, "reward_std": 1.9498772621154785, "rewards/reward_func/mean": 0.2734375, "rewards/reward_func/std": 0.2673751397265328, "sampling/importance_sampling_ratio/max": 2.999791145324707, "sampling/importance_sampling_ratio/mean": 0.9466140270233154, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.746261596679688, "sampling/sampling_logp_difference/mean": 0.20931372046470642, "step": 299, "step_time": 132.20947787398472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 1200.5, "completions/mean_terminated_length": 1008.8965454101562, "completions/min_length": 203.0, "completions/min_terminated_length": 284.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7067240923643112, "epoch": 0.7389162561576355, "frac_reward_zero_std": 0.0, "grad_norm": 0.040695395873213816, "kl": 0.014059947803616524, "learning_rate": 4.6040870246477636e-05, "loss": -0.11653882265090942, "num_tokens": 46446148.0, "reward": 3.0, "reward_std": 2.1588687896728516, "rewards/reward_func/mean": 0.3333333333333333, "rewards/reward_func/std": 0.2878561284806993, "sampling/importance_sampling_ratio/max": 2.9961273670196533, "sampling/importance_sampling_ratio/mean": 0.9437478184700012, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.122971534729004, "sampling/sampling_logp_difference/mean": 0.20880259573459625, "step": 300, "step_time": 130.5556501680985 } ], "logging_steps": 1, "max_steps": 1624, "num_input_tokens_seen": 46446148, "num_train_epochs": 4, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }