{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.1690794974565506, "epoch": 0.001, "frac_reward_zero_std": 0.0, "grad_norm": 1.7694201469421387, "kl": 0.0, "learning_rate": 0.0, "loss": -0.2838, "num_tokens": 2891.0, "reward": 0.45499998331069946, "reward_std": 0.6123452186584473, "rewards/reward_func/mean": 0.45499998331069946, "rewards/reward_func/std": 0.612345278263092, "sampling/importance_sampling_ratio/max": 1.8600257635116577, "sampling/importance_sampling_ratio/mean": 1.3248674869537354, "sampling/importance_sampling_ratio/min": 0.6954079270362854, "sampling/sampling_logp_difference/max": 0.3488762378692627, "sampling/sampling_logp_difference/mean": 0.019007110968232155, "step": 1, "step_time": 28.526343904959504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.25211217999458313, "epoch": 0.002, "frac_reward_zero_std": 0.0, "grad_norm": 1.065087080001831, "kl": 0.0, "learning_rate": 1.6666666666666668e-07, "loss": 0.1109, "num_tokens": 5506.0, "reward": 0.45500001311302185, "reward_std": 0.6066575646400452, "rewards/reward_func/mean": 0.45500001311302185, "rewards/reward_func/std": 0.6066575050354004, "sampling/importance_sampling_ratio/max": 1.186666488647461, "sampling/importance_sampling_ratio/mean": 0.9661478996276855, "sampling/importance_sampling_ratio/min": 0.43538862466812134, "sampling/sampling_logp_difference/max": 0.32524752616882324, "sampling/sampling_logp_difference/mean": 0.01666935160756111, "step": 2, "step_time": 23.224473382986616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.15375655889511108, "epoch": 0.003, "frac_reward_zero_std": 0.0, "grad_norm": 0.7260156869888306, "kl": 0.000387201172998175, "learning_rate": 3.3333333333333335e-07, "loss": -0.0158, "num_tokens": 7967.0, "reward": 0.7250000238418579, "reward_std": 0.4850085973739624, "rewards/reward_func/mean": 0.7250000238418579, "rewards/reward_func/std": 0.4850085973739624, "sampling/importance_sampling_ratio/max": 1.035571575164795, "sampling/importance_sampling_ratio/mean": 0.8206162452697754, "sampling/importance_sampling_ratio/min": 0.6426183581352234, "sampling/sampling_logp_difference/max": 0.2029057741165161, "sampling/sampling_logp_difference/mean": 0.011877943761646748, "step": 3, "step_time": 17.72353470302187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.2212316393852234, "epoch": 0.004, "frac_reward_zero_std": 0.0, "grad_norm": 0.7334376573562622, "kl": 0.0022102410439401865, "learning_rate": 5.000000000000001e-07, "loss": -0.1995, "num_tokens": 10952.0, "reward": 0.7325000166893005, "reward_std": 0.488629013299942, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.488629013299942, "sampling/importance_sampling_ratio/max": 1.0403285026550293, "sampling/importance_sampling_ratio/mean": 0.761037290096283, "sampling/importance_sampling_ratio/min": 0.3470890522003174, "sampling/sampling_logp_difference/max": 0.9722568988800049, "sampling/sampling_logp_difference/mean": 0.023567143827676773, "step": 4, "step_time": 26.211164197011385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.14337338507175446, "epoch": 0.005, "frac_reward_zero_std": 0.0, "grad_norm": 0.4706338346004486, "kl": 0.00046622363151982427, "learning_rate": 6.666666666666667e-07, "loss": 0.0025, "num_tokens": 14061.0, "reward": 0.21000000834465027, "reward_std": 0.52801513671875, "rewards/reward_func/mean": 0.21000000834465027, "rewards/reward_func/std": 0.52801513671875, "sampling/importance_sampling_ratio/max": 1.1055026054382324, "sampling/importance_sampling_ratio/mean": 0.9579700231552124, "sampling/importance_sampling_ratio/min": 0.850264310836792, "sampling/sampling_logp_difference/max": 0.21829354763031006, "sampling/sampling_logp_difference/mean": 0.00928188394755125, "step": 5, "step_time": 30.61694531404646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 44.5, "completions/mean_terminated_length": 44.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.19655971229076385, "epoch": 0.006, "frac_reward_zero_std": 0.0, "grad_norm": 4.993420124053955, "kl": 0.001883858349174261, "learning_rate": 8.333333333333333e-07, "loss": -0.3115, "num_tokens": 16326.0, "reward": 0.9900000095367432, "reward_std": 0.008164958097040653, "rewards/reward_func/mean": 0.9900000095367432, "rewards/reward_func/std": 0.008164958097040653, "sampling/importance_sampling_ratio/max": 1.9480223655700684, "sampling/importance_sampling_ratio/mean": 1.2064706087112427, "sampling/importance_sampling_ratio/min": 0.786892831325531, "sampling/sampling_logp_difference/max": 0.552229642868042, "sampling/sampling_logp_difference/mean": 0.02012234926223755, "step": 6, "step_time": 13.275526605022606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 57.75, "completions/mean_terminated_length": 57.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.21491773426532745, "epoch": 0.007, "frac_reward_zero_std": 0.0, "grad_norm": 0.6698219776153564, "kl": 0.001308084581978619, "learning_rate": 1.0000000000000002e-06, "loss": 0.0146, "num_tokens": 19580.0, "reward": 0.20000000298023224, "reward_std": 0.5343531966209412, "rewards/reward_func/mean": 0.20000000298023224, "rewards/reward_func/std": 0.5343531966209412, "sampling/importance_sampling_ratio/max": 0.8031184077262878, "sampling/importance_sampling_ratio/mean": 0.6652683019638062, "sampling/importance_sampling_ratio/min": 0.511317789554596, "sampling/sampling_logp_difference/max": 0.4191169738769531, "sampling/sampling_logp_difference/mean": 0.019288605079054832, "step": 7, "step_time": 33.92142505198717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.1761166900396347, "epoch": 0.008, "frac_reward_zero_std": 0.0, "grad_norm": 1.2735919952392578, "kl": 0.0024672183208167553, "learning_rate": 1.1666666666666668e-06, "loss": 0.0549, "num_tokens": 22411.0, "reward": -0.06499999761581421, "reward_std": 0.05916079506278038, "rewards/reward_func/mean": -0.06499999761581421, "rewards/reward_func/std": 0.05916079878807068, "sampling/importance_sampling_ratio/max": 1.1058381795883179, "sampling/importance_sampling_ratio/mean": 0.9285240173339844, "sampling/importance_sampling_ratio/min": 0.6420994997024536, "sampling/sampling_logp_difference/max": 0.3237142562866211, "sampling/sampling_logp_difference/mean": 0.01667601801455021, "step": 8, "step_time": 27.820944591017906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.19499044120311737, "epoch": 0.009, "frac_reward_zero_std": 0.0, "grad_norm": 0.6493493914604187, "kl": 0.0014519351534545422, "learning_rate": 1.3333333333333334e-06, "loss": 0.027, "num_tokens": 25058.0, "reward": 0.20749999582767487, "reward_std": 0.5302436351776123, "rewards/reward_func/mean": 0.20749999582767487, "rewards/reward_func/std": 0.5302436351776123, "sampling/importance_sampling_ratio/max": 1.536926031112671, "sampling/importance_sampling_ratio/mean": 0.8416682481765747, "sampling/importance_sampling_ratio/min": 0.5120096802711487, "sampling/sampling_logp_difference/max": 0.3361164331436157, "sampling/sampling_logp_difference/mean": 0.015237247571349144, "step": 9, "step_time": 23.23614341497887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.18325170874595642, "epoch": 0.01, "frac_reward_zero_std": 0.0, "grad_norm": 1.2250245809555054, "kl": 0.0007307051564566791, "learning_rate": 1.5e-06, "loss": -0.0388, "num_tokens": 27748.0, "reward": 0.20499999821186066, "reward_std": 0.525642454624176, "rewards/reward_func/mean": 0.20499999821186066, "rewards/reward_func/std": 0.525642454624176, "sampling/importance_sampling_ratio/max": 1.5835230350494385, "sampling/importance_sampling_ratio/mean": 1.143486499786377, "sampling/importance_sampling_ratio/min": 0.7374402284622192, "sampling/sampling_logp_difference/max": 0.3930445909500122, "sampling/sampling_logp_difference/mean": 0.01502042543143034, "step": 10, "step_time": 27.988202619017102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 61.25, "completions/mean_terminated_length": 61.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.21740968525409698, "epoch": 0.011, "frac_reward_zero_std": 0.0, "grad_norm": 1.256651520729065, "kl": 0.0032303757034242153, "learning_rate": 1.6666666666666667e-06, "loss": -0.0885, "num_tokens": 30979.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 1.3089040517807007, "sampling/importance_sampling_ratio/mean": 1.1207859516143799, "sampling/importance_sampling_ratio/min": 0.7530092000961304, "sampling/sampling_logp_difference/max": 0.8883767127990723, "sampling/sampling_logp_difference/mean": 0.020340321585536003, "step": 11, "step_time": 19.88501465099398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.17663566768169403, "epoch": 0.012, "frac_reward_zero_std": 0.0, "grad_norm": 2.204960346221924, "kl": 0.0012955483980476856, "learning_rate": 1.8333333333333333e-06, "loss": -0.1753, "num_tokens": 33906.0, "reward": 0.4775000214576721, "reward_std": 0.5980733036994934, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.5980733036994934, "sampling/importance_sampling_ratio/max": 1.6643222570419312, "sampling/importance_sampling_ratio/mean": 0.8751918077468872, "sampling/importance_sampling_ratio/min": 0.39312252402305603, "sampling/sampling_logp_difference/max": 0.4464702606201172, "sampling/sampling_logp_difference/mean": 0.018733447417616844, "step": 12, "step_time": 23.57167529099388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.2222597599029541, "epoch": 0.013, "frac_reward_zero_std": 0.0, "grad_norm": 1.3614099025726318, "kl": 0.0021189097315073013, "learning_rate": 2.0000000000000003e-06, "loss": -0.3654, "num_tokens": 37185.0, "reward": 0.4975000023841858, "reward_std": 0.5802513957023621, "rewards/reward_func/mean": 0.4975000023841858, "rewards/reward_func/std": 0.5802513957023621, "sampling/importance_sampling_ratio/max": 2.2725558280944824, "sampling/importance_sampling_ratio/mean": 1.10244619846344, "sampling/importance_sampling_ratio/min": 0.41275277733802795, "sampling/sampling_logp_difference/max": 0.49567699432373047, "sampling/sampling_logp_difference/mean": 0.0183317419141531, "step": 13, "step_time": 27.432764305965975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.17930717766284943, "epoch": 0.014, "frac_reward_zero_std": 0.0, "grad_norm": 1.4246803522109985, "kl": 0.0013900657650083303, "learning_rate": 2.166666666666667e-06, "loss": 0.0577, "num_tokens": 39521.0, "reward": 0.9900000095367432, "reward_std": 0.019999999552965164, "rewards/reward_func/mean": 0.9900000095367432, "rewards/reward_func/std": 0.02000001072883606, "sampling/importance_sampling_ratio/max": 1.34236478805542, "sampling/importance_sampling_ratio/mean": 1.1689629554748535, "sampling/importance_sampling_ratio/min": 1.0116561651229858, "sampling/sampling_logp_difference/max": 0.3754175901412964, "sampling/sampling_logp_difference/mean": 0.01138111762702465, "step": 14, "step_time": 12.762974106008187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.181468665599823, "epoch": 0.015, "frac_reward_zero_std": 0.0, "grad_norm": 0.7324386835098267, "kl": 0.0012912629172205925, "learning_rate": 2.3333333333333336e-06, "loss": 0.2737, "num_tokens": 42388.0, "reward": 0.23750001192092896, "reward_std": 0.5084207653999329, "rewards/reward_func/mean": 0.23750001192092896, "rewards/reward_func/std": 0.5084207653999329, "sampling/importance_sampling_ratio/max": 1.5870178937911987, "sampling/importance_sampling_ratio/mean": 1.0515062808990479, "sampling/importance_sampling_ratio/min": 0.5773301720619202, "sampling/sampling_logp_difference/max": 0.6747937202453613, "sampling/sampling_logp_difference/mean": 0.015356915071606636, "step": 15, "step_time": 28.87673272797838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.17757576704025269, "epoch": 0.016, "frac_reward_zero_std": 0.0, "grad_norm": 0.7791656851768494, "kl": 0.0011586352484300733, "learning_rate": 2.5e-06, "loss": 0.0899, "num_tokens": 45137.0, "reward": 0.21000000834465027, "reward_std": 0.5212165117263794, "rewards/reward_func/mean": 0.21000000834465027, "rewards/reward_func/std": 0.5212165117263794, "sampling/importance_sampling_ratio/max": 0.9811292290687561, "sampling/importance_sampling_ratio/mean": 0.8586262464523315, "sampling/importance_sampling_ratio/min": 0.6450943946838379, "sampling/sampling_logp_difference/max": 0.31294405460357666, "sampling/sampling_logp_difference/mean": 0.011509422212839127, "step": 16, "step_time": 29.512227962026373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.2108621597290039, "epoch": 0.017, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017789137782528996, "kl": 0.0008321161731146276, "learning_rate": 2.666666666666667e-06, "loss": 0.0, "num_tokens": 47503.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2581455707550049, "sampling/importance_sampling_ratio/mean": 1.0041334629058838, "sampling/importance_sampling_ratio/min": 0.7038960456848145, "sampling/sampling_logp_difference/max": 0.33078861236572266, "sampling/sampling_logp_difference/mean": 0.012301802635192871, "step": 17, "step_time": 9.200428290001582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.2184532880783081, "epoch": 0.018, "frac_reward_zero_std": 0.0, "grad_norm": 1.601157784461975, "kl": 0.0019322438165545464, "learning_rate": 2.8333333333333335e-06, "loss": 0.4322, "num_tokens": 50320.0, "reward": 0.4775000214576721, "reward_std": 0.5804811120033264, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.5804811120033264, "sampling/importance_sampling_ratio/max": 1.7848742008209229, "sampling/importance_sampling_ratio/mean": 1.0737558603286743, "sampling/importance_sampling_ratio/min": 0.3417838513851166, "sampling/sampling_logp_difference/max": 0.6254115104675293, "sampling/sampling_logp_difference/mean": 0.023144036531448364, "step": 18, "step_time": 27.39610268798424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 62.75, "completions/mean_terminated_length": 62.75, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.23026728630065918, "epoch": 0.019, "frac_reward_zero_std": 0.0, "grad_norm": 0.7251322865486145, "kl": 0.001008337247185409, "learning_rate": 3e-06, "loss": -0.1061, "num_tokens": 52849.0, "reward": 0.6749999523162842, "reward_std": 0.538918673992157, "rewards/reward_func/mean": 0.6749999523162842, "rewards/reward_func/std": 0.538918673992157, "sampling/importance_sampling_ratio/max": 1.2397409677505493, "sampling/importance_sampling_ratio/mean": 0.8263118863105774, "sampling/importance_sampling_ratio/min": 0.6256706118583679, "sampling/sampling_logp_difference/max": 0.4246499538421631, "sampling/sampling_logp_difference/mean": 0.01839565299451351, "step": 19, "step_time": 23.762960355030373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.20103175938129425, "epoch": 0.02, "frac_reward_zero_std": 0.0, "grad_norm": 0.8409279584884644, "kl": 0.0011635932605713606, "learning_rate": 3.1666666666666667e-06, "loss": 0.02, "num_tokens": 55976.0, "reward": 0.1899999976158142, "reward_std": 0.5212165117263794, "rewards/reward_func/mean": 0.1899999976158142, "rewards/reward_func/std": 0.5212165713310242, "sampling/importance_sampling_ratio/max": 0.8928262591362, "sampling/importance_sampling_ratio/mean": 0.8174147009849548, "sampling/importance_sampling_ratio/min": 0.6498741507530212, "sampling/sampling_logp_difference/max": 0.3282057046890259, "sampling/sampling_logp_difference/mean": 0.016716880723834038, "step": 20, "step_time": 34.79140626901062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.24307209253311157, "epoch": 0.021, "frac_reward_zero_std": 0.0, "grad_norm": 2.4089884757995605, "kl": 0.005192960146814585, "learning_rate": 3.3333333333333333e-06, "loss": 0.0297, "num_tokens": 58947.0, "reward": 0.49000000953674316, "reward_std": 0.5889538526535034, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5889538526535034, "sampling/importance_sampling_ratio/max": 1.8641705513000488, "sampling/importance_sampling_ratio/mean": 1.017917275428772, "sampling/importance_sampling_ratio/min": 0.15175653994083405, "sampling/sampling_logp_difference/max": 1.413463830947876, "sampling/sampling_logp_difference/mean": 0.038106679916381836, "step": 21, "step_time": 19.72863965400029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.16037502884864807, "epoch": 0.022, "frac_reward_zero_std": 0.0, "grad_norm": 1.9025684595108032, "kl": 0.0024386204313486814, "learning_rate": 3.5e-06, "loss": -0.3497, "num_tokens": 61866.0, "reward": 0.4724999964237213, "reward_std": 0.6091181635856628, "rewards/reward_func/mean": 0.4724999964237213, "rewards/reward_func/std": 0.6091182231903076, "sampling/importance_sampling_ratio/max": 2.9466702938079834, "sampling/importance_sampling_ratio/mean": 1.4579466581344604, "sampling/importance_sampling_ratio/min": 0.7423606514930725, "sampling/sampling_logp_difference/max": 0.34435367584228516, "sampling/sampling_logp_difference/mean": 0.018071508035063744, "step": 22, "step_time": 23.186725946026854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.19720782339572906, "epoch": 0.023, "frac_reward_zero_std": 0.0, "grad_norm": 0.9149781465530396, "kl": 0.0011606084881350398, "learning_rate": 3.6666666666666666e-06, "loss": 0.0361, "num_tokens": 64949.0, "reward": 0.48750001192092896, "reward_std": 0.5921359658241272, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.592136025428772, "sampling/importance_sampling_ratio/max": 1.0435854196548462, "sampling/importance_sampling_ratio/mean": 0.862576961517334, "sampling/importance_sampling_ratio/min": 0.7559348344802856, "sampling/sampling_logp_difference/max": 0.34287071228027344, "sampling/sampling_logp_difference/mean": 0.017276717349886894, "step": 23, "step_time": 22.19894163700519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.15801791846752167, "epoch": 0.024, "frac_reward_zero_std": 0.0, "grad_norm": 0.7708470821380615, "kl": 0.000878384686075151, "learning_rate": 3.833333333333334e-06, "loss": -0.0433, "num_tokens": 67889.0, "reward": 0.7124999761581421, "reward_std": 0.535062313079834, "rewards/reward_func/mean": 0.7124999761581421, "rewards/reward_func/std": 0.5350623726844788, "sampling/importance_sampling_ratio/max": 1.8083628416061401, "sampling/importance_sampling_ratio/mean": 0.807941198348999, "sampling/importance_sampling_ratio/min": 0.3332298994064331, "sampling/sampling_logp_difference/max": 0.474312424659729, "sampling/sampling_logp_difference/mean": 0.021255692467093468, "step": 24, "step_time": 28.712539033964276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.19440847635269165, "epoch": 0.025, "frac_reward_zero_std": 0.0, "grad_norm": 3.9135429859161377, "kl": 0.001603958779014647, "learning_rate": 4.000000000000001e-06, "loss": 0.3185, "num_tokens": 70486.0, "reward": 0.7175000309944153, "reward_std": 0.4797481894493103, "rewards/reward_func/mean": 0.7175000309944153, "rewards/reward_func/std": 0.4797481894493103, "sampling/importance_sampling_ratio/max": 2.502864360809326, "sampling/importance_sampling_ratio/mean": 1.469748616218567, "sampling/importance_sampling_ratio/min": 0.8149593472480774, "sampling/sampling_logp_difference/max": 0.3406977653503418, "sampling/sampling_logp_difference/mean": 0.017325270920991898, "step": 25, "step_time": 21.12462861801032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.18733122944831848, "epoch": 0.026, "frac_reward_zero_std": 0.0, "grad_norm": 1.9654920101165771, "kl": 0.0021408493630588055, "learning_rate": 4.166666666666667e-06, "loss": 0.0049, "num_tokens": 73363.0, "reward": 0.7325000166893005, "reward_std": 0.5283543467521667, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5283544063568115, "sampling/importance_sampling_ratio/max": 1.5510928630828857, "sampling/importance_sampling_ratio/mean": 1.1689436435699463, "sampling/importance_sampling_ratio/min": 0.8162763714790344, "sampling/sampling_logp_difference/max": 0.7810912132263184, "sampling/sampling_logp_difference/mean": 0.018673431128263474, "step": 26, "step_time": 21.690037710010074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.24256083369255066, "epoch": 0.027, "frac_reward_zero_std": 0.0, "grad_norm": 0.740149199962616, "kl": 0.00063083361601457, "learning_rate": 4.333333333333334e-06, "loss": -0.0407, "num_tokens": 75881.0, "reward": 0.9900000095367432, "reward_std": 0.019999999552965164, "rewards/reward_func/mean": 0.9900000095367432, "rewards/reward_func/std": 0.02000001072883606, "sampling/importance_sampling_ratio/max": 1.0160430669784546, "sampling/importance_sampling_ratio/mean": 0.7664977312088013, "sampling/importance_sampling_ratio/min": 0.5705021619796753, "sampling/sampling_logp_difference/max": 0.3010873794555664, "sampling/sampling_logp_difference/mean": 0.018683861941099167, "step": 27, "step_time": 12.669248107995372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.2366439402103424, "epoch": 0.028, "frac_reward_zero_std": 0.0, "grad_norm": 1.0369964838027954, "kl": 0.0012403648579493165, "learning_rate": 4.5e-06, "loss": -0.1528, "num_tokens": 78878.0, "reward": 0.4925000071525574, "reward_std": 0.5860247611999512, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5860247015953064, "sampling/importance_sampling_ratio/max": 1.1833999156951904, "sampling/importance_sampling_ratio/mean": 0.9274792671203613, "sampling/importance_sampling_ratio/min": 0.6121615767478943, "sampling/sampling_logp_difference/max": 0.3726520538330078, "sampling/sampling_logp_difference/mean": 0.016752678900957108, "step": 28, "step_time": 22.686215320019983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.17702580988407135, "epoch": 0.029, "frac_reward_zero_std": 0.0, "grad_norm": 0.9103659391403198, "kl": 0.0025775833055377007, "learning_rate": 4.666666666666667e-06, "loss": 0.0843, "num_tokens": 82198.0, "reward": 0.20000000298023224, "reward_std": 0.5358482599258423, "rewards/reward_func/mean": 0.20000000298023224, "rewards/reward_func/std": 0.5358482599258423, "sampling/importance_sampling_ratio/max": 1.0262494087219238, "sampling/importance_sampling_ratio/mean": 0.8293523788452148, "sampling/importance_sampling_ratio/min": 0.6377174854278564, "sampling/sampling_logp_difference/max": 0.5646309852600098, "sampling/sampling_logp_difference/mean": 0.020577674731612206, "step": 29, "step_time": 33.133232187945396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.19803452491760254, "epoch": 0.03, "frac_reward_zero_std": 0.0, "grad_norm": 1.4241100549697876, "kl": 0.0012677285121753812, "learning_rate": 4.833333333333333e-06, "loss": 0.0696, "num_tokens": 85079.0, "reward": 0.19749999046325684, "reward_std": 0.5361825227737427, "rewards/reward_func/mean": 0.19749999046325684, "rewards/reward_func/std": 0.5361825227737427, "sampling/importance_sampling_ratio/max": 1.7217131853103638, "sampling/importance_sampling_ratio/mean": 1.4462528228759766, "sampling/importance_sampling_ratio/min": 1.1936447620391846, "sampling/sampling_logp_difference/max": 0.3546537160873413, "sampling/sampling_logp_difference/mean": 0.015671029686927795, "step": 30, "step_time": 32.31156441604253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.2113637775182724, "epoch": 0.031, "frac_reward_zero_std": 0.0, "grad_norm": 1.3625730276107788, "kl": 0.0018731742165982723, "learning_rate": 5e-06, "loss": 0.2336, "num_tokens": 87815.0, "reward": 0.7100000381469727, "reward_std": 0.5600595474243164, "rewards/reward_func/mean": 0.7100000381469727, "rewards/reward_func/std": 0.5600595474243164, "sampling/importance_sampling_ratio/max": 1.1667720079421997, "sampling/importance_sampling_ratio/mean": 0.7437838315963745, "sampling/importance_sampling_ratio/min": 0.4153501093387604, "sampling/sampling_logp_difference/max": 0.4128420352935791, "sampling/sampling_logp_difference/mean": 0.016734153032302856, "step": 31, "step_time": 23.09143936599139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.20192047953605652, "epoch": 0.032, "frac_reward_zero_std": 0.0, "grad_norm": 1.5310053825378418, "kl": 0.0009873858653008938, "learning_rate": 4.99998688809149e-06, "loss": -0.2058, "num_tokens": 90384.0, "reward": 0.7425000071525574, "reward_std": 0.4950000047683716, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.4950000047683716, "sampling/importance_sampling_ratio/max": 1.8033684492111206, "sampling/importance_sampling_ratio/mean": 1.410750389099121, "sampling/importance_sampling_ratio/min": 0.9831962585449219, "sampling/sampling_logp_difference/max": 0.48407530784606934, "sampling/sampling_logp_difference/mean": 0.01803717575967312, "step": 32, "step_time": 16.232356516004074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.15265171229839325, "epoch": 0.033, "frac_reward_zero_std": 0.0, "grad_norm": 1.2789109945297241, "kl": 0.0014273010892793536, "learning_rate": 4.9999475525034974e-06, "loss": 0.2443, "num_tokens": 93459.0, "reward": 0.21250000596046448, "reward_std": 0.5273439288139343, "rewards/reward_func/mean": 0.21250000596046448, "rewards/reward_func/std": 0.5273439884185791, "sampling/importance_sampling_ratio/max": 1.7152032852172852, "sampling/importance_sampling_ratio/mean": 1.0911431312561035, "sampling/importance_sampling_ratio/min": 0.6519092321395874, "sampling/sampling_logp_difference/max": 0.3830188512802124, "sampling/sampling_logp_difference/mean": 0.015168999321758747, "step": 33, "step_time": 27.38160407100804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.16871626675128937, "epoch": 0.034, "frac_reward_zero_std": 0.0, "grad_norm": 1.5993002653121948, "kl": 0.0012848010519519448, "learning_rate": 4.999881993648633e-06, "loss": 0.6006, "num_tokens": 96452.0, "reward": 0.4700000286102295, "reward_std": 0.5838379263877869, "rewards/reward_func/mean": 0.4700000286102295, "rewards/reward_func/std": 0.5838378667831421, "sampling/importance_sampling_ratio/max": 2.7293713092803955, "sampling/importance_sampling_ratio/mean": 1.2541234493255615, "sampling/importance_sampling_ratio/min": 0.551033079624176, "sampling/sampling_logp_difference/max": 0.4532332420349121, "sampling/sampling_logp_difference/mean": 0.017988380044698715, "step": 34, "step_time": 27.34414886799641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1804264634847641, "epoch": 0.035, "frac_reward_zero_std": 0.0, "grad_norm": 0.9022880792617798, "kl": 0.0009833512594923377, "learning_rate": 4.99979021221458e-06, "loss": -0.0283, "num_tokens": 99630.0, "reward": 0.20499999821186066, "reward_std": 0.5313190817832947, "rewards/reward_func/mean": 0.20499999821186066, "rewards/reward_func/std": 0.5313190817832947, "sampling/importance_sampling_ratio/max": 0.9818359613418579, "sampling/importance_sampling_ratio/mean": 0.7903547286987305, "sampling/importance_sampling_ratio/min": 0.5214921236038208, "sampling/sampling_logp_difference/max": 0.5723757743835449, "sampling/sampling_logp_difference/mean": 0.020144499838352203, "step": 35, "step_time": 31.463614504027646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.18060731887817383, "epoch": 0.036, "frac_reward_zero_std": 0.0, "grad_norm": 0.9135256409645081, "kl": 0.001348503283225, "learning_rate": 4.9996722091640805e-06, "loss": 0.1413, "num_tokens": 102328.0, "reward": 0.45499998331069946, "reward_std": 0.6297882795333862, "rewards/reward_func/mean": 0.45499998331069946, "rewards/reward_func/std": 0.629788339138031, "sampling/importance_sampling_ratio/max": 1.2054076194763184, "sampling/importance_sampling_ratio/mean": 0.7335603833198547, "sampling/importance_sampling_ratio/min": 0.3148641586303711, "sampling/sampling_logp_difference/max": 0.4715766906738281, "sampling/sampling_logp_difference/mean": 0.02093822881579399, "step": 36, "step_time": 22.815022797964048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.18031904101371765, "epoch": 0.037, "frac_reward_zero_std": 0.0, "grad_norm": 1.479186773300171, "kl": 0.0007664791774004698, "learning_rate": 4.999527985734932e-06, "loss": -0.0005, "num_tokens": 104722.0, "reward": 0.19249999523162842, "reward_std": 0.5384777784347534, "rewards/reward_func/mean": 0.19249999523162842, "rewards/reward_func/std": 0.5384777784347534, "sampling/importance_sampling_ratio/max": 1.2917379140853882, "sampling/importance_sampling_ratio/mean": 1.1469188928604126, "sampling/importance_sampling_ratio/min": 1.0468064546585083, "sampling/sampling_logp_difference/max": 0.3037198781967163, "sampling/sampling_logp_difference/mean": 0.01134363655000925, "step": 37, "step_time": 24.720144782040734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.22419334948062897, "epoch": 0.038, "frac_reward_zero_std": 0.0, "grad_norm": 0.7955169677734375, "kl": 0.0014140852726995945, "learning_rate": 4.999357543439969e-06, "loss": 0.0602, "num_tokens": 107099.0, "reward": 0.20749999582767487, "reward_std": 0.5303693413734436, "rewards/reward_func/mean": 0.20749999582767487, "rewards/reward_func/std": 0.5303693413734436, "sampling/importance_sampling_ratio/max": 1.0965179204940796, "sampling/importance_sampling_ratio/mean": 0.9815975427627563, "sampling/importance_sampling_ratio/min": 0.9259361624717712, "sampling/sampling_logp_difference/max": 0.29874086380004883, "sampling/sampling_logp_difference/mean": 0.013619343750178814, "step": 38, "step_time": 29.20699664397398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.18815650045871735, "epoch": 0.039, "frac_reward_zero_std": 0.0, "grad_norm": 1.5825306177139282, "kl": 0.0023592538200318813, "learning_rate": 4.999160884067051e-06, "loss": 0.0457, "num_tokens": 110346.0, "reward": 0.2199999988079071, "reward_std": 0.5208326578140259, "rewards/reward_func/mean": 0.2199999988079071, "rewards/reward_func/std": 0.5208326578140259, "sampling/importance_sampling_ratio/max": 1.646105170249939, "sampling/importance_sampling_ratio/mean": 1.2207181453704834, "sampling/importance_sampling_ratio/min": 0.7354973554611206, "sampling/sampling_logp_difference/max": 0.39037227630615234, "sampling/sampling_logp_difference/mean": 0.02397393435239792, "step": 39, "step_time": 30.393466276000254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.16454768180847168, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 2.114875555038452, "kl": 0.0021705932449549437, "learning_rate": 4.9989380096790416e-06, "loss": 0.2242, "num_tokens": 113456.0, "reward": 0.7475000023841858, "reward_std": 0.4983556270599365, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.4983556270599365, "sampling/importance_sampling_ratio/max": 1.4531859159469604, "sampling/importance_sampling_ratio/mean": 1.0833818912506104, "sampling/importance_sampling_ratio/min": 0.7822138071060181, "sampling/sampling_logp_difference/max": 0.6953859329223633, "sampling/sampling_logp_difference/mean": 0.023247824981808662, "step": 40, "step_time": 25.57675183797255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.20822198688983917, "epoch": 0.041, "frac_reward_zero_std": 0.0, "grad_norm": 1.7324362993240356, "kl": 0.0022688840981572866, "learning_rate": 4.998688922613788e-06, "loss": 0.0637, "num_tokens": 116049.0, "reward": 0.22499999403953552, "reward_std": 0.48363208770751953, "rewards/reward_func/mean": 0.22499999403953552, "rewards/reward_func/std": 0.4836321473121643, "sampling/importance_sampling_ratio/max": 1.0998011827468872, "sampling/importance_sampling_ratio/mean": 0.8031355142593384, "sampling/importance_sampling_ratio/min": 0.4663705825805664, "sampling/sampling_logp_difference/max": 0.389176607131958, "sampling/sampling_logp_difference/mean": 0.019496317952871323, "step": 41, "step_time": 26.19419524195837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.1925763040781021, "epoch": 0.042, "frac_reward_zero_std": 0.0, "grad_norm": 0.6590908169746399, "kl": 0.0009437088738195598, "learning_rate": 4.998413625484095e-06, "loss": 0.0197, "num_tokens": 118321.0, "reward": 0.45500001311302185, "reward_std": 0.6312157511711121, "rewards/reward_func/mean": 0.45500001311302185, "rewards/reward_func/std": 0.6312157511711121, "sampling/importance_sampling_ratio/max": 0.7790407538414001, "sampling/importance_sampling_ratio/mean": 0.5937222242355347, "sampling/importance_sampling_ratio/min": 0.38133394718170166, "sampling/sampling_logp_difference/max": 0.3181338310241699, "sampling/sampling_logp_difference/mean": 0.015428215265274048, "step": 42, "step_time": 25.28124860598473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.16625115275382996, "epoch": 0.043, "frac_reward_zero_std": 0.0, "grad_norm": 0.6327119469642639, "kl": 0.0006698658689856529, "learning_rate": 4.9981121211777e-06, "loss": 0.1213, "num_tokens": 121199.0, "reward": 0.22750000655651093, "reward_std": 0.5155822038650513, "rewards/reward_func/mean": 0.22750000655651093, "rewards/reward_func/std": 0.5155822038650513, "sampling/importance_sampling_ratio/max": 0.9043940901756287, "sampling/importance_sampling_ratio/mean": 0.7394412159919739, "sampling/importance_sampling_ratio/min": 0.5218236446380615, "sampling/sampling_logp_difference/max": 0.5116550922393799, "sampling/sampling_logp_difference/mean": 0.010956826619803905, "step": 43, "step_time": 26.191963229037356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.21497398614883423, "epoch": 0.044, "frac_reward_zero_std": 0.0, "grad_norm": 2.5146665573120117, "kl": 0.001651956234127283, "learning_rate": 4.997784412857239e-06, "loss": -0.7275, "num_tokens": 124708.0, "reward": 0.4975000023841858, "reward_std": 0.5802513957023621, "rewards/reward_func/mean": 0.4975000023841858, "rewards/reward_func/std": 0.5802513957023621, "sampling/importance_sampling_ratio/max": 2.5137343406677246, "sampling/importance_sampling_ratio/mean": 1.6056748628616333, "sampling/importance_sampling_ratio/min": 0.775516927242279, "sampling/sampling_logp_difference/max": 0.7024750709533691, "sampling/sampling_logp_difference/mean": 0.027263030409812927, "step": 44, "step_time": 30.708443564013578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.2064676135778427, "epoch": 0.045, "frac_reward_zero_std": 0.0, "grad_norm": 0.78768390417099, "kl": 0.0010001033078879118, "learning_rate": 4.99743050396022e-06, "loss": -0.0239, "num_tokens": 127107.0, "reward": 0.20000001788139343, "reward_std": 0.5147815346717834, "rewards/reward_func/mean": 0.20000001788139343, "rewards/reward_func/std": 0.5147814750671387, "sampling/importance_sampling_ratio/max": 1.0544236898422241, "sampling/importance_sampling_ratio/mean": 0.8732937574386597, "sampling/importance_sampling_ratio/min": 0.4507087171077728, "sampling/sampling_logp_difference/max": 0.454801082611084, "sampling/sampling_logp_difference/mean": 0.018568003550171852, "step": 45, "step_time": 30.19020549702691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.21479113399982452, "epoch": 0.046, "frac_reward_zero_std": 0.0, "grad_norm": 1.1975597143173218, "kl": 0.0035949531011283398, "learning_rate": 4.997050398198977e-06, "loss": 0.1127, "num_tokens": 129712.0, "reward": 0.45750001072883606, "reward_std": 0.5987974405288696, "rewards/reward_func/mean": 0.45750001072883606, "rewards/reward_func/std": 0.5987974405288696, "sampling/importance_sampling_ratio/max": 1.4185817241668701, "sampling/importance_sampling_ratio/mean": 0.8823011517524719, "sampling/importance_sampling_ratio/min": 0.3842265009880066, "sampling/sampling_logp_difference/max": 0.4681262969970703, "sampling/sampling_logp_difference/mean": 0.015924835577607155, "step": 46, "step_time": 27.530584376014303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.15177828073501587, "epoch": 0.047, "frac_reward_zero_std": 0.0, "grad_norm": 0.7548965215682983, "kl": 0.001141039072535932, "learning_rate": 4.9966440995606415e-06, "loss": -0.0115, "num_tokens": 132538.0, "reward": 0.45500001311302185, "reward_std": 0.6242061853408813, "rewards/reward_func/mean": 0.45500001311302185, "rewards/reward_func/std": 0.6242061257362366, "sampling/importance_sampling_ratio/max": 1.2547972202301025, "sampling/importance_sampling_ratio/mean": 0.9954470992088318, "sampling/importance_sampling_ratio/min": 0.6697695255279541, "sampling/sampling_logp_difference/max": 0.2353825569152832, "sampling/sampling_logp_difference/mean": 0.010764792561531067, "step": 47, "step_time": 22.281028501980472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.14704622328281403, "epoch": 0.048, "frac_reward_zero_std": 0.0, "grad_norm": 0.6075945496559143, "kl": 0.0006156707531772554, "learning_rate": 4.9962116123070925e-06, "loss": -0.1026, "num_tokens": 135628.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 1.010442852973938, "sampling/importance_sampling_ratio/mean": 0.8380812406539917, "sampling/importance_sampling_ratio/min": 0.6370492577552795, "sampling/sampling_logp_difference/max": 0.3473479747772217, "sampling/sampling_logp_difference/mean": 0.014985868707299232, "step": 48, "step_time": 21.22760385699803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.1836264580488205, "epoch": 0.049, "frac_reward_zero_std": 0.0, "grad_norm": 2.2631165981292725, "kl": 0.003190340008586645, "learning_rate": 4.9957529409749185e-06, "loss": -0.5156, "num_tokens": 138252.0, "reward": 0.4675000011920929, "reward_std": 0.6096104979515076, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.6096105575561523, "sampling/importance_sampling_ratio/max": 2.915334701538086, "sampling/importance_sampling_ratio/mean": 1.6397058963775635, "sampling/importance_sampling_ratio/min": 0.5036308765411377, "sampling/sampling_logp_difference/max": 0.5744954347610474, "sampling/sampling_logp_difference/mean": 0.0214998796582222, "step": 49, "step_time": 23.344489980023354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.16471697390079498, "epoch": 0.05, "frac_reward_zero_std": 0.0, "grad_norm": 0.7013834714889526, "kl": 0.0007354745175689459, "learning_rate": 4.995268090375362e-06, "loss": -0.0902, "num_tokens": 141347.0, "reward": 0.48000001907348633, "reward_std": 0.6009436845779419, "rewards/reward_func/mean": 0.48000001907348633, "rewards/reward_func/std": 0.6009436845779419, "sampling/importance_sampling_ratio/max": 1.1693246364593506, "sampling/importance_sampling_ratio/mean": 0.8908734321594238, "sampling/importance_sampling_ratio/min": 0.5056505799293518, "sampling/sampling_logp_difference/max": 0.3043835163116455, "sampling/sampling_logp_difference/mean": 0.013043339364230633, "step": 50, "step_time": 25.42511347203981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 57.5, "completions/mean_terminated_length": 57.5, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.1841271072626114, "epoch": 0.051, "frac_reward_zero_std": 0.0, "grad_norm": 0.6301296353340149, "kl": 0.0012232137378305197, "learning_rate": 4.99475706559428e-06, "loss": 0.2905, "num_tokens": 144262.0, "reward": 0.20000000298023224, "reward_std": 0.5282676219940186, "rewards/reward_func/mean": 0.20000000298023224, "rewards/reward_func/std": 0.5282676219940186, "sampling/importance_sampling_ratio/max": 1.2973392009735107, "sampling/importance_sampling_ratio/mean": 0.8705179691314697, "sampling/importance_sampling_ratio/min": 0.25880172848701477, "sampling/sampling_logp_difference/max": 0.6381580829620361, "sampling/sampling_logp_difference/mean": 0.015857556834816933, "step": 51, "step_time": 30.46159318694845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.18686933815479279, "epoch": 0.052, "frac_reward_zero_std": 0.0, "grad_norm": 0.848724901676178, "kl": 0.006351604126393795, "learning_rate": 4.994219871992077e-06, "loss": -0.2645, "num_tokens": 147200.0, "reward": 0.22500000894069672, "reward_std": 0.5100653171539307, "rewards/reward_func/mean": 0.22500000894069672, "rewards/reward_func/std": 0.5100653171539307, "sampling/importance_sampling_ratio/max": 1.0454251766204834, "sampling/importance_sampling_ratio/mean": 0.5861590504646301, "sampling/importance_sampling_ratio/min": 0.2766340374946594, "sampling/sampling_logp_difference/max": 0.5210893154144287, "sampling/sampling_logp_difference/mean": 0.024404888972640038, "step": 52, "step_time": 28.86923932802165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.18316607177257538, "epoch": 0.053, "frac_reward_zero_std": 0.0, "grad_norm": 1.149577260017395, "kl": 0.0012563634663820267, "learning_rate": 4.993656515203662e-06, "loss": 0.1632, "num_tokens": 150395.0, "reward": 0.7400000095367432, "reward_std": 0.5199999809265137, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5200000405311584, "sampling/importance_sampling_ratio/max": 1.133528470993042, "sampling/importance_sampling_ratio/mean": 0.841528058052063, "sampling/importance_sampling_ratio/min": 0.4379132390022278, "sampling/sampling_logp_difference/max": 0.5306804180145264, "sampling/sampling_logp_difference/mean": 0.0160613264888525, "step": 53, "step_time": 19.896256964013446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.19266362488269806, "epoch": 0.054, "frac_reward_zero_std": 0.0, "grad_norm": 0.4872373938560486, "kl": 0.003327696118503809, "learning_rate": 4.99306700113838e-06, "loss": -0.075, "num_tokens": 153401.0, "reward": 0.4625000059604645, "reward_std": 0.6175422072410583, "rewards/reward_func/mean": 0.4625000059604645, "rewards/reward_func/std": 0.6175422072410583, "sampling/importance_sampling_ratio/max": 0.4976256489753723, "sampling/importance_sampling_ratio/mean": 0.29365402460098267, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5140385627746582, "sampling/sampling_logp_difference/mean": 0.025473102927207947, "step": 54, "step_time": 31.636155979998875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.20710141956806183, "epoch": 0.055, "frac_reward_zero_std": 0.0, "grad_norm": 1.149296760559082, "kl": 0.0011379503412172198, "learning_rate": 4.9924513359799555e-06, "loss": -0.1354, "num_tokens": 156173.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 1.4572755098342896, "sampling/importance_sampling_ratio/mean": 0.9767011404037476, "sampling/importance_sampling_ratio/min": 0.5357866287231445, "sampling/sampling_logp_difference/max": 0.6271877288818359, "sampling/sampling_logp_difference/mean": 0.02084926702082157, "step": 55, "step_time": 17.177234951988794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.21721361577510834, "epoch": 0.056, "frac_reward_zero_std": 0.0, "grad_norm": 1.5740578174591064, "kl": 0.0007997292559593916, "learning_rate": 4.991809526186424e-06, "loss": -0.0199, "num_tokens": 158618.0, "reward": 0.7450000047683716, "reward_std": 0.5033554434776306, "rewards/reward_func/mean": 0.7450000047683716, "rewards/reward_func/std": 0.5033553838729858, "sampling/importance_sampling_ratio/max": 1.350252628326416, "sampling/importance_sampling_ratio/mean": 1.130388617515564, "sampling/importance_sampling_ratio/min": 0.731552243232727, "sampling/sampling_logp_difference/max": 0.4004800319671631, "sampling/sampling_logp_difference/mean": 0.016493387520313263, "step": 56, "step_time": 12.045671095023863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1513984054327011, "epoch": 0.057, "frac_reward_zero_std": 0.0, "grad_norm": 1.7655246257781982, "kl": 0.002159195253625512, "learning_rate": 4.991141578490066e-06, "loss": 0.2972, "num_tokens": 161834.0, "reward": 0.4675000011920929, "reward_std": 0.6152167916297913, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.6152167916297913, "sampling/importance_sampling_ratio/max": 2.2511866092681885, "sampling/importance_sampling_ratio/mean": 1.2370312213897705, "sampling/importance_sampling_ratio/min": 0.5680360198020935, "sampling/sampling_logp_difference/max": 0.6727904081344604, "sampling/sampling_logp_difference/mean": 0.018129004165530205, "step": 57, "step_time": 27.834994815988466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.19728638231754303, "epoch": 0.058, "frac_reward_zero_std": 0.0, "grad_norm": 0.9852524995803833, "kl": 0.001406394992955029, "learning_rate": 4.990447499897339e-06, "loss": 0.0504, "num_tokens": 164513.0, "reward": 0.2224999964237213, "reward_std": 0.5209206938743591, "rewards/reward_func/mean": 0.2224999964237213, "rewards/reward_func/std": 0.5209206938743591, "sampling/importance_sampling_ratio/max": 1.1135812997817993, "sampling/importance_sampling_ratio/mean": 0.873077929019928, "sampling/importance_sampling_ratio/min": 0.7462978363037109, "sampling/sampling_logp_difference/max": 0.24789834022521973, "sampling/sampling_logp_difference/mean": 0.012925262562930584, "step": 58, "step_time": 27.526946430967655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.18410339951515198, "epoch": 0.059, "frac_reward_zero_std": 0.0, "grad_norm": 0.8322009444236755, "kl": 0.0017056845827028155, "learning_rate": 4.989727297688797e-06, "loss": 0.0509, "num_tokens": 167678.0, "reward": 0.4925000071525574, "reward_std": 0.5860247015953064, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5860247015953064, "sampling/importance_sampling_ratio/max": 1.188061237335205, "sampling/importance_sampling_ratio/mean": 0.75825035572052, "sampling/importance_sampling_ratio/min": 0.4281614124774933, "sampling/sampling_logp_difference/max": 0.7175836563110352, "sampling/sampling_logp_difference/mean": 0.02080303058028221, "step": 59, "step_time": 30.796453499002382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.16519904136657715, "epoch": 0.06, "frac_reward_zero_std": 0.0, "grad_norm": 0.9648726582527161, "kl": 0.0009010934154503047, "learning_rate": 4.98898097941902e-06, "loss": -0.0568, "num_tokens": 170221.0, "reward": 0.7475000023841858, "reward_std": 0.4983556270599365, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.4983556270599365, "sampling/importance_sampling_ratio/max": 1.946204423904419, "sampling/importance_sampling_ratio/mean": 1.0818874835968018, "sampling/importance_sampling_ratio/min": 0.33341312408447266, "sampling/sampling_logp_difference/max": 0.5747513771057129, "sampling/sampling_logp_difference/mean": 0.014254895970225334, "step": 60, "step_time": 16.04147014999762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 57.75, "completions/mean_terminated_length": 57.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.22097966074943542, "epoch": 0.061, "frac_reward_zero_std": 0.0, "grad_norm": 1.0013903379440308, "kl": 0.0009198183543048799, "learning_rate": 4.988208552916535e-06, "loss": 0.199, "num_tokens": 172687.0, "reward": 0.4399999976158142, "reward_std": 0.6408848166465759, "rewards/reward_func/mean": 0.4399999976158142, "rewards/reward_func/std": 0.6408848166465759, "sampling/importance_sampling_ratio/max": 1.5965169668197632, "sampling/importance_sampling_ratio/mean": 1.2557064294815063, "sampling/importance_sampling_ratio/min": 0.8887098431587219, "sampling/sampling_logp_difference/max": 0.34328269958496094, "sampling/sampling_logp_difference/mean": 0.014658640138804913, "step": 61, "step_time": 23.913908172980882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.23701642453670502, "epoch": 0.062, "frac_reward_zero_std": 0.0, "grad_norm": 1.1638054847717285, "kl": 0.001260383171029389, "learning_rate": 4.98741002628373e-06, "loss": -0.0259, "num_tokens": 175422.0, "reward": 0.48500001430511475, "reward_std": 0.5948948860168457, "rewards/reward_func/mean": 0.48500001430511475, "rewards/reward_func/std": 0.5948949456214905, "sampling/importance_sampling_ratio/max": 1.1163086891174316, "sampling/importance_sampling_ratio/mean": 0.9336670637130737, "sampling/importance_sampling_ratio/min": 0.7791392207145691, "sampling/sampling_logp_difference/max": 0.4493459463119507, "sampling/sampling_logp_difference/mean": 0.01823272742331028, "step": 62, "step_time": 20.333050184999593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.2120150476694107, "epoch": 0.063, "frac_reward_zero_std": 0.0, "grad_norm": 0.9452406764030457, "kl": 0.002651732647791505, "learning_rate": 4.9865854078967715e-06, "loss": -0.2391, "num_tokens": 178057.0, "reward": 0.7275000214576721, "reward_std": 0.518676221370697, "rewards/reward_func/mean": 0.7275000214576721, "rewards/reward_func/std": 0.518676221370697, "sampling/importance_sampling_ratio/max": 2.074232578277588, "sampling/importance_sampling_ratio/mean": 1.0131702423095703, "sampling/importance_sampling_ratio/min": 0.6011484265327454, "sampling/sampling_logp_difference/max": 0.5899467468261719, "sampling/sampling_logp_difference/mean": 0.023687591776251793, "step": 63, "step_time": 23.110207741963677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.1633850336074829, "epoch": 0.064, "frac_reward_zero_std": 0.0, "grad_norm": 2.0651257038116455, "kl": 0.0009649393032304943, "learning_rate": 4.985734706405516e-06, "loss": -0.0381, "num_tokens": 181165.0, "reward": 0.17250001430511475, "reward_std": 0.5539178848266602, "rewards/reward_func/mean": 0.17250001430511475, "rewards/reward_func/std": 0.5539178848266602, "sampling/importance_sampling_ratio/max": 1.7304970026016235, "sampling/importance_sampling_ratio/mean": 1.3624366521835327, "sampling/importance_sampling_ratio/min": 0.8324790000915527, "sampling/sampling_logp_difference/max": 0.3183094263076782, "sampling/sampling_logp_difference/mean": 0.01424376666545868, "step": 64, "step_time": 30.054957942047622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.15541672706604004, "epoch": 0.065, "frac_reward_zero_std": 0.0, "grad_norm": 1.1293599605560303, "kl": 0.0013445726362988353, "learning_rate": 4.9848579307334195e-06, "loss": -0.0138, "num_tokens": 184433.0, "reward": 0.2150000035762787, "reward_std": 0.5274782776832581, "rewards/reward_func/mean": 0.2150000035762787, "rewards/reward_func/std": 0.5274782776832581, "sampling/importance_sampling_ratio/max": 1.3569507598876953, "sampling/importance_sampling_ratio/mean": 0.9434778690338135, "sampling/importance_sampling_ratio/min": 0.6079815030097961, "sampling/sampling_logp_difference/max": 0.33409833908081055, "sampling/sampling_logp_difference/mean": 0.014387022703886032, "step": 65, "step_time": 34.34038391697686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.19945991039276123, "epoch": 0.066, "frac_reward_zero_std": 0.0, "grad_norm": 0.8952962160110474, "kl": 0.00123073800932616, "learning_rate": 4.983955090077445e-06, "loss": -0.0551, "num_tokens": 186965.0, "reward": 0.18250000476837158, "reward_std": 0.5473801493644714, "rewards/reward_func/mean": 0.18250000476837158, "rewards/reward_func/std": 0.5473801493644714, "sampling/importance_sampling_ratio/max": 0.8925302624702454, "sampling/importance_sampling_ratio/mean": 0.779961109161377, "sampling/importance_sampling_ratio/min": 0.6194220185279846, "sampling/sampling_logp_difference/max": 0.29934167861938477, "sampling/sampling_logp_difference/mean": 0.01775932125747204, "step": 66, "step_time": 24.28925737796817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.14749397337436676, "epoch": 0.067, "frac_reward_zero_std": 0.0, "grad_norm": 0.8485484719276428, "kl": 0.0005320626078173518, "learning_rate": 4.983026193907962e-06, "loss": -0.1738, "num_tokens": 189831.0, "reward": 0.45500001311302185, "reward_std": 0.6319019198417664, "rewards/reward_func/mean": 0.45500001311302185, "rewards/reward_func/std": 0.6319018602371216, "sampling/importance_sampling_ratio/max": 1.3435710668563843, "sampling/importance_sampling_ratio/mean": 1.0019091367721558, "sampling/importance_sampling_ratio/min": 0.497560977935791, "sampling/sampling_logp_difference/max": 0.2902107238769531, "sampling/sampling_logp_difference/mean": 0.010316692292690277, "step": 67, "step_time": 25.627289718016982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.20792441070079803, "epoch": 0.068, "frac_reward_zero_std": 0.0, "grad_norm": 1.1592576503753662, "kl": 0.0019044190412387252, "learning_rate": 4.982071251968653e-06, "loss": -0.1445, "num_tokens": 192559.0, "reward": 0.2199999988079071, "reward_std": 0.5137444138526917, "rewards/reward_func/mean": 0.2199999988079071, "rewards/reward_func/std": 0.5137444734573364, "sampling/importance_sampling_ratio/max": 1.0615043640136719, "sampling/importance_sampling_ratio/mean": 0.6696117520332336, "sampling/importance_sampling_ratio/min": 0.3953132629394531, "sampling/sampling_logp_difference/max": 0.2613469958305359, "sampling/sampling_logp_difference/mean": 0.01951698027551174, "step": 68, "step_time": 25.178531533980276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.21523474156856537, "epoch": 0.069, "frac_reward_zero_std": 0.0, "grad_norm": 1.128862977027893, "kl": 0.0025856378488242626, "learning_rate": 4.981090274276406e-06, "loss": 0.2825, "num_tokens": 195423.0, "reward": 0.21000000834465027, "reward_std": 0.5302829146385193, "rewards/reward_func/mean": 0.21000000834465027, "rewards/reward_func/std": 0.5302829146385193, "sampling/importance_sampling_ratio/max": 1.4800848960876465, "sampling/importance_sampling_ratio/mean": 1.0121972560882568, "sampling/importance_sampling_ratio/min": 0.41341984272003174, "sampling/sampling_logp_difference/max": 0.6674971580505371, "sampling/sampling_logp_difference/mean": 0.01932966522872448, "step": 69, "step_time": 29.600733365979977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.19063298404216766, "epoch": 0.07, "frac_reward_zero_std": 0.0, "grad_norm": 0.8831535577774048, "kl": 0.0012035000836476684, "learning_rate": 4.980083271121215e-06, "loss": -0.0351, "num_tokens": 198196.0, "reward": 0.4750000238418579, "reward_std": 0.6008049845695496, "rewards/reward_func/mean": 0.4750000238418579, "rewards/reward_func/std": 0.6008050441741943, "sampling/importance_sampling_ratio/max": 1.550776481628418, "sampling/importance_sampling_ratio/mean": 1.093490719795227, "sampling/importance_sampling_ratio/min": 0.7317591309547424, "sampling/sampling_logp_difference/max": 0.2983529567718506, "sampling/sampling_logp_difference/mean": 0.01806030422449112, "step": 70, "step_time": 22.614920030988287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.1471492499113083, "epoch": 0.071, "frac_reward_zero_std": 0.0, "grad_norm": 0.8415653705596924, "kl": 0.001774770556949079, "learning_rate": 4.979050253066064e-06, "loss": 0.1344, "num_tokens": 201574.0, "reward": 0.20749999582767487, "reward_std": 0.5343766212463379, "rewards/reward_func/mean": 0.20749999582767487, "rewards/reward_func/std": 0.5343766212463379, "sampling/importance_sampling_ratio/max": 1.2315179109573364, "sampling/importance_sampling_ratio/mean": 0.8319957256317139, "sampling/importance_sampling_ratio/min": 0.37346959114074707, "sampling/sampling_logp_difference/max": 0.3038468360900879, "sampling/sampling_logp_difference/mean": 0.01399692241102457, "step": 71, "step_time": 32.291977098968346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.2548491060733795, "epoch": 0.072, "frac_reward_zero_std": 0.0, "grad_norm": 0.9007646441459656, "kl": 0.001842888887040317, "learning_rate": 4.977991230946824e-06, "loss": -0.1085, "num_tokens": 204193.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5773502588272095, "sampling/importance_sampling_ratio/max": 1.322192668914795, "sampling/importance_sampling_ratio/mean": 0.6403644680976868, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0362257957458496, "sampling/sampling_logp_difference/mean": 0.02476266585290432, "step": 72, "step_time": 21.036906433990225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1888900250196457, "epoch": 0.073, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625698566436768, "kl": 0.001492207869887352, "learning_rate": 4.976906215872137e-06, "loss": 0.603, "num_tokens": 207373.0, "reward": 0.47749999165534973, "reward_std": 0.5977945327758789, "rewards/reward_func/mean": 0.47749999165534973, "rewards/reward_func/std": 0.5977945923805237, "sampling/importance_sampling_ratio/max": 2.7633776664733887, "sampling/importance_sampling_ratio/mean": 1.0478754043579102, "sampling/importance_sampling_ratio/min": 0.4228649139404297, "sampling/sampling_logp_difference/max": 0.43424272537231445, "sampling/sampling_logp_difference/mean": 0.02459087036550045, "step": 73, "step_time": 25.471207298978698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.18793362379074097, "epoch": 0.074, "frac_reward_zero_std": 0.0, "grad_norm": 1.508971929550171, "kl": 0.0018537156283855438, "learning_rate": 4.975795219223299e-06, "loss": -0.2752, "num_tokens": 210184.0, "reward": 0.48249998688697815, "reward_std": 0.5976830124855042, "rewards/reward_func/mean": 0.48249998688697815, "rewards/reward_func/std": 0.5976830720901489, "sampling/importance_sampling_ratio/max": 2.1085121631622314, "sampling/importance_sampling_ratio/mean": 1.0750091075897217, "sampling/importance_sampling_ratio/min": 0.4717519283294678, "sampling/sampling_logp_difference/max": 0.29877281188964844, "sampling/sampling_logp_difference/mean": 0.018414776772260666, "step": 74, "step_time": 25.304523543978576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.20891088247299194, "epoch": 0.075, "frac_reward_zero_std": 0.0, "grad_norm": 1.2454321384429932, "kl": 0.0012727654539048672, "learning_rate": 4.974658252654135e-06, "loss": -0.257, "num_tokens": 212856.0, "reward": 0.4375, "reward_std": 0.644586443901062, "rewards/reward_func/mean": 0.4375, "rewards/reward_func/std": 0.644586443901062, "sampling/importance_sampling_ratio/max": 1.3624351024627686, "sampling/importance_sampling_ratio/mean": 1.0055991411209106, "sampling/importance_sampling_ratio/min": 0.42678892612457275, "sampling/sampling_logp_difference/max": 0.3524940013885498, "sampling/sampling_logp_difference/mean": 0.018451165407896042, "step": 75, "step_time": 24.537633324973285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.23340889811515808, "epoch": 0.076, "frac_reward_zero_std": 0.0, "grad_norm": 1.38158118724823, "kl": 0.008782871998846531, "learning_rate": 4.973495328090891e-06, "loss": 0.0165, "num_tokens": 215244.0, "reward": 0.7325000166893005, "reward_std": 0.5217518210411072, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.521751880645752, "sampling/importance_sampling_ratio/max": 1.4231406450271606, "sampling/importance_sampling_ratio/mean": 1.1423513889312744, "sampling/importance_sampling_ratio/min": 0.8611574172973633, "sampling/sampling_logp_difference/max": 0.42988771200180054, "sampling/sampling_logp_difference/mean": 0.01642424799501896, "step": 76, "step_time": 18.15869120496791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.17539867758750916, "epoch": 0.077, "frac_reward_zero_std": 0.0, "grad_norm": 0.9459503293037415, "kl": 0.0010257470421493053, "learning_rate": 4.972306457732091e-06, "loss": 0.2077, "num_tokens": 217792.0, "reward": 0.19249999523162842, "reward_std": 0.533565104007721, "rewards/reward_func/mean": 0.19249999523162842, "rewards/reward_func/std": 0.533565104007721, "sampling/importance_sampling_ratio/max": 1.1146260499954224, "sampling/importance_sampling_ratio/mean": 0.9004780650138855, "sampling/importance_sampling_ratio/min": 0.6381573677062988, "sampling/sampling_logp_difference/max": 0.4264984130859375, "sampling/sampling_logp_difference/mean": 0.016182271763682365, "step": 77, "step_time": 31.208338773983996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.19025379419326782, "epoch": 0.078, "frac_reward_zero_std": 0.0, "grad_norm": 1.831222653388977, "kl": 0.0025972435250878334, "learning_rate": 4.971091654048427e-06, "loss": 0.1881, "num_tokens": 221116.0, "reward": 0.737500011920929, "reward_std": 0.511753499507904, "rewards/reward_func/mean": 0.737500011920929, "rewards/reward_func/std": 0.511753499507904, "sampling/importance_sampling_ratio/max": 1.188819408416748, "sampling/importance_sampling_ratio/mean": 0.7440864443778992, "sampling/importance_sampling_ratio/min": 0.2582259178161621, "sampling/sampling_logp_difference/max": 0.33910036087036133, "sampling/sampling_logp_difference/mean": 0.020248090848326683, "step": 78, "step_time": 30.436808540020138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.18020538985729218, "epoch": 0.079, "frac_reward_zero_std": 0.0, "grad_norm": 0.8101328611373901, "kl": 0.0009451335063204169, "learning_rate": 4.96985092978261e-06, "loss": 0.1431, "num_tokens": 223580.0, "reward": 0.7400000095367432, "reward_std": 0.5199999809265137, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5200000405311584, "sampling/importance_sampling_ratio/max": 0.8986702561378479, "sampling/importance_sampling_ratio/mean": 0.7042682766914368, "sampling/importance_sampling_ratio/min": 0.49243295192718506, "sampling/sampling_logp_difference/max": 0.4497513771057129, "sampling/sampling_logp_difference/mean": 0.019172092899680138, "step": 79, "step_time": 15.142612004012335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.17248542606830597, "epoch": 0.08, "frac_reward_zero_std": 0.0, "grad_norm": 2.111847162246704, "kl": 0.002328401431441307, "learning_rate": 4.968584297949255e-06, "loss": 0.3656, "num_tokens": 226495.0, "reward": 0.24250000715255737, "reward_std": 0.5051980018615723, "rewards/reward_func/mean": 0.24250000715255737, "rewards/reward_func/std": 0.5051980018615723, "sampling/importance_sampling_ratio/max": 2.5861730575561523, "sampling/importance_sampling_ratio/mean": 1.5023744106292725, "sampling/importance_sampling_ratio/min": 0.8174070119857788, "sampling/sampling_logp_difference/max": 0.35844993591308594, "sampling/sampling_logp_difference/mean": 0.01990363374352455, "step": 80, "step_time": 23.18481891101692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.18824531137943268, "epoch": 0.081, "frac_reward_zero_std": 0.0, "grad_norm": 0.9466284513473511, "kl": 0.004662039689719677, "learning_rate": 4.967291771834727e-06, "loss": 0.1046, "num_tokens": 229393.0, "reward": 0.2175000011920929, "reward_std": 0.5103185176849365, "rewards/reward_func/mean": 0.2175000011920929, "rewards/reward_func/std": 0.5103185772895813, "sampling/importance_sampling_ratio/max": 1.0310555696487427, "sampling/importance_sampling_ratio/mean": 0.7246109843254089, "sampling/importance_sampling_ratio/min": 0.5019353628158569, "sampling/sampling_logp_difference/max": 0.5024052858352661, "sampling/sampling_logp_difference/mean": 0.013435584492981434, "step": 81, "step_time": 27.394535820989404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 60.25, "completions/mean_terminated_length": 60.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.19830913841724396, "epoch": 0.082, "frac_reward_zero_std": 0.0, "grad_norm": 0.7175694704055786, "kl": 0.0016878106398507953, "learning_rate": 4.965973364997015e-06, "loss": 0.1584, "num_tokens": 232316.0, "reward": 0.47999998927116394, "reward_std": 0.6009436845779419, "rewards/reward_func/mean": 0.47999998927116394, "rewards/reward_func/std": 0.6009436845779419, "sampling/importance_sampling_ratio/max": 0.9893266558647156, "sampling/importance_sampling_ratio/mean": 0.7159191370010376, "sampling/importance_sampling_ratio/min": 0.48449647426605225, "sampling/sampling_logp_difference/max": 0.353057861328125, "sampling/sampling_logp_difference/mean": 0.016375279054045677, "step": 82, "step_time": 24.000578142004088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.2068842500448227, "epoch": 0.083, "frac_reward_zero_std": 0.0, "grad_norm": 1.5572787523269653, "kl": 0.0030831273179501295, "learning_rate": 4.964629091265583e-06, "loss": -0.0832, "num_tokens": 235059.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 1.7095023393630981, "sampling/importance_sampling_ratio/mean": 1.261395812034607, "sampling/importance_sampling_ratio/min": 0.795553982257843, "sampling/sampling_logp_difference/max": 0.3227384090423584, "sampling/sampling_logp_difference/mean": 0.02054617740213871, "step": 83, "step_time": 15.635594933002722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.18001852929592133, "epoch": 0.084, "frac_reward_zero_std": 0.0, "grad_norm": 0.7639207243919373, "kl": 0.0012803806457668543, "learning_rate": 4.963258964741227e-06, "loss": -0.0943, "num_tokens": 237679.0, "reward": 0.7350000143051147, "reward_std": 0.49047595262527466, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.49047595262527466, "sampling/importance_sampling_ratio/max": 1.057940125465393, "sampling/importance_sampling_ratio/mean": 0.9070847034454346, "sampling/importance_sampling_ratio/min": 0.7359876036643982, "sampling/sampling_logp_difference/max": 0.3406977653503418, "sampling/sampling_logp_difference/mean": 0.011335974559187889, "step": 84, "step_time": 22.35556443000678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.25080421566963196, "epoch": 0.085, "frac_reward_zero_std": 0.0, "grad_norm": 1.5009522438049316, "kl": 0.0018894955283030868, "learning_rate": 4.961862999795923e-06, "loss": -0.0785, "num_tokens": 240234.0, "reward": 0.7400000095367432, "reward_std": 0.4934234321117401, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.4934234619140625, "sampling/importance_sampling_ratio/max": 1.9614378213882446, "sampling/importance_sampling_ratio/mean": 1.3484985828399658, "sampling/importance_sampling_ratio/min": 0.7529999017715454, "sampling/sampling_logp_difference/max": 0.2998523712158203, "sampling/sampling_logp_difference/mean": 0.018537577241659164, "step": 85, "step_time": 17.208555177028757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.17496506869792938, "epoch": 0.086, "frac_reward_zero_std": 0.0, "grad_norm": 1.0767855644226074, "kl": 0.0010021874913945794, "learning_rate": 4.960441211072686e-06, "loss": -0.1408, "num_tokens": 242994.0, "reward": 0.4724999964237213, "reward_std": 0.6091181635856628, "rewards/reward_func/mean": 0.4724999964237213, "rewards/reward_func/std": 0.6091182231903076, "sampling/importance_sampling_ratio/max": 1.1346582174301147, "sampling/importance_sampling_ratio/mean": 0.9156138896942139, "sampling/importance_sampling_ratio/min": 0.7800959348678589, "sampling/sampling_logp_difference/max": 0.2250080704689026, "sampling/sampling_logp_difference/mean": 0.013982197269797325, "step": 86, "step_time": 21.610419663018547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.1998775154352188, "epoch": 0.087, "frac_reward_zero_std": 0.0, "grad_norm": 1.9723621606826782, "kl": 0.0023004813119769096, "learning_rate": 4.958993613485406e-06, "loss": -0.1847, "num_tokens": 245681.0, "reward": 0.4674999713897705, "reward_std": 0.6152167916297913, "rewards/reward_func/mean": 0.4674999713897705, "rewards/reward_func/std": 0.6152167916297913, "sampling/importance_sampling_ratio/max": 2.5097455978393555, "sampling/importance_sampling_ratio/mean": 1.5100226402282715, "sampling/importance_sampling_ratio/min": 0.7366971969604492, "sampling/sampling_logp_difference/max": 0.48386502265930176, "sampling/sampling_logp_difference/mean": 0.021351447328925133, "step": 87, "step_time": 21.223093150998466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.19157151877880096, "epoch": 0.088, "frac_reward_zero_std": 0.0, "grad_norm": 1.1788418292999268, "kl": 0.0008473507477901876, "learning_rate": 4.957520222218695e-06, "loss": -0.1798, "num_tokens": 248600.0, "reward": 0.48000001907348633, "reward_std": 0.5663921236991882, "rewards/reward_func/mean": 0.48000001907348633, "rewards/reward_func/std": 0.5663921236991882, "sampling/importance_sampling_ratio/max": 1.2281941175460815, "sampling/importance_sampling_ratio/mean": 0.9877327680587769, "sampling/importance_sampling_ratio/min": 0.8069263696670532, "sampling/sampling_logp_difference/max": 0.574695348739624, "sampling/sampling_logp_difference/mean": 0.015736455097794533, "step": 88, "step_time": 26.748015739023685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.2343548834323883, "epoch": 0.089, "frac_reward_zero_std": 0.0, "grad_norm": 1.8951774835586548, "kl": 0.002801395719870925, "learning_rate": 4.956021052727731e-06, "loss": 0.4673, "num_tokens": 251348.0, "reward": 0.48500001430511475, "reward_std": 0.5948949456214905, "rewards/reward_func/mean": 0.48500001430511475, "rewards/reward_func/std": 0.5948949456214905, "sampling/importance_sampling_ratio/max": 2.407419204711914, "sampling/importance_sampling_ratio/mean": 1.3294519186019897, "sampling/importance_sampling_ratio/min": 0.7976511716842651, "sampling/sampling_logp_difference/max": 0.6647517681121826, "sampling/sampling_logp_difference/mean": 0.023183371871709824, "step": 89, "step_time": 22.024159724009223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 57.5, "completions/mean_terminated_length": 57.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.20605750381946564, "epoch": 0.09, "frac_reward_zero_std": 0.0, "grad_norm": 2.48577880859375, "kl": 0.0008850714657455683, "learning_rate": 4.954496120738094e-06, "loss": 0.0909, "num_tokens": 253927.0, "reward": 0.7350000143051147, "reward_std": 0.5233545899391174, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.5233545899391174, "sampling/importance_sampling_ratio/max": 1.0736109018325806, "sampling/importance_sampling_ratio/mean": 0.9583907723426819, "sampling/importance_sampling_ratio/min": 0.8531460762023926, "sampling/sampling_logp_difference/max": 0.2530559301376343, "sampling/sampling_logp_difference/mean": 0.009179707616567612, "step": 90, "step_time": 21.953675083001144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.16737303137779236, "epoch": 0.091, "frac_reward_zero_std": 0.0, "grad_norm": 1.6307145357131958, "kl": 0.0016736111138015985, "learning_rate": 4.952945442245598e-06, "loss": 0.0807, "num_tokens": 256877.0, "reward": 0.24500000476837158, "reward_std": 0.5034216046333313, "rewards/reward_func/mean": 0.24500000476837158, "rewards/reward_func/std": 0.5034216642379761, "sampling/importance_sampling_ratio/max": 1.270249843597412, "sampling/importance_sampling_ratio/mean": 1.1158859729766846, "sampling/importance_sampling_ratio/min": 0.9899454712867737, "sampling/sampling_logp_difference/max": 0.2251741886138916, "sampling/sampling_logp_difference/mean": 0.009074542671442032, "step": 91, "step_time": 26.747375831997488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.15791700780391693, "epoch": 0.092, "frac_reward_zero_std": 0.0, "grad_norm": 4.263246059417725, "kl": 0.008684685453772545, "learning_rate": 4.951369033516127e-06, "loss": 0.7711, "num_tokens": 259942.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 2.5689008235931396, "sampling/importance_sampling_ratio/mean": 1.2723312377929688, "sampling/importance_sampling_ratio/min": 0.6566939949989319, "sampling/sampling_logp_difference/max": 0.4236961007118225, "sampling/sampling_logp_difference/mean": 0.015969693660736084, "step": 92, "step_time": 19.015668170992285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.18421852588653564, "epoch": 0.093, "frac_reward_zero_std": 0.0, "grad_norm": 0.9175493121147156, "kl": 0.00123626331333071, "learning_rate": 4.949766911085461e-06, "loss": 0.0242, "num_tokens": 262792.0, "reward": 0.23250000178813934, "reward_std": 0.5127296447753906, "rewards/reward_func/mean": 0.23250000178813934, "rewards/reward_func/std": 0.5127295851707458, "sampling/importance_sampling_ratio/max": 1.4501962661743164, "sampling/importance_sampling_ratio/mean": 1.0343022346496582, "sampling/importance_sampling_ratio/min": 0.6067715883255005, "sampling/sampling_logp_difference/max": 0.37281107902526855, "sampling/sampling_logp_difference/mean": 0.01619715429842472, "step": 93, "step_time": 27.5614887670381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.2494964897632599, "epoch": 0.094, "frac_reward_zero_std": 0.0, "grad_norm": 1.3134595155715942, "kl": 0.005017485003918409, "learning_rate": 4.948139091759108e-06, "loss": 0.2517, "num_tokens": 265919.0, "reward": -0.019999999552965164, "reward_std": 0.027080127969384193, "rewards/reward_func/mean": -0.019999999552965164, "rewards/reward_func/std": 0.027080127969384193, "sampling/importance_sampling_ratio/max": 1.3162477016448975, "sampling/importance_sampling_ratio/mean": 0.7721202373504639, "sampling/importance_sampling_ratio/min": 0.23980712890625, "sampling/sampling_logp_difference/max": 0.7502391338348389, "sampling/sampling_logp_difference/mean": 0.022354744374752045, "step": 94, "step_time": 35.42817664000904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.2056226134300232, "epoch": 0.095, "frac_reward_zero_std": 0.0, "grad_norm": 1.3646328449249268, "kl": 0.001280627097003162, "learning_rate": 4.946485592612122e-06, "loss": -0.0886, "num_tokens": 268803.0, "reward": 0.4424999952316284, "reward_std": 0.6380373239517212, "rewards/reward_func/mean": 0.4424999952316284, "rewards/reward_func/std": 0.638037383556366, "sampling/importance_sampling_ratio/max": 1.4729505777359009, "sampling/importance_sampling_ratio/mean": 1.1322357654571533, "sampling/importance_sampling_ratio/min": 0.9130431413650513, "sampling/sampling_logp_difference/max": 0.320281982421875, "sampling/sampling_logp_difference/mean": 0.017941193655133247, "step": 95, "step_time": 35.223688244994264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.15846936404705048, "epoch": 0.096, "frac_reward_zero_std": 0.0, "grad_norm": 0.9042047262191772, "kl": 0.0009366063750348985, "learning_rate": 4.944806430988927e-06, "loss": -0.1322, "num_tokens": 271285.0, "reward": 0.7150000333786011, "reward_std": 0.5633530616760254, "rewards/reward_func/mean": 0.7150000333786011, "rewards/reward_func/std": 0.5633530616760254, "sampling/importance_sampling_ratio/max": 2.0537869930267334, "sampling/importance_sampling_ratio/mean": 1.0663601160049438, "sampling/importance_sampling_ratio/min": 0.6622300148010254, "sampling/sampling_logp_difference/max": 0.646028995513916, "sampling/sampling_logp_difference/mean": 0.013127650134265423, "step": 96, "step_time": 18.855733782984316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.18483242392539978, "epoch": 0.097, "frac_reward_zero_std": 0.0, "grad_norm": 1.6808007955551147, "kl": 0.0032876559998840094, "learning_rate": 4.943101624503133e-06, "loss": -0.2944, "num_tokens": 274318.0, "reward": 0.4925000071525574, "reward_std": 0.5860247611999512, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5860247015953064, "sampling/importance_sampling_ratio/max": 1.4065831899642944, "sampling/importance_sampling_ratio/mean": 0.9736905097961426, "sampling/importance_sampling_ratio/min": 0.455060750246048, "sampling/sampling_logp_difference/max": 0.4783846139907837, "sampling/sampling_logp_difference/mean": 0.016842158511281013, "step": 97, "step_time": 22.708153731015045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.18852512538433075, "epoch": 0.098, "frac_reward_zero_std": 0.0, "grad_norm": 1.4312896728515625, "kl": 0.0015657518524676561, "learning_rate": 4.941371191037353e-06, "loss": -0.0718, "num_tokens": 277555.0, "reward": 0.2199999988079071, "reward_std": 0.5217278599739075, "rewards/reward_func/mean": 0.2199999988079071, "rewards/reward_func/std": 0.5217278599739075, "sampling/importance_sampling_ratio/max": 1.343669056892395, "sampling/importance_sampling_ratio/mean": 1.0693325996398926, "sampling/importance_sampling_ratio/min": 0.5463237166404724, "sampling/sampling_logp_difference/max": 0.2894654870033264, "sampling/sampling_logp_difference/mean": 0.015972686931490898, "step": 98, "step_time": 34.76966292003635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.20890052616596222, "epoch": 0.099, "frac_reward_zero_std": 0.0, "grad_norm": 0.9855401515960693, "kl": 0.0025055576115846634, "learning_rate": 4.939615148743017e-06, "loss": 0.1927, "num_tokens": 280440.0, "reward": 0.49000000953674316, "reward_std": 0.5831523537635803, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5831523537635803, "sampling/importance_sampling_ratio/max": 0.9756810665130615, "sampling/importance_sampling_ratio/mean": 0.8230453729629517, "sampling/importance_sampling_ratio/min": 0.5430414080619812, "sampling/sampling_logp_difference/max": 0.45201241970062256, "sampling/sampling_logp_difference/mean": 0.02280852198600769, "step": 99, "step_time": 23.544920118991286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1777784526348114, "epoch": 0.1, "frac_reward_zero_std": 0.0, "grad_norm": 1.6256330013275146, "kl": 0.0027319176588207483, "learning_rate": 4.937833516040177e-06, "loss": 0.1325, "num_tokens": 283886.0, "reward": 0.4449999928474426, "reward_std": 0.6027989983558655, "rewards/reward_func/mean": 0.4449999928474426, "rewards/reward_func/std": 0.6027990579605103, "sampling/importance_sampling_ratio/max": 1.459632396697998, "sampling/importance_sampling_ratio/mean": 1.0663809776306152, "sampling/importance_sampling_ratio/min": 0.8068637251853943, "sampling/sampling_logp_difference/max": 0.4505075216293335, "sampling/sampling_logp_difference/mean": 0.018568670377135277, "step": 100, "step_time": 30.95737542200368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.1763390153646469, "epoch": 0.101, "frac_reward_zero_std": 0.0, "grad_norm": 1.2336703538894653, "kl": 0.0019096658797934651, "learning_rate": 4.936026311617316e-06, "loss": -0.3896, "num_tokens": 286837.0, "reward": 0.48500001430511475, "reward_std": 0.5832380652427673, "rewards/reward_func/mean": 0.48500001430511475, "rewards/reward_func/std": 0.5832380652427673, "sampling/importance_sampling_ratio/max": 1.2133924961090088, "sampling/importance_sampling_ratio/mean": 0.6677452325820923, "sampling/importance_sampling_ratio/min": 0.1646837443113327, "sampling/sampling_logp_difference/max": 1.1721301078796387, "sampling/sampling_logp_difference/mean": 0.025110039860010147, "step": 101, "step_time": 28.715264655009378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.21230566501617432, "epoch": 0.102, "frac_reward_zero_std": 0.0, "grad_norm": 1.7928258180618286, "kl": 0.0024363533593714237, "learning_rate": 4.9341935544311536e-06, "loss": -0.1466, "num_tokens": 289268.0, "reward": 0.4725000262260437, "reward_std": 0.6035657525062561, "rewards/reward_func/mean": 0.4725000262260437, "rewards/reward_func/std": 0.6035658121109009, "sampling/importance_sampling_ratio/max": 1.2063745260238647, "sampling/importance_sampling_ratio/mean": 0.9619500637054443, "sampling/importance_sampling_ratio/min": 0.8435264229774475, "sampling/sampling_logp_difference/max": 0.33896470069885254, "sampling/sampling_logp_difference/mean": 0.015319399535655975, "step": 102, "step_time": 20.099383049004246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.17486608028411865, "epoch": 0.103, "frac_reward_zero_std": 0.0, "grad_norm": 1.3083218336105347, "kl": 0.0022497796453535557, "learning_rate": 4.932335263706446e-06, "loss": -0.1562, "num_tokens": 292203.0, "reward": 0.22499999403953552, "reward_std": 0.5188127756118774, "rewards/reward_func/mean": 0.22499999403953552, "rewards/reward_func/std": 0.5188127756118774, "sampling/importance_sampling_ratio/max": 1.17838454246521, "sampling/importance_sampling_ratio/mean": 0.9927463531494141, "sampling/importance_sampling_ratio/min": 0.8120310306549072, "sampling/sampling_logp_difference/max": 0.4807014465332031, "sampling/sampling_logp_difference/mean": 0.01787315495312214, "step": 103, "step_time": 29.151268461020663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.1726616770029068, "epoch": 0.104, "frac_reward_zero_std": 0.0, "grad_norm": 1.7738467454910278, "kl": 0.0010201664408668876, "learning_rate": 4.930451458935783e-06, "loss": -0.2733, "num_tokens": 294707.0, "reward": 0.19500000774860382, "reward_std": 0.5389804840087891, "rewards/reward_func/mean": 0.19500000774860382, "rewards/reward_func/std": 0.5389805436134338, "sampling/importance_sampling_ratio/max": 1.75444757938385, "sampling/importance_sampling_ratio/mean": 1.210362195968628, "sampling/importance_sampling_ratio/min": 0.7859510779380798, "sampling/sampling_logp_difference/max": 0.23338282108306885, "sampling/sampling_logp_difference/mean": 0.011113667860627174, "step": 104, "step_time": 25.10467550699832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.19033846259117126, "epoch": 0.105, "frac_reward_zero_std": 0.0, "grad_norm": 1.3573626279830933, "kl": 0.001154035795480013, "learning_rate": 4.928542159879386e-06, "loss": -0.004, "num_tokens": 297420.0, "reward": 0.23500001430511475, "reward_std": 0.5100653171539307, "rewards/reward_func/mean": 0.23500001430511475, "rewards/reward_func/std": 0.5100653767585754, "sampling/importance_sampling_ratio/max": 1.1488821506500244, "sampling/importance_sampling_ratio/mean": 0.9753479361534119, "sampling/importance_sampling_ratio/min": 0.8727383017539978, "sampling/sampling_logp_difference/max": 0.2987060546875, "sampling/sampling_logp_difference/mean": 0.01720704510807991, "step": 105, "step_time": 26.004101942991838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.17758287489414215, "epoch": 0.106, "frac_reward_zero_std": 0.0, "grad_norm": 1.4572887420654297, "kl": 0.0018109632655978203, "learning_rate": 4.926607386564898e-06, "loss": -0.0799, "num_tokens": 299794.0, "reward": 0.17750000953674316, "reward_std": 0.5496286749839783, "rewards/reward_func/mean": 0.17750000953674316, "rewards/reward_func/std": 0.5496286749839783, "sampling/importance_sampling_ratio/max": 1.25441312789917, "sampling/importance_sampling_ratio/mean": 0.9649294018745422, "sampling/importance_sampling_ratio/min": 0.705916702747345, "sampling/sampling_logp_difference/max": 0.5950506925582886, "sampling/sampling_logp_difference/mean": 0.023428868502378464, "step": 106, "step_time": 23.797343968006317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.16303661465644836, "epoch": 0.107, "frac_reward_zero_std": 0.0, "grad_norm": 1.331335425376892, "kl": 0.001623444608412683, "learning_rate": 4.924647159287176e-06, "loss": -0.0747, "num_tokens": 302650.0, "reward": 0.7350000143051147, "reward_std": 0.49047595262527466, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.49047595262527466, "sampling/importance_sampling_ratio/max": 1.2468457221984863, "sampling/importance_sampling_ratio/mean": 0.8416212797164917, "sampling/importance_sampling_ratio/min": 0.13962090015411377, "sampling/sampling_logp_difference/max": 0.48709654808044434, "sampling/sampling_logp_difference/mean": 0.019597221165895462, "step": 107, "step_time": 22.32623191800667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.19394847750663757, "epoch": 0.108, "frac_reward_zero_std": 0.0, "grad_norm": 1.8492478132247925, "kl": 0.001253225258551538, "learning_rate": 4.922661498608077e-06, "loss": 0.0901, "num_tokens": 305378.0, "reward": 0.7100000381469727, "reward_std": 0.5534738898277283, "rewards/reward_func/mean": 0.7100000381469727, "rewards/reward_func/std": 0.5534738898277283, "sampling/importance_sampling_ratio/max": 1.3100117444992065, "sampling/importance_sampling_ratio/mean": 0.9863351583480835, "sampling/importance_sampling_ratio/min": 0.7732612490653992, "sampling/sampling_logp_difference/max": 0.4558746814727783, "sampling/sampling_logp_difference/mean": 0.017765268683433533, "step": 108, "step_time": 27.578786914993543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.19924715161323547, "epoch": 0.109, "frac_reward_zero_std": 0.0, "grad_norm": 1.2670817375183105, "kl": 0.0021331363823264837, "learning_rate": 4.920650425356239e-06, "loss": -0.0794, "num_tokens": 307941.0, "reward": 0.7124999761581421, "reward_std": 0.5683528780937195, "rewards/reward_func/mean": 0.7124999761581421, "rewards/reward_func/std": 0.5683528780937195, "sampling/importance_sampling_ratio/max": 1.0697344541549683, "sampling/importance_sampling_ratio/mean": 0.9342882037162781, "sampling/importance_sampling_ratio/min": 0.7822202444076538, "sampling/sampling_logp_difference/max": 0.334092378616333, "sampling/sampling_logp_difference/mean": 0.015028641559183598, "step": 109, "step_time": 21.83331077598268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.18487322330474854, "epoch": 0.11, "frac_reward_zero_std": 0.0, "grad_norm": 0.8698979020118713, "kl": 0.0013737177941948175, "learning_rate": 4.9186139606268735e-06, "loss": -0.1567, "num_tokens": 310942.0, "reward": 0.7475000023841858, "reward_std": 0.4983556270599365, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.4983556270599365, "sampling/importance_sampling_ratio/max": 1.161033272743225, "sampling/importance_sampling_ratio/mean": 0.7982594966888428, "sampling/importance_sampling_ratio/min": 0.5104525089263916, "sampling/sampling_logp_difference/max": 0.32977819442749023, "sampling/sampling_logp_difference/mean": 0.01356328371912241, "step": 110, "step_time": 20.629428792977706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.16910772025585175, "epoch": 0.111, "frac_reward_zero_std": 0.0, "grad_norm": 1.021267056465149, "kl": 0.0019139840733259916, "learning_rate": 4.916552125781529e-06, "loss": 0.1452, "num_tokens": 313605.0, "reward": 0.48250001668930054, "reward_std": 0.5923048257827759, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5923048257827759, "sampling/importance_sampling_ratio/max": 1.3576353788375854, "sampling/importance_sampling_ratio/mean": 0.881765604019165, "sampling/importance_sampling_ratio/min": 0.49340105056762695, "sampling/sampling_logp_difference/max": 0.3430800437927246, "sampling/sampling_logp_difference/mean": 0.01439821720123291, "step": 111, "step_time": 22.287479745980818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.16936790943145752, "epoch": 0.112, "frac_reward_zero_std": 0.0, "grad_norm": 0.8302887082099915, "kl": 0.0005847630091011524, "learning_rate": 4.9144649424478765e-06, "loss": -0.0653, "num_tokens": 316123.0, "reward": 0.4675000011920929, "reward_std": 0.6045039296150208, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.6045039892196655, "sampling/importance_sampling_ratio/max": 1.0760005712509155, "sampling/importance_sampling_ratio/mean": 0.8613239526748657, "sampling/importance_sampling_ratio/min": 0.6867309808731079, "sampling/sampling_logp_difference/max": 0.3250398635864258, "sampling/sampling_logp_difference/mean": 0.01495566125959158, "step": 112, "step_time": 21.551890997972805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.18435673415660858, "epoch": 0.113, "frac_reward_zero_std": 0.0, "grad_norm": 2.0658011436462402, "kl": 0.0016558639472350478, "learning_rate": 4.912352432519484e-06, "loss": -0.1396, "num_tokens": 318583.0, "reward": 0.18250000476837158, "reward_std": 0.5468927621841431, "rewards/reward_func/mean": 0.18250000476837158, "rewards/reward_func/std": 0.5468927621841431, "sampling/importance_sampling_ratio/max": 1.5218075513839722, "sampling/importance_sampling_ratio/mean": 1.205528736114502, "sampling/importance_sampling_ratio/min": 0.8835570812225342, "sampling/sampling_logp_difference/max": 0.35492515563964844, "sampling/sampling_logp_difference/mean": 0.014847726561129093, "step": 113, "step_time": 25.027594926999882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.18846319615840912, "epoch": 0.114, "frac_reward_zero_std": 0.0, "grad_norm": 1.306510329246521, "kl": 0.00534132681787014, "learning_rate": 4.910214618155579e-06, "loss": 0.2203, "num_tokens": 321476.0, "reward": 0.48750001192092896, "reward_std": 0.5862521529197693, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5862522125244141, "sampling/importance_sampling_ratio/max": 2.1268928050994873, "sampling/importance_sampling_ratio/mean": 0.9861550331115723, "sampling/importance_sampling_ratio/min": 0.37154924869537354, "sampling/sampling_logp_difference/max": 0.7350797653198242, "sampling/sampling_logp_difference/mean": 0.020567119121551514, "step": 114, "step_time": 22.9863213620265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.20240023732185364, "epoch": 0.115, "frac_reward_zero_std": 0.0, "grad_norm": 0.9395687580108643, "kl": 0.00462345452979207, "learning_rate": 4.908051521780824e-06, "loss": -0.1204, "num_tokens": 324242.0, "reward": 0.20250000059604645, "reward_std": 0.5320009589195251, "rewards/reward_func/mean": 0.20250000059604645, "rewards/reward_func/std": 0.5320009589195251, "sampling/importance_sampling_ratio/max": 0.9608266949653625, "sampling/importance_sampling_ratio/mean": 0.7054085731506348, "sampling/importance_sampling_ratio/min": 0.29834046959877014, "sampling/sampling_logp_difference/max": 0.34456348419189453, "sampling/sampling_logp_difference/mean": 0.019352423027157784, "step": 115, "step_time": 23.02357772999676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.14507544040679932, "epoch": 0.116, "frac_reward_zero_std": 0.0, "grad_norm": 0.9986281394958496, "kl": 0.0013664300786331296, "learning_rate": 4.905863166085076e-06, "loss": 0.0092, "num_tokens": 327030.0, "reward": 0.22750000655651093, "reward_std": 0.5155822038650513, "rewards/reward_func/mean": 0.22750000655651093, "rewards/reward_func/std": 0.5155822038650513, "sampling/importance_sampling_ratio/max": 1.2624555826187134, "sampling/importance_sampling_ratio/mean": 1.0419719219207764, "sampling/importance_sampling_ratio/min": 0.8140959143638611, "sampling/sampling_logp_difference/max": 0.5329241752624512, "sampling/sampling_logp_difference/mean": 0.014649423770606518, "step": 116, "step_time": 24.891790401015896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.1511373668909073, "epoch": 0.117, "frac_reward_zero_std": 0.0, "grad_norm": 1.4632400274276733, "kl": 0.003277445677667856, "learning_rate": 4.903649574023151e-06, "loss": -0.2623, "num_tokens": 330536.0, "reward": 0.22750000655651093, "reward_std": 0.5158407688140869, "rewards/reward_func/mean": 0.22750000655651093, "rewards/reward_func/std": 0.5158407688140869, "sampling/importance_sampling_ratio/max": 1.2867978811264038, "sampling/importance_sampling_ratio/mean": 0.945495069026947, "sampling/importance_sampling_ratio/min": 0.41177240014076233, "sampling/sampling_logp_difference/max": 0.5260498523712158, "sampling/sampling_logp_difference/mean": 0.014777141623198986, "step": 117, "step_time": 31.515298081969377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.1575075387954712, "epoch": 0.118, "frac_reward_zero_std": 0.0, "grad_norm": 0.7817814350128174, "kl": 0.0026800138875842094, "learning_rate": 4.901410768814581e-06, "loss": 0.1879, "num_tokens": 333499.0, "reward": 0.2149999886751175, "reward_std": 0.5243726372718811, "rewards/reward_func/mean": 0.2149999886751175, "rewards/reward_func/std": 0.5243726968765259, "sampling/importance_sampling_ratio/max": 1.7551790475845337, "sampling/importance_sampling_ratio/mean": 1.0281250476837158, "sampling/importance_sampling_ratio/min": 0.3870733082294464, "sampling/sampling_logp_difference/max": 0.6194639205932617, "sampling/sampling_logp_difference/mean": 0.019190561026334763, "step": 118, "step_time": 29.27046132297255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.19239160418510437, "epoch": 0.119, "frac_reward_zero_std": 0.0, "grad_norm": 1.4547138214111328, "kl": 0.002587935421615839, "learning_rate": 4.899146773943374e-06, "loss": 0.0165, "num_tokens": 335899.0, "reward": 0.7225000262260437, "reward_std": 0.5351868867874146, "rewards/reward_func/mean": 0.7225000262260437, "rewards/reward_func/std": 0.5351868867874146, "sampling/importance_sampling_ratio/max": 1.160620927810669, "sampling/importance_sampling_ratio/mean": 0.9701583981513977, "sampling/importance_sampling_ratio/min": 0.800391674041748, "sampling/sampling_logp_difference/max": 0.3428153991699219, "sampling/sampling_logp_difference/mean": 0.019333243370056152, "step": 119, "step_time": 16.85417275497457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.1734311431646347, "epoch": 0.12, "frac_reward_zero_std": 0.0, "grad_norm": 1.0400452613830566, "kl": 0.0025399529840797186, "learning_rate": 4.896857613157765e-06, "loss": 0.2624, "num_tokens": 338755.0, "reward": 0.4925000071525574, "reward_std": 0.5802513957023621, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5802513957023621, "sampling/importance_sampling_ratio/max": 1.1858400106430054, "sampling/importance_sampling_ratio/mean": 0.6300135850906372, "sampling/importance_sampling_ratio/min": 0.2771003544330597, "sampling/sampling_logp_difference/max": 0.7855987548828125, "sampling/sampling_logp_difference/mean": 0.020576544106006622, "step": 120, "step_time": 24.53228131495416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.19949737191200256, "epoch": 0.121, "frac_reward_zero_std": 0.0, "grad_norm": 1.3258157968521118, "kl": 0.002931914757937193, "learning_rate": 4.894543310469968e-06, "loss": 0.3289, "num_tokens": 341359.0, "reward": 0.44749999046325684, "reward_std": 0.637985110282898, "rewards/reward_func/mean": 0.44749999046325684, "rewards/reward_func/std": 0.6379851698875427, "sampling/importance_sampling_ratio/max": 1.1372867822647095, "sampling/importance_sampling_ratio/mean": 0.895554780960083, "sampling/importance_sampling_ratio/min": 0.6283366084098816, "sampling/sampling_logp_difference/max": 0.2662011384963989, "sampling/sampling_logp_difference/mean": 0.016341285780072212, "step": 121, "step_time": 22.990031427994836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1831965446472168, "epoch": 0.122, "frac_reward_zero_std": 0.0, "grad_norm": 0.9080422520637512, "kl": 0.003034950466826558, "learning_rate": 4.8922038901559225e-06, "loss": -0.0419, "num_tokens": 344525.0, "reward": 0.2199999988079071, "reward_std": 0.5201922655105591, "rewards/reward_func/mean": 0.2199999988079071, "rewards/reward_func/std": 0.5201922655105591, "sampling/importance_sampling_ratio/max": 1.4576102495193481, "sampling/importance_sampling_ratio/mean": 0.911910891532898, "sampling/importance_sampling_ratio/min": 0.4527578055858612, "sampling/sampling_logp_difference/max": 0.45814383029937744, "sampling/sampling_logp_difference/mean": 0.016755251213908195, "step": 122, "step_time": 32.600413289968856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.15501417219638824, "epoch": 0.123, "frac_reward_zero_std": 0.0, "grad_norm": 0.8831087946891785, "kl": 0.001959488494321704, "learning_rate": 4.889839376755041e-06, "loss": -0.0704, "num_tokens": 347641.0, "reward": 0.48000001907348633, "reward_std": 0.5890104174613953, "rewards/reward_func/mean": 0.48000001907348633, "rewards/reward_func/std": 0.58901047706604, "sampling/importance_sampling_ratio/max": 1.093756079673767, "sampling/importance_sampling_ratio/mean": 0.8259649276733398, "sampling/importance_sampling_ratio/min": 0.6506116986274719, "sampling/sampling_logp_difference/max": 0.34856700897216797, "sampling/sampling_logp_difference/mean": 0.015370824374258518, "step": 123, "step_time": 26.753427614981774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.19339172542095184, "epoch": 0.124, "frac_reward_zero_std": 0.0, "grad_norm": 0.9692634344100952, "kl": 0.0023476029746234417, "learning_rate": 4.887449795069948e-06, "loss": 0.3666, "num_tokens": 350417.0, "reward": 0.4424999952316284, "reward_std": 0.6405401229858398, "rewards/reward_func/mean": 0.4424999952316284, "rewards/reward_func/std": 0.6405401229858398, "sampling/importance_sampling_ratio/max": 1.3793091773986816, "sampling/importance_sampling_ratio/mean": 0.8251407146453857, "sampling/importance_sampling_ratio/min": 0.2590537369251251, "sampling/sampling_logp_difference/max": 0.8335009813308716, "sampling/sampling_logp_difference/mean": 0.022826556116342545, "step": 124, "step_time": 26.513902084960137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.21818757057189941, "epoch": 0.125, "frac_reward_zero_std": 0.0, "grad_norm": 1.1452463865280151, "kl": 0.003137273248285055, "learning_rate": 4.885035170166229e-06, "loss": -0.012, "num_tokens": 353362.0, "reward": 0.49000000953674316, "reward_std": 0.5891236066818237, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5891236066818237, "sampling/importance_sampling_ratio/max": 1.3936811685562134, "sampling/importance_sampling_ratio/mean": 0.7410004138946533, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5321125984191895, "sampling/sampling_logp_difference/mean": 0.022329842671751976, "step": 125, "step_time": 26.399805816996377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.1601344794034958, "epoch": 0.126, "frac_reward_zero_std": 0.0, "grad_norm": 1.2917815446853638, "kl": 0.005123843904584646, "learning_rate": 4.8825955273721524e-06, "loss": 0.2648, "num_tokens": 356345.0, "reward": 0.1875, "reward_std": 0.5387872457504272, "rewards/reward_func/mean": 0.1875, "rewards/reward_func/std": 0.5387872457504272, "sampling/importance_sampling_ratio/max": 1.6720771789550781, "sampling/importance_sampling_ratio/mean": 0.9762254953384399, "sampling/importance_sampling_ratio/min": 0.51832115650177, "sampling/sampling_logp_difference/max": 0.35707569122314453, "sampling/sampling_logp_difference/mean": 0.01943013444542885, "step": 126, "step_time": 32.152043080015574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1779920905828476, "epoch": 0.127, "frac_reward_zero_std": 0.0, "grad_norm": 0.7333633899688721, "kl": 0.0010852138511836529, "learning_rate": 4.88013089227842e-06, "loss": -0.0002, "num_tokens": 358697.0, "reward": 0.7124999761581421, "reward_std": 0.5683528780937195, "rewards/reward_func/mean": 0.7124999761581421, "rewards/reward_func/std": 0.5683528780937195, "sampling/importance_sampling_ratio/max": 1.054613709449768, "sampling/importance_sampling_ratio/mean": 0.9735892415046692, "sampling/importance_sampling_ratio/min": 0.8428576588630676, "sampling/sampling_logp_difference/max": 0.27741706371307373, "sampling/sampling_logp_difference/mean": 0.014077696017920971, "step": 127, "step_time": 23.273927707981784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.17302820086479187, "epoch": 0.128, "frac_reward_zero_std": 0.0, "grad_norm": 1.4835617542266846, "kl": 0.0013798907166346908, "learning_rate": 4.8776412907378845e-06, "loss": 0.1084, "num_tokens": 361824.0, "reward": -0.03500000014901161, "reward_std": 0.03696845471858978, "rewards/reward_func/mean": -0.03500000014901161, "rewards/reward_func/std": 0.03696845471858978, "sampling/importance_sampling_ratio/max": 1.229792833328247, "sampling/importance_sampling_ratio/mean": 1.099616289138794, "sampling/importance_sampling_ratio/min": 0.9551407694816589, "sampling/sampling_logp_difference/max": 0.37505388259887695, "sampling/sampling_logp_difference/mean": 0.017457885667681694, "step": 128, "step_time": 34.91214869701071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.16820167005062103, "epoch": 0.129, "frac_reward_zero_std": 0.0, "grad_norm": 0.8028147220611572, "kl": 0.0026636586990207434, "learning_rate": 4.87512674886529e-06, "loss": 0.0052, "num_tokens": 364380.0, "reward": 0.7300000190734863, "reward_std": 0.5400000214576721, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5399999618530273, "sampling/importance_sampling_ratio/max": 1.2031774520874023, "sampling/importance_sampling_ratio/mean": 0.8647279143333435, "sampling/importance_sampling_ratio/min": 0.611112117767334, "sampling/sampling_logp_difference/max": 0.46569156646728516, "sampling/sampling_logp_difference/mean": 0.012592722661793232, "step": 129, "step_time": 18.17711754696211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 58.0, "completions/mean_terminated_length": 58.0, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.21742700040340424, "epoch": 0.13, "frac_reward_zero_std": 0.0, "grad_norm": 0.7760071158409119, "kl": 0.0016762377927079797, "learning_rate": 4.872587293036991e-06, "loss": 0.0654, "num_tokens": 366925.0, "reward": 0.987500011920929, "reward_std": 0.02499999664723873, "rewards/reward_func/mean": 0.987500011920929, "rewards/reward_func/std": 0.025000005960464478, "sampling/importance_sampling_ratio/max": 0.9263357520103455, "sampling/importance_sampling_ratio/mean": 0.8110896944999695, "sampling/importance_sampling_ratio/min": 0.7218844890594482, "sampling/sampling_logp_difference/max": 0.3987913131713867, "sampling/sampling_logp_difference/mean": 0.01586347073316574, "step": 130, "step_time": 17.876564481004607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 59.0, "completions/mean_terminated_length": 59.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.2075773924589157, "epoch": 0.131, "frac_reward_zero_std": 0.0, "grad_norm": 1.248842716217041, "kl": 0.003222368424758315, "learning_rate": 4.870022949890676e-06, "loss": -0.1597, "num_tokens": 370326.0, "reward": 0.7200000286102295, "reward_std": 0.5204485058784485, "rewards/reward_func/mean": 0.7200000286102295, "rewards/reward_func/std": 0.5204485654830933, "sampling/importance_sampling_ratio/max": 1.5396089553833008, "sampling/importance_sampling_ratio/mean": 1.0513349771499634, "sampling/importance_sampling_ratio/min": 0.5521257519721985, "sampling/sampling_logp_difference/max": 0.6210081577301025, "sampling/sampling_logp_difference/mean": 0.02196306549012661, "step": 131, "step_time": 29.977244819980115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.18867281079292297, "epoch": 0.132, "frac_reward_zero_std": 0.0, "grad_norm": 1.271755576133728, "kl": 0.003018892602995038, "learning_rate": 4.867433746325093e-06, "loss": 0.1805, "num_tokens": 373336.0, "reward": -0.054999999701976776, "reward_std": 0.023804761469364166, "rewards/reward_func/mean": -0.054999999701976776, "rewards/reward_func/std": 0.023804763332009315, "sampling/importance_sampling_ratio/max": 1.4447154998779297, "sampling/importance_sampling_ratio/mean": 1.2086310386657715, "sampling/importance_sampling_ratio/min": 1.0474393367767334, "sampling/sampling_logp_difference/max": 0.5519063472747803, "sampling/sampling_logp_difference/mean": 0.01733344793319702, "step": 132, "step_time": 35.95452493597986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.18559078872203827, "epoch": 0.133, "frac_reward_zero_std": 0.0, "grad_norm": 1.0295324325561523, "kl": 0.0020022341050207615, "learning_rate": 4.864819709499762e-06, "loss": -0.2011, "num_tokens": 376427.0, "reward": 0.7300000190734863, "reward_std": 0.5333541631698608, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5333541631698608, "sampling/importance_sampling_ratio/max": 1.607855200767517, "sampling/importance_sampling_ratio/mean": 1.0655417442321777, "sampling/importance_sampling_ratio/min": 0.6664072275161743, "sampling/sampling_logp_difference/max": 0.2661879062652588, "sampling/sampling_logp_difference/mean": 0.017720265313982964, "step": 133, "step_time": 29.736854650021996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.19027705490589142, "epoch": 0.134, "frac_reward_zero_std": 0.0, "grad_norm": 2.035038471221924, "kl": 0.002520474838092923, "learning_rate": 4.862180866834691e-06, "loss": -0.1515, "num_tokens": 379048.0, "reward": 0.22500000894069672, "reward_std": 0.517332911491394, "rewards/reward_func/mean": 0.22500000894069672, "rewards/reward_func/std": 0.517332911491394, "sampling/importance_sampling_ratio/max": 1.7936463356018066, "sampling/importance_sampling_ratio/mean": 1.1562602519989014, "sampling/importance_sampling_ratio/min": 0.6556832194328308, "sampling/sampling_logp_difference/max": 0.6591966152191162, "sampling/sampling_logp_difference/mean": 0.013582798652350903, "step": 134, "step_time": 26.4526345669874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.1789800375699997, "epoch": 0.135, "frac_reward_zero_std": 0.0, "grad_norm": 1.0653369426727295, "kl": 0.0019823943730443716, "learning_rate": 4.8595172460100914e-06, "loss": 0.1384, "num_tokens": 381741.0, "reward": 0.22750000655651093, "reward_std": 0.5154528617858887, "rewards/reward_func/mean": 0.22750000655651093, "rewards/reward_func/std": 0.5154528617858887, "sampling/importance_sampling_ratio/max": 2.1499485969543457, "sampling/importance_sampling_ratio/mean": 1.2793291807174683, "sampling/importance_sampling_ratio/min": 0.7535057663917542, "sampling/sampling_logp_difference/max": 0.3313789367675781, "sampling/sampling_logp_difference/mean": 0.01310935989022255, "step": 135, "step_time": 29.42839711002307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.16091498732566833, "epoch": 0.136, "frac_reward_zero_std": 0.0, "grad_norm": 1.3879694938659668, "kl": 0.0016879815375432372, "learning_rate": 4.856828874966086e-06, "loss": 0.1266, "num_tokens": 384852.0, "reward": 0.7350000143051147, "reward_std": 0.5300000309944153, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.5300000309944153, "sampling/importance_sampling_ratio/max": 1.0974900722503662, "sampling/importance_sampling_ratio/mean": 0.7954087257385254, "sampling/importance_sampling_ratio/min": 0.6193658113479614, "sampling/sampling_logp_difference/max": 0.46332716941833496, "sampling/sampling_logp_difference/mean": 0.017306067049503326, "step": 136, "step_time": 23.437782098015305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.210345059633255, "epoch": 0.137, "frac_reward_zero_std": 0.0, "grad_norm": 1.5120803117752075, "kl": 0.0034511496778577566, "learning_rate": 4.854115781902414e-06, "loss": -0.0655, "num_tokens": 387635.0, "reward": 0.7425000071525574, "reward_std": 0.5149999856948853, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.5149999856948853, "sampling/importance_sampling_ratio/max": 1.0200783014297485, "sampling/importance_sampling_ratio/mean": 0.9129359126091003, "sampling/importance_sampling_ratio/min": 0.7975046038627625, "sampling/sampling_logp_difference/max": 0.35558605194091797, "sampling/sampling_logp_difference/mean": 0.02118685655295849, "step": 137, "step_time": 14.472190893022344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 59.25, "completions/mean_terminated_length": 59.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.17188337445259094, "epoch": 0.138, "frac_reward_zero_std": 0.0, "grad_norm": 0.9994949102401733, "kl": 0.0007843458442948759, "learning_rate": 4.851377995278138e-06, "loss": -0.2126, "num_tokens": 390190.0, "reward": 0.987500011920929, "reward_std": 0.00957426242530346, "rewards/reward_func/mean": 0.987500011920929, "rewards/reward_func/std": 0.00957426242530346, "sampling/importance_sampling_ratio/max": 1.1960774660110474, "sampling/importance_sampling_ratio/mean": 0.8785350322723389, "sampling/importance_sampling_ratio/min": 0.47124701738357544, "sampling/sampling_logp_difference/max": 0.3400733470916748, "sampling/sampling_logp_difference/mean": 0.01250863540917635, "step": 138, "step_time": 13.101944390975405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.19658631086349487, "epoch": 0.139, "frac_reward_zero_std": 0.0, "grad_norm": 0.8522583842277527, "kl": 0.001935296575538814, "learning_rate": 4.8486155438113455e-06, "loss": 0.1246, "num_tokens": 392994.0, "reward": 0.4699999988079071, "reward_std": 0.6128621697425842, "rewards/reward_func/mean": 0.4699999988079071, "rewards/reward_func/std": 0.6128621101379395, "sampling/importance_sampling_ratio/max": 1.2607589960098267, "sampling/importance_sampling_ratio/mean": 0.8876631259918213, "sampling/importance_sampling_ratio/min": 0.6906230449676514, "sampling/sampling_logp_difference/max": 0.3345675468444824, "sampling/sampling_logp_difference/mean": 0.012971877120435238, "step": 139, "step_time": 25.09990402305266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.18408119678497314, "epoch": 0.14, "frac_reward_zero_std": 0.0, "grad_norm": 1.0098986625671387, "kl": 0.02210911363363266, "learning_rate": 4.845828456478843e-06, "loss": -0.1546, "num_tokens": 395430.0, "reward": 0.7300000190734863, "reward_std": 0.5069516897201538, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5069516897201538, "sampling/importance_sampling_ratio/max": 2.5110630989074707, "sampling/importance_sampling_ratio/mean": 1.2796778678894043, "sampling/importance_sampling_ratio/min": 0.764377236366272, "sampling/sampling_logp_difference/max": 0.5120794773101807, "sampling/sampling_logp_difference/mean": 0.017754213884472847, "step": 140, "step_time": 17.618704948981758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.19033658504486084, "epoch": 0.141, "frac_reward_zero_std": 0.0, "grad_norm": 1.1990196704864502, "kl": 0.0015178037574514747, "learning_rate": 4.84301676251586e-06, "loss": 0.0158, "num_tokens": 398059.0, "reward": -0.07750000059604645, "reward_std": 0.05377421900629997, "rewards/reward_func/mean": -0.07750000059604645, "rewards/reward_func/std": 0.05377421900629997, "sampling/importance_sampling_ratio/max": 0.9383327960968018, "sampling/importance_sampling_ratio/mean": 0.8809807300567627, "sampling/importance_sampling_ratio/min": 0.8248416781425476, "sampling/sampling_logp_difference/max": 0.3222709894180298, "sampling/sampling_logp_difference/mean": 0.012866639532148838, "step": 141, "step_time": 35.01830234401859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.21558424830436707, "epoch": 0.142, "frac_reward_zero_std": 0.0, "grad_norm": 1.4753899574279785, "kl": 0.0027551008388400078, "learning_rate": 4.840180491415733e-06, "loss": -0.1084, "num_tokens": 400642.0, "reward": 0.9900000095367432, "reward_std": 0.011546994559466839, "rewards/reward_func/mean": 0.9900000095367432, "rewards/reward_func/std": 0.011546994559466839, "sampling/importance_sampling_ratio/max": 1.1445531845092773, "sampling/importance_sampling_ratio/mean": 1.0267938375473022, "sampling/importance_sampling_ratio/min": 0.9095835089683533, "sampling/sampling_logp_difference/max": 0.31305789947509766, "sampling/sampling_logp_difference/mean": 0.017091326415538788, "step": 142, "step_time": 13.977392985019833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.20263966917991638, "epoch": 0.143, "frac_reward_zero_std": 0.0, "grad_norm": 0.9371317028999329, "kl": 0.0016381170134991407, "learning_rate": 4.837319672929606e-06, "loss": 0.0381, "num_tokens": 404035.0, "reward": 0.19750000536441803, "reward_std": 0.489515095949173, "rewards/reward_func/mean": 0.19750000536441803, "rewards/reward_func/std": 0.489515095949173, "sampling/importance_sampling_ratio/max": 1.6764967441558838, "sampling/importance_sampling_ratio/mean": 1.0880968570709229, "sampling/importance_sampling_ratio/min": 0.6929658651351929, "sampling/sampling_logp_difference/max": 0.3251023292541504, "sampling/sampling_logp_difference/mean": 0.01909804344177246, "step": 143, "step_time": 42.47529549297178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.13846175372600555, "epoch": 0.144, "frac_reward_zero_std": 0.0, "grad_norm": 0.8642736673355103, "kl": 0.0007228488102555275, "learning_rate": 4.834434337066112e-06, "loss": -0.239, "num_tokens": 407276.0, "reward": 0.22750000655651093, "reward_std": 0.5154528617858887, "rewards/reward_func/mean": 0.22750000655651093, "rewards/reward_func/std": 0.5154528617858887, "sampling/importance_sampling_ratio/max": 1.364630103111267, "sampling/importance_sampling_ratio/mean": 1.0348033905029297, "sampling/importance_sampling_ratio/min": 0.7004866003990173, "sampling/sampling_logp_difference/max": 0.3332326412200928, "sampling/sampling_logp_difference/mean": 0.014587691985070705, "step": 144, "step_time": 32.28191443299875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.2000749707221985, "epoch": 0.145, "frac_reward_zero_std": 0.0, "grad_norm": 0.7233874201774597, "kl": 0.0011953753419220448, "learning_rate": 4.831524514091056e-06, "loss": 0.1124, "num_tokens": 409894.0, "reward": 0.20000000298023224, "reward_std": 0.5416025519371033, "rewards/reward_func/mean": 0.20000000298023224, "rewards/reward_func/std": 0.5416025519371033, "sampling/importance_sampling_ratio/max": 0.8989771008491516, "sampling/importance_sampling_ratio/mean": 0.691437840461731, "sampling/importance_sampling_ratio/min": 0.506276547908783, "sampling/sampling_logp_difference/max": 0.5178370475769043, "sampling/sampling_logp_difference/mean": 0.01640663482248783, "step": 145, "step_time": 27.35345201002201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.18750903010368347, "epoch": 0.146, "frac_reward_zero_std": 0.0, "grad_norm": 1.1483471393585205, "kl": 0.0016143718967214227, "learning_rate": 4.828590234527107e-06, "loss": -0.2299, "num_tokens": 412491.0, "reward": 0.7325000166893005, "reward_std": 0.5349999666213989, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5349999666213989, "sampling/importance_sampling_ratio/max": 2.257571220397949, "sampling/importance_sampling_ratio/mean": 1.2990715503692627, "sampling/importance_sampling_ratio/min": 0.8438032865524292, "sampling/sampling_logp_difference/max": 0.4340285062789917, "sampling/sampling_logp_difference/mean": 0.013792783953249454, "step": 146, "step_time": 14.770862923003733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.17875780165195465, "epoch": 0.147, "frac_reward_zero_std": 0.0, "grad_norm": 0.9584639668464661, "kl": 0.001465148408897221, "learning_rate": 4.825631529153466e-06, "loss": 0.1668, "num_tokens": 414918.0, "reward": 0.45249998569488525, "reward_std": 0.6344223022460938, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.6344223022460938, "sampling/importance_sampling_ratio/max": 1.451632022857666, "sampling/importance_sampling_ratio/mean": 0.9425681829452515, "sampling/importance_sampling_ratio/min": 0.5965684652328491, "sampling/sampling_logp_difference/max": 0.27742254734039307, "sampling/sampling_logp_difference/mean": 0.012992608360946178, "step": 147, "step_time": 25.662666889023967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.17511898279190063, "epoch": 0.148, "frac_reward_zero_std": 0.0, "grad_norm": 0.8411564826965332, "kl": 0.0017402897356078029, "learning_rate": 4.8226484290055544e-06, "loss": -0.0883, "num_tokens": 417946.0, "reward": 0.49000000953674316, "reward_std": 0.5889538526535034, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5889538526535034, "sampling/importance_sampling_ratio/max": 1.1265987157821655, "sampling/importance_sampling_ratio/mean": 0.8272252082824707, "sampling/importance_sampling_ratio/min": 0.652218222618103, "sampling/sampling_logp_difference/max": 0.37691307067871094, "sampling/sampling_logp_difference/mean": 0.01742238737642765, "step": 148, "step_time": 23.18166967300931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.14965252578258514, "epoch": 0.149, "frac_reward_zero_std": 0.0, "grad_norm": 0.9575255513191223, "kl": 0.0014907231088727713, "learning_rate": 4.8196409653746815e-06, "loss": -0.3235, "num_tokens": 420898.0, "reward": 0.4675000011920929, "reward_std": 0.5525923371315002, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.552592396736145, "sampling/importance_sampling_ratio/max": 1.6635464429855347, "sampling/importance_sampling_ratio/mean": 1.0050544738769531, "sampling/importance_sampling_ratio/min": 0.6449732780456543, "sampling/sampling_logp_difference/max": 0.45191502571105957, "sampling/sampling_logp_difference/mean": 0.016319045796990395, "step": 149, "step_time": 29.55009562795749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.18218925595283508, "epoch": 0.15, "frac_reward_zero_std": 0.0, "grad_norm": 0.8477475643157959, "kl": 0.0021763560362160206, "learning_rate": 4.8166091698077165e-06, "loss": -0.2946, "num_tokens": 423516.0, "reward": 0.4749999940395355, "reward_std": 0.6070969104766846, "rewards/reward_func/mean": 0.4749999940395355, "rewards/reward_func/std": 0.6070969104766846, "sampling/importance_sampling_ratio/max": 1.363287329673767, "sampling/importance_sampling_ratio/mean": 0.7946721911430359, "sampling/importance_sampling_ratio/min": 0.3996957838535309, "sampling/sampling_logp_difference/max": 0.31358540058135986, "sampling/sampling_logp_difference/mean": 0.016757085919380188, "step": 150, "step_time": 22.52863380900817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1769341230392456, "epoch": 0.151, "frac_reward_zero_std": 0.0, "grad_norm": 0.9006563425064087, "kl": 0.001339763985015452, "learning_rate": 4.813553074106761e-06, "loss": -0.0657, "num_tokens": 426235.0, "reward": 0.7275000214576721, "reward_std": 0.5317502617835999, "rewards/reward_func/mean": 0.7275000214576721, "rewards/reward_func/std": 0.5317502617835999, "sampling/importance_sampling_ratio/max": 1.5590115785598755, "sampling/importance_sampling_ratio/mean": 1.0348320007324219, "sampling/importance_sampling_ratio/min": 0.7299727201461792, "sampling/sampling_logp_difference/max": 0.2869887351989746, "sampling/sampling_logp_difference/mean": 0.013297591358423233, "step": 151, "step_time": 20.749853953020647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.13252630829811096, "epoch": 0.152, "frac_reward_zero_std": 0.0, "grad_norm": 1.153368353843689, "kl": 0.0020579483825713396, "learning_rate": 4.8104727103288125e-06, "loss": -0.0782, "num_tokens": 429323.0, "reward": 0.4724999964237213, "reward_std": 0.6107577681541443, "rewards/reward_func/mean": 0.4724999964237213, "rewards/reward_func/std": 0.6107577681541443, "sampling/importance_sampling_ratio/max": 1.1006298065185547, "sampling/importance_sampling_ratio/mean": 0.9568758010864258, "sampling/importance_sampling_ratio/min": 0.6699782609939575, "sampling/sampling_logp_difference/max": 0.3107858896255493, "sampling/sampling_logp_difference/mean": 0.011886066757142544, "step": 152, "step_time": 25.256749020016287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.19969619810581207, "epoch": 0.153, "frac_reward_zero_std": 0.0, "grad_norm": 0.6522009968757629, "kl": 0.0023764586076140404, "learning_rate": 4.80736811078543e-06, "loss": 0.1158, "num_tokens": 432044.0, "reward": 0.4725000262260437, "reward_std": 0.5688804984092712, "rewards/reward_func/mean": 0.4725000262260437, "rewards/reward_func/std": 0.5688804388046265, "sampling/importance_sampling_ratio/max": 0.8768420815467834, "sampling/importance_sampling_ratio/mean": 0.6932048797607422, "sampling/importance_sampling_ratio/min": 0.4451182186603546, "sampling/sampling_logp_difference/max": 0.3406975269317627, "sampling/sampling_logp_difference/mean": 0.01567997597157955, "step": 153, "step_time": 21.767912975978106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 60.0, "completions/mean_terminated_length": 60.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.17119455337524414, "epoch": 0.154, "frac_reward_zero_std": 0.0, "grad_norm": 0.8653021454811096, "kl": 0.0015755370259284973, "learning_rate": 4.804239308042392e-06, "loss": -0.2339, "num_tokens": 434669.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 1.6845382452011108, "sampling/importance_sampling_ratio/mean": 1.1859914064407349, "sampling/importance_sampling_ratio/min": 0.7358487248420715, "sampling/sampling_logp_difference/max": 0.3513331413269043, "sampling/sampling_logp_difference/mean": 0.012898587621748447, "step": 154, "step_time": 16.419501213997137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.2016332894563675, "epoch": 0.155, "frac_reward_zero_std": 0.0, "grad_norm": 2.0963315963745117, "kl": 0.002774738008156419, "learning_rate": 4.8010863349193605e-06, "loss": -0.208, "num_tokens": 437115.0, "reward": 0.1824999898672104, "reward_std": 0.5489004254341125, "rewards/reward_func/mean": 0.1824999898672104, "rewards/reward_func/std": 0.5489004254341125, "sampling/importance_sampling_ratio/max": 1.6835180521011353, "sampling/importance_sampling_ratio/mean": 1.1130659580230713, "sampling/importance_sampling_ratio/min": 0.6809583902359009, "sampling/sampling_logp_difference/max": 0.295072078704834, "sampling/sampling_logp_difference/mean": 0.01348087191581726, "step": 155, "step_time": 25.86080577399116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.1756555140018463, "epoch": 0.156, "frac_reward_zero_std": 0.0, "grad_norm": 1.0347280502319336, "kl": 0.0033476268872618675, "learning_rate": 4.797909224489531e-06, "loss": 0.0957, "num_tokens": 440082.0, "reward": -0.05000000074505806, "reward_std": 0.057735029608011246, "rewards/reward_func/mean": -0.05000000074505806, "rewards/reward_func/std": 0.057735029608011246, "sampling/importance_sampling_ratio/max": 1.3452192544937134, "sampling/importance_sampling_ratio/mean": 1.0866371393203735, "sampling/importance_sampling_ratio/min": 0.78570157289505, "sampling/sampling_logp_difference/max": 0.34517359733581543, "sampling/sampling_logp_difference/mean": 0.010137263685464859, "step": 156, "step_time": 41.00264837103896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.17797067761421204, "epoch": 0.157, "frac_reward_zero_std": 0.0, "grad_norm": 1.3172663450241089, "kl": 0.0038546982686966658, "learning_rate": 4.794708010079288e-06, "loss": -0.1684, "num_tokens": 443367.0, "reward": 0.4399999976158142, "reward_std": 0.647919774055481, "rewards/reward_func/mean": 0.4399999976158142, "rewards/reward_func/std": 0.647919774055481, "sampling/importance_sampling_ratio/max": 1.448460340499878, "sampling/importance_sampling_ratio/mean": 1.1500673294067383, "sampling/importance_sampling_ratio/min": 0.8967432975769043, "sampling/sampling_logp_difference/max": 0.3056824207305908, "sampling/sampling_logp_difference/mean": 0.014047392643988132, "step": 157, "step_time": 31.882866240979638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.15875573456287384, "epoch": 0.158, "frac_reward_zero_std": 0.0, "grad_norm": 1.1629942655563354, "kl": 0.0017290109535679221, "learning_rate": 4.791482725267858e-06, "loss": -0.0512, "num_tokens": 446024.0, "reward": 0.48750001192092896, "reward_std": 0.5860816240310669, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5860816240310669, "sampling/importance_sampling_ratio/max": 1.110757827758789, "sampling/importance_sampling_ratio/mean": 0.9287973642349243, "sampling/importance_sampling_ratio/min": 0.719914972782135, "sampling/sampling_logp_difference/max": 0.33409690856933594, "sampling/sampling_logp_difference/mean": 0.010397966019809246, "step": 158, "step_time": 22.092239080986474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1464797407388687, "epoch": 0.159, "frac_reward_zero_std": 0.0, "grad_norm": 1.0900577306747437, "kl": 0.002194772707298398, "learning_rate": 4.78823340388695e-06, "loss": 0.1577, "num_tokens": 449412.0, "reward": 0.48750001192092896, "reward_std": 0.5860816240310669, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5860816240310669, "sampling/importance_sampling_ratio/max": 1.2988736629486084, "sampling/importance_sampling_ratio/mean": 0.9668689370155334, "sampling/importance_sampling_ratio/min": 0.6903460025787354, "sampling/sampling_logp_difference/max": 0.6943702697753906, "sampling/sampling_logp_difference/mean": 0.018337421119213104, "step": 159, "step_time": 29.217307554965373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.19643019139766693, "epoch": 0.16, "frac_reward_zero_std": 0.0, "grad_norm": 1.9810487031936646, "kl": 0.017961928620934486, "learning_rate": 4.7849600800204075e-06, "loss": -0.3103, "num_tokens": 452637.0, "reward": 0.1899999976158142, "reward_std": 0.5196152329444885, "rewards/reward_func/mean": 0.1899999976158142, "rewards/reward_func/std": 0.5196152329444885, "sampling/importance_sampling_ratio/max": 1.8016902208328247, "sampling/importance_sampling_ratio/mean": 1.127899169921875, "sampling/importance_sampling_ratio/min": 0.7193431258201599, "sampling/sampling_logp_difference/max": 0.35339099168777466, "sampling/sampling_logp_difference/mean": 0.016977809369564056, "step": 160, "step_time": 32.891525785962585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.21899795532226562, "epoch": 0.161, "frac_reward_zero_std": 0.0, "grad_norm": 1.1688151359558105, "kl": 0.002222121926024556, "learning_rate": 4.781662788003851e-06, "loss": 0.0173, "num_tokens": 454947.0, "reward": 0.9700000286102295, "reward_std": 0.04242638871073723, "rewards/reward_func/mean": 0.9700000286102295, "rewards/reward_func/std": 0.042426396161317825, "sampling/importance_sampling_ratio/max": 1.1345146894454956, "sampling/importance_sampling_ratio/mean": 1.0357433557510376, "sampling/importance_sampling_ratio/min": 0.919882595539093, "sampling/sampling_logp_difference/max": 0.3441588878631592, "sampling/sampling_logp_difference/mean": 0.01572081632912159, "step": 161, "step_time": 15.506941428000573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.17953595519065857, "epoch": 0.162, "frac_reward_zero_std": 0.0, "grad_norm": 1.65281081199646, "kl": 0.01579761691391468, "learning_rate": 4.778341562424312e-06, "loss": 0.1935, "num_tokens": 457503.0, "reward": 0.4925000071525574, "reward_std": 0.5861384868621826, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5861384868621826, "sampling/importance_sampling_ratio/max": 1.0671262741088867, "sampling/importance_sampling_ratio/mean": 0.8697945475578308, "sampling/importance_sampling_ratio/min": 0.5890854597091675, "sampling/sampling_logp_difference/max": 0.7086801528930664, "sampling/sampling_logp_difference/mean": 0.018484894186258316, "step": 162, "step_time": 19.816631291003432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.1783134937286377, "epoch": 0.163, "frac_reward_zero_std": 0.0, "grad_norm": 0.8144586682319641, "kl": 0.002263593254610896, "learning_rate": 4.774996438119876e-06, "loss": 0.1088, "num_tokens": 460223.0, "reward": 0.9975000023841858, "reward_std": 0.004999995231628418, "rewards/reward_func/mean": 0.9975000023841858, "rewards/reward_func/std": 0.004999995231628418, "sampling/importance_sampling_ratio/max": 0.9133305549621582, "sampling/importance_sampling_ratio/mean": 0.7190232276916504, "sampling/importance_sampling_ratio/min": 0.4618311822414398, "sampling/sampling_logp_difference/max": 0.389129638671875, "sampling/sampling_logp_difference/mean": 0.015049528330564499, "step": 163, "step_time": 23.000495705986395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1487078070640564, "epoch": 0.164, "frac_reward_zero_std": 0.0, "grad_norm": 1.245559811592102, "kl": 0.004358990583568811, "learning_rate": 4.771627450179315e-06, "loss": 0.2281, "num_tokens": 463334.0, "reward": 0.2224999964237213, "reward_std": 0.4986899495124817, "rewards/reward_func/mean": 0.2224999964237213, "rewards/reward_func/std": 0.4986899793148041, "sampling/importance_sampling_ratio/max": 1.8270612955093384, "sampling/importance_sampling_ratio/mean": 1.0030848979949951, "sampling/importance_sampling_ratio/min": 0.40425020456314087, "sampling/sampling_logp_difference/max": 0.6674942970275879, "sampling/sampling_logp_difference/mean": 0.023439306765794754, "step": 164, "step_time": 28.421368504001293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.16806593537330627, "epoch": 0.165, "frac_reward_zero_std": 0.0, "grad_norm": 1.237927794456482, "kl": 0.0039779855869710445, "learning_rate": 4.768234633941716e-06, "loss": -0.079, "num_tokens": 466928.0, "reward": -0.06750000268220901, "reward_std": 0.08995369076728821, "rewards/reward_func/mean": -0.06750000268220901, "rewards/reward_func/std": 0.08995369076728821, "sampling/importance_sampling_ratio/max": 1.4824180603027344, "sampling/importance_sampling_ratio/mean": 1.1120562553405762, "sampling/importance_sampling_ratio/min": 0.7071133852005005, "sampling/sampling_logp_difference/max": 0.5340650081634521, "sampling/sampling_logp_difference/mean": 0.01966848596930504, "step": 165, "step_time": 40.03841173899127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.15517666935920715, "epoch": 0.166, "frac_reward_zero_std": 0.0, "grad_norm": 1.092002034187317, "kl": 0.0041062175296247005, "learning_rate": 4.764818024996117e-06, "loss": -0.2908, "num_tokens": 470436.0, "reward": 0.9975000023841858, "reward_std": 0.004999995231628418, "rewards/reward_func/mean": 0.9975000023841858, "rewards/reward_func/std": 0.004999995231628418, "sampling/importance_sampling_ratio/max": 1.9394787549972534, "sampling/importance_sampling_ratio/mean": 1.0613539218902588, "sampling/importance_sampling_ratio/min": 0.628749430179596, "sampling/sampling_logp_difference/max": 0.5289945602416992, "sampling/sampling_logp_difference/mean": 0.016753708943724632, "step": 166, "step_time": 23.91539337602444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 59.25, "completions/mean_terminated_length": 59.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.17606212198734283, "epoch": 0.167, "frac_reward_zero_std": 0.0, "grad_norm": 0.7879842519760132, "kl": 0.002886286238208413, "learning_rate": 4.76137765918113e-06, "loss": 0.1853, "num_tokens": 472959.0, "reward": -0.09999999403953552, "reward_std": 0.03464101254940033, "rewards/reward_func/mean": -0.09999999403953552, "rewards/reward_func/std": 0.03464101254940033, "sampling/importance_sampling_ratio/max": 1.032845377922058, "sampling/importance_sampling_ratio/mean": 0.5582603812217712, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.546900749206543, "sampling/sampling_logp_difference/mean": 0.020056232810020447, "step": 167, "step_time": 35.982216293981764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.19584688544273376, "epoch": 0.168, "frac_reward_zero_std": 0.0, "grad_norm": 1.3102340698242188, "kl": 0.0016309436177834868, "learning_rate": 4.757913572584564e-06, "loss": 0.0429, "num_tokens": 475432.0, "reward": 0.48500001430511475, "reward_std": 0.5948948860168457, "rewards/reward_func/mean": 0.48500001430511475, "rewards/reward_func/std": 0.5948949456214905, "sampling/importance_sampling_ratio/max": 1.630489468574524, "sampling/importance_sampling_ratio/mean": 1.2867040634155273, "sampling/importance_sampling_ratio/min": 1.0027376413345337, "sampling/sampling_logp_difference/max": 0.3504819869995117, "sampling/sampling_logp_difference/mean": 0.01472636591643095, "step": 168, "step_time": 22.28222359402571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.19525575637817383, "epoch": 0.169, "frac_reward_zero_std": 0.0, "grad_norm": 0.7274377346038818, "kl": 0.0039355577901005745, "learning_rate": 4.754425801543047e-06, "loss": 0.0435, "num_tokens": 478727.0, "reward": 0.20750001072883606, "reward_std": 0.5296146273612976, "rewards/reward_func/mean": 0.20750001072883606, "rewards/reward_func/std": 0.5296146273612976, "sampling/importance_sampling_ratio/max": 1.3858546018600464, "sampling/importance_sampling_ratio/mean": 0.639397382736206, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.3783402442932129, "sampling/sampling_logp_difference/mean": 0.021254975348711014, "step": 169, "step_time": 36.52186313801212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.224461629986763, "epoch": 0.17, "frac_reward_zero_std": 0.0, "grad_norm": 1.3898884057998657, "kl": 0.004125731065869331, "learning_rate": 4.750914382641647e-06, "loss": -0.1721, "num_tokens": 481509.0, "reward": 0.4750000238418579, "reward_std": 0.5831809639930725, "rewards/reward_func/mean": 0.4750000238418579, "rewards/reward_func/std": 0.5831809639930725, "sampling/importance_sampling_ratio/max": 1.3199305534362793, "sampling/importance_sampling_ratio/mean": 0.9330511093139648, "sampling/importance_sampling_ratio/min": 0.5573926568031311, "sampling/sampling_logp_difference/max": 0.6526844501495361, "sampling/sampling_logp_difference/mean": 0.0285993292927742, "step": 170, "step_time": 25.501637825975195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.14968155324459076, "epoch": 0.171, "frac_reward_zero_std": 0.0, "grad_norm": 0.8178985714912415, "kl": 0.0012402908178046346, "learning_rate": 4.747379352713489e-06, "loss": 0.0075, "num_tokens": 484453.0, "reward": 0.4699999988079071, "reward_std": 0.6133514642715454, "rewards/reward_func/mean": 0.4699999988079071, "rewards/reward_func/std": 0.6133514046669006, "sampling/importance_sampling_ratio/max": 1.0991504192352295, "sampling/importance_sampling_ratio/mean": 0.9805469512939453, "sampling/importance_sampling_ratio/min": 0.7882593870162964, "sampling/sampling_logp_difference/max": 0.5523756742477417, "sampling/sampling_logp_difference/mean": 0.012993918731808662, "step": 171, "step_time": 32.51496501202928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.1859351545572281, "epoch": 0.172, "frac_reward_zero_std": 0.0, "grad_norm": 0.7170812487602234, "kl": 0.0026098366361111403, "learning_rate": 4.743820748839362e-06, "loss": 0.0413, "num_tokens": 487251.0, "reward": 0.22750000655651093, "reward_std": 0.5084863305091858, "rewards/reward_func/mean": 0.22750000655651093, "rewards/reward_func/std": 0.5084863305091858, "sampling/importance_sampling_ratio/max": 1.182456374168396, "sampling/importance_sampling_ratio/mean": 0.7572699785232544, "sampling/importance_sampling_ratio/min": 0.4331490099430084, "sampling/sampling_logp_difference/max": 0.4196054935455322, "sampling/sampling_logp_difference/mean": 0.015737760812044144, "step": 172, "step_time": 29.891263522033114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.17518748342990875, "epoch": 0.173, "frac_reward_zero_std": 0.0, "grad_norm": 0.9168515205383301, "kl": 0.0015706719132140279, "learning_rate": 4.740238608347337e-06, "loss": -0.0518, "num_tokens": 490109.0, "reward": 0.45250001549720764, "reward_std": 0.5872747302055359, "rewards/reward_func/mean": 0.45250001549720764, "rewards/reward_func/std": 0.5872748494148254, "sampling/importance_sampling_ratio/max": 1.3373171091079712, "sampling/importance_sampling_ratio/mean": 1.0171103477478027, "sampling/importance_sampling_ratio/min": 0.7078337669372559, "sampling/sampling_logp_difference/max": 0.2535383701324463, "sampling/sampling_logp_difference/mean": 0.013486744835972786, "step": 173, "step_time": 27.10193809797056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.18130265176296234, "epoch": 0.174, "frac_reward_zero_std": 0.0, "grad_norm": 0.8532293438911438, "kl": 0.0019429734675213695, "learning_rate": 4.736632968812374e-06, "loss": -0.0639, "num_tokens": 492848.0, "reward": 0.48000001907348633, "reward_std": 0.5950350165367126, "rewards/reward_func/mean": 0.48000001907348633, "rewards/reward_func/std": 0.5950350165367126, "sampling/importance_sampling_ratio/max": 1.0050729513168335, "sampling/importance_sampling_ratio/mean": 0.7542091608047485, "sampling/importance_sampling_ratio/min": 0.5756633281707764, "sampling/sampling_logp_difference/max": 0.3250858783721924, "sampling/sampling_logp_difference/mean": 0.017727231606841087, "step": 174, "step_time": 20.16466292599216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1878201961517334, "epoch": 0.175, "frac_reward_zero_std": 0.0, "grad_norm": 1.1454460620880127, "kl": 0.0018710215808823705, "learning_rate": 4.733003868055923e-06, "loss": 0.1514, "num_tokens": 496236.0, "reward": 0.21250000596046448, "reward_std": 0.5297405123710632, "rewards/reward_func/mean": 0.21250000596046448, "rewards/reward_func/std": 0.529740571975708, "sampling/importance_sampling_ratio/max": 1.8199222087860107, "sampling/importance_sampling_ratio/mean": 1.3411989212036133, "sampling/importance_sampling_ratio/min": 0.9902960658073425, "sampling/sampling_logp_difference/max": 0.44612765312194824, "sampling/sampling_logp_difference/mean": 0.016621703281998634, "step": 175, "step_time": 34.79833930899622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.16727329790592194, "epoch": 0.176, "frac_reward_zero_std": 0.0, "grad_norm": 5.747654438018799, "kl": 0.0012095842976123095, "learning_rate": 4.729351344145536e-06, "loss": -0.3064, "num_tokens": 499073.0, "reward": 0.21000000834465027, "reward_std": 0.5280783176422119, "rewards/reward_func/mean": 0.21000000834465027, "rewards/reward_func/std": 0.5280782580375671, "sampling/importance_sampling_ratio/max": 1.7933062314987183, "sampling/importance_sampling_ratio/mean": 1.2109875679016113, "sampling/importance_sampling_ratio/min": 0.9138539433479309, "sampling/sampling_logp_difference/max": 0.29658639430999756, "sampling/sampling_logp_difference/mean": 0.016849784180521965, "step": 176, "step_time": 30.884794701007195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.17437633872032166, "epoch": 0.177, "frac_reward_zero_std": 0.0, "grad_norm": 0.9134334325790405, "kl": 0.0027399002574384212, "learning_rate": 4.725675435394461e-06, "loss": -0.0815, "num_tokens": 501850.0, "reward": 0.9950000047683716, "reward_std": 0.005773497279733419, "rewards/reward_func/mean": 0.9950000047683716, "rewards/reward_func/std": 0.005773497279733419, "sampling/importance_sampling_ratio/max": 1.1949089765548706, "sampling/importance_sampling_ratio/mean": 0.9399723410606384, "sampling/importance_sampling_ratio/min": 0.557498574256897, "sampling/sampling_logp_difference/max": 0.40509772300720215, "sampling/sampling_logp_difference/mean": 0.018100522458553314, "step": 177, "step_time": 17.259996051958296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.17229580879211426, "epoch": 0.178, "frac_reward_zero_std": 0.0, "grad_norm": 0.6053732633590698, "kl": 0.0020448777358978987, "learning_rate": 4.721976180361239e-06, "loss": -0.0121, "num_tokens": 504517.0, "reward": 0.7325000166893005, "reward_std": 0.5150647163391113, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5150647163391113, "sampling/importance_sampling_ratio/max": 1.195574402809143, "sampling/importance_sampling_ratio/mean": 0.9067440032958984, "sampling/importance_sampling_ratio/min": 0.6278784871101379, "sampling/sampling_logp_difference/max": 0.3325514793395996, "sampling/sampling_logp_difference/mean": 0.016679037362337112, "step": 178, "step_time": 21.488752770994324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.19630061089992523, "epoch": 0.179, "frac_reward_zero_std": 0.0, "grad_norm": 0.6928678750991821, "kl": 0.005598170217126608, "learning_rate": 4.718253617849306e-06, "loss": 0.0082, "num_tokens": 507338.0, "reward": 0.48500001430511475, "reward_std": 0.5951750874519348, "rewards/reward_func/mean": 0.48500001430511475, "rewards/reward_func/std": 0.5951750874519348, "sampling/importance_sampling_ratio/max": 1.2143689393997192, "sampling/importance_sampling_ratio/mean": 0.7499825358390808, "sampling/importance_sampling_ratio/min": 0.3230113387107849, "sampling/sampling_logp_difference/max": 0.575941801071167, "sampling/sampling_logp_difference/mean": 0.02463117614388466, "step": 179, "step_time": 24.413800164998975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 62.0, "completions/mean_terminated_length": 62.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1679321527481079, "epoch": 0.18, "frac_reward_zero_std": 0.0, "grad_norm": 0.9597435593605042, "kl": 0.001463412307202816, "learning_rate": 4.7145077869065815e-06, "loss": -0.0811, "num_tokens": 510393.0, "reward": 0.4300000071525574, "reward_std": 0.6594442129135132, "rewards/reward_func/mean": 0.4300000071525574, "rewards/reward_func/std": 0.6594442129135132, "sampling/importance_sampling_ratio/max": 1.1826465129852295, "sampling/importance_sampling_ratio/mean": 1.024086594581604, "sampling/importance_sampling_ratio/min": 0.8835458159446716, "sampling/sampling_logp_difference/max": 0.27237796783447266, "sampling/sampling_logp_difference/mean": 0.011888536624610424, "step": 180, "step_time": 32.19210964202648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.16897247731685638, "epoch": 0.181, "frac_reward_zero_std": 0.0, "grad_norm": 1.964337706565857, "kl": 0.0011811513686552644, "learning_rate": 4.710738726825059e-06, "loss": -0.0191, "num_tokens": 513311.0, "reward": 0.7300000190734863, "reward_std": 0.49362605810165405, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.49362605810165405, "sampling/importance_sampling_ratio/max": 1.3036000728607178, "sampling/importance_sampling_ratio/mean": 1.0771503448486328, "sampling/importance_sampling_ratio/min": 0.7432346940040588, "sampling/sampling_logp_difference/max": 0.3772556781768799, "sampling/sampling_logp_difference/mean": 0.011708532460033894, "step": 181, "step_time": 23.40327527699992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.196078822016716, "epoch": 0.182, "frac_reward_zero_std": 0.0, "grad_norm": 1.496968388557434, "kl": 0.09362910687923431, "learning_rate": 4.706946477140396e-06, "loss": -0.019, "num_tokens": 515719.0, "reward": 0.4675000011920929, "reward_std": 0.6152167916297913, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.6152167916297913, "sampling/importance_sampling_ratio/max": 1.0825995206832886, "sampling/importance_sampling_ratio/mean": 0.8314149379730225, "sampling/importance_sampling_ratio/min": 0.6482289433479309, "sampling/sampling_logp_difference/max": 0.7126073837280273, "sampling/sampling_logp_difference/mean": 0.021470265462994576, "step": 182, "step_time": 18.59843238198664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.17555762827396393, "epoch": 0.183, "frac_reward_zero_std": 0.0, "grad_norm": 0.9557614922523499, "kl": 0.0021633566357195377, "learning_rate": 4.703131077631498e-06, "loss": -0.0824, "num_tokens": 518199.0, "reward": 0.24250000715255737, "reward_std": 0.5050659775733948, "rewards/reward_func/mean": 0.24250000715255737, "rewards/reward_func/std": 0.5050659775733948, "sampling/importance_sampling_ratio/max": 1.79241144657135, "sampling/importance_sampling_ratio/mean": 1.166793704032898, "sampling/importance_sampling_ratio/min": 0.7166504263877869, "sampling/sampling_logp_difference/max": 0.2971842288970947, "sampling/sampling_logp_difference/mean": 0.01348396111279726, "step": 183, "step_time": 24.774854800023604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1438012570142746, "epoch": 0.184, "frac_reward_zero_std": 0.0, "grad_norm": 0.816080629825592, "kl": 0.0035170961637049913, "learning_rate": 4.699292568320097e-06, "loss": 0.0692, "num_tokens": 521095.0, "reward": 0.22750000655651093, "reward_std": 0.5162283182144165, "rewards/reward_func/mean": 0.22750000655651093, "rewards/reward_func/std": 0.5162283182144165, "sampling/importance_sampling_ratio/max": 1.1711204051971436, "sampling/importance_sampling_ratio/mean": 0.7586665153503418, "sampling/importance_sampling_ratio/min": 0.5861088037490845, "sampling/sampling_logp_difference/max": 0.3250613212585449, "sampling/sampling_logp_difference/mean": 0.013953878544270992, "step": 184, "step_time": 31.118160058977082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 59.25, "completions/mean_terminated_length": 59.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.18122437596321106, "epoch": 0.185, "frac_reward_zero_std": 0.0, "grad_norm": 1.2473357915878296, "kl": 0.0016663582064211369, "learning_rate": 4.6954309894703435e-06, "loss": 0.0388, "num_tokens": 523472.0, "reward": 0.9900000095367432, "reward_std": 0.019999999552965164, "rewards/reward_func/mean": 0.9900000095367432, "rewards/reward_func/std": 0.02000001072883606, "sampling/importance_sampling_ratio/max": 1.3397892713546753, "sampling/importance_sampling_ratio/mean": 0.8994681239128113, "sampling/importance_sampling_ratio/min": 0.4870939552783966, "sampling/sampling_logp_difference/max": 0.3438701629638672, "sampling/sampling_logp_difference/mean": 0.018603259697556496, "step": 185, "step_time": 10.989855285966769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.20536096394062042, "epoch": 0.186, "frac_reward_zero_std": 0.0, "grad_norm": 1.1676692962646484, "kl": 0.003371760481968522, "learning_rate": 4.69154638158837e-06, "loss": -0.1568, "num_tokens": 525967.0, "reward": 0.7300000190734863, "reward_std": 0.5399999618530273, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5399999618530273, "sampling/importance_sampling_ratio/max": 1.2683820724487305, "sampling/importance_sampling_ratio/mean": 1.0525885820388794, "sampling/importance_sampling_ratio/min": 0.9126219749450684, "sampling/sampling_logp_difference/max": 0.5066227912902832, "sampling/sampling_logp_difference/mean": 0.01730816625058651, "step": 186, "step_time": 19.78294134902535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.17162413895130157, "epoch": 0.187, "frac_reward_zero_std": 0.0, "grad_norm": 1.2625066041946411, "kl": 0.006463764235377312, "learning_rate": 4.687638785421875e-06, "loss": 0.2805, "num_tokens": 529704.0, "reward": 0.24000000953674316, "reward_std": 0.5067543387413025, "rewards/reward_func/mean": 0.24000000953674316, "rewards/reward_func/std": 0.5067543983459473, "sampling/importance_sampling_ratio/max": 2.251146078109741, "sampling/importance_sampling_ratio/mean": 1.3049869537353516, "sampling/importance_sampling_ratio/min": 0.625739336013794, "sampling/sampling_logp_difference/max": 0.7086482048034668, "sampling/sampling_logp_difference/mean": 0.019078198820352554, "step": 187, "step_time": 37.20276992302388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.17984598875045776, "epoch": 0.188, "frac_reward_zero_std": 0.0, "grad_norm": 0.6802559494972229, "kl": 0.0016403241315856576, "learning_rate": 4.683708241959694e-06, "loss": 0.1015, "num_tokens": 532246.0, "reward": 0.7300000190734863, "reward_std": 0.5399999618530273, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5399999618530273, "sampling/importance_sampling_ratio/max": 0.9991980791091919, "sampling/importance_sampling_ratio/mean": 0.8855483531951904, "sampling/importance_sampling_ratio/min": 0.7635801434516907, "sampling/sampling_logp_difference/max": 0.30162954330444336, "sampling/sampling_logp_difference/mean": 0.01767743192613125, "step": 188, "step_time": 17.10081228200579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.16766847670078278, "epoch": 0.189, "frac_reward_zero_std": 0.0, "grad_norm": 1.084822654724121, "kl": 0.001347738434560597, "learning_rate": 4.679754792431368e-06, "loss": 0.1293, "num_tokens": 534877.0, "reward": 0.19749999046325684, "reward_std": 0.4950673580169678, "rewards/reward_func/mean": 0.19749999046325684, "rewards/reward_func/std": 0.4950673282146454, "sampling/importance_sampling_ratio/max": 1.9094799757003784, "sampling/importance_sampling_ratio/mean": 1.0937261581420898, "sampling/importance_sampling_ratio/min": 0.3938526511192322, "sampling/sampling_logp_difference/max": 0.42304253578186035, "sampling/sampling_logp_difference/mean": 0.015901612117886543, "step": 189, "step_time": 33.134304473001976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.20352166891098022, "epoch": 0.19, "frac_reward_zero_std": 0.0, "grad_norm": 1.6414018869400024, "kl": 0.008625084534287453, "learning_rate": 4.675778478306712e-06, "loss": 0.2948, "num_tokens": 537696.0, "reward": 0.7250000238418579, "reward_std": 0.550000011920929, "rewards/reward_func/mean": 0.7250000238418579, "rewards/reward_func/std": 0.550000011920929, "sampling/importance_sampling_ratio/max": 1.4431711435317993, "sampling/importance_sampling_ratio/mean": 0.9709050059318542, "sampling/importance_sampling_ratio/min": 0.7037810683250427, "sampling/sampling_logp_difference/max": 0.32624363899230957, "sampling/sampling_logp_difference/mean": 0.017840616405010223, "step": 190, "step_time": 21.020534721028525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.1626574993133545, "epoch": 0.191, "frac_reward_zero_std": 0.0, "grad_norm": 1.1139878034591675, "kl": 0.0032230776268988848, "learning_rate": 4.671779341295378e-06, "loss": -0.1818, "num_tokens": 540415.0, "reward": 0.4599999785423279, "reward_std": 0.6237520575523376, "rewards/reward_func/mean": 0.4599999785423279, "rewards/reward_func/std": 0.6237521171569824, "sampling/importance_sampling_ratio/max": 2.0000407695770264, "sampling/importance_sampling_ratio/mean": 1.1519076824188232, "sampling/importance_sampling_ratio/min": 0.740306556224823, "sampling/sampling_logp_difference/max": 0.5854895114898682, "sampling/sampling_logp_difference/mean": 0.017062250524759293, "step": 191, "step_time": 29.520008480001707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.16937848925590515, "epoch": 0.192, "frac_reward_zero_std": 0.0, "grad_norm": 0.7818939089775085, "kl": 0.002788364654406905, "learning_rate": 4.667757423346423e-06, "loss": -0.0714, "num_tokens": 542785.0, "reward": 0.7300000190734863, "reward_std": 0.5333542227745056, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5333541631698608, "sampling/importance_sampling_ratio/max": 1.0759447813034058, "sampling/importance_sampling_ratio/mean": 0.8206874132156372, "sampling/importance_sampling_ratio/min": 0.533421516418457, "sampling/sampling_logp_difference/max": 0.25322389602661133, "sampling/sampling_logp_difference/mean": 0.0136664267629385, "step": 192, "step_time": 19.6760783889913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.16708900034427643, "epoch": 0.193, "frac_reward_zero_std": 0.0, "grad_norm": 1.2296591997146606, "kl": 0.0013933817390352488, "learning_rate": 4.663712766647862e-06, "loss": 0.0745, "num_tokens": 546112.0, "reward": -0.01249999925494194, "reward_std": 0.00957427080720663, "rewards/reward_func/mean": -0.01249999925494194, "rewards/reward_func/std": 0.00957427080720663, "sampling/importance_sampling_ratio/max": 1.4404057264328003, "sampling/importance_sampling_ratio/mean": 0.9965447187423706, "sampling/importance_sampling_ratio/min": 0.587304413318634, "sampling/sampling_logp_difference/max": 0.49622249603271484, "sampling/sampling_logp_difference/mean": 0.018814144656062126, "step": 193, "step_time": 36.036414052010514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.16939495503902435, "epoch": 0.194, "frac_reward_zero_std": 0.0, "grad_norm": 1.744502305984497, "kl": 0.007838107645511627, "learning_rate": 4.65964541362623e-06, "loss": -0.153, "num_tokens": 549120.0, "reward": 0.4750000238418579, "reward_std": 0.6011378169059753, "rewards/reward_func/mean": 0.4750000238418579, "rewards/reward_func/std": 0.6011378169059753, "sampling/importance_sampling_ratio/max": 1.58823823928833, "sampling/importance_sampling_ratio/mean": 0.9351853132247925, "sampling/importance_sampling_ratio/min": 0.48805105686187744, "sampling/sampling_logp_difference/max": 0.513603687286377, "sampling/sampling_logp_difference/mean": 0.015015587210655212, "step": 194, "step_time": 25.002652621013112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.18351857364177704, "epoch": 0.195, "frac_reward_zero_std": 0.0, "grad_norm": 1.9603594541549683, "kl": 0.00325272255577147, "learning_rate": 4.655555406946135e-06, "loss": 0.0028, "num_tokens": 551625.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 1.3735154867172241, "sampling/importance_sampling_ratio/mean": 1.0631992816925049, "sampling/importance_sampling_ratio/min": 0.6597886681556702, "sampling/sampling_logp_difference/max": 0.2774249315261841, "sampling/sampling_logp_difference/mean": 0.01287419255822897, "step": 195, "step_time": 15.258369963034056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.19808019697666168, "epoch": 0.196, "frac_reward_zero_std": 0.0, "grad_norm": 1.173774003982544, "kl": 0.0024100602604448795, "learning_rate": 4.651442789509813e-06, "loss": -0.1324, "num_tokens": 554192.0, "reward": 0.4950000047683716, "reward_std": 0.5831809639930725, "rewards/reward_func/mean": 0.4950000047683716, "rewards/reward_func/std": 0.5831809043884277, "sampling/importance_sampling_ratio/max": 1.550442099571228, "sampling/importance_sampling_ratio/mean": 0.8905876874923706, "sampling/importance_sampling_ratio/min": 0.4226371943950653, "sampling/sampling_logp_difference/max": 0.25294041633605957, "sampling/sampling_logp_difference/mean": 0.019111832603812218, "step": 196, "step_time": 22.24210825096816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.22103701531887054, "epoch": 0.197, "frac_reward_zero_std": 0.0, "grad_norm": 0.8214077949523926, "kl": 0.006223814096301794, "learning_rate": 4.647307604456675e-06, "loss": 0.1934, "num_tokens": 557374.0, "reward": 0.2175000011920929, "reward_std": 0.509730339050293, "rewards/reward_func/mean": 0.2175000011920929, "rewards/reward_func/std": 0.5097303986549377, "sampling/importance_sampling_ratio/max": 1.0265285968780518, "sampling/importance_sampling_ratio/mean": 0.645683765411377, "sampling/importance_sampling_ratio/min": 0.3142782151699066, "sampling/sampling_logp_difference/max": 0.850468635559082, "sampling/sampling_logp_difference/mean": 0.022503092885017395, "step": 197, "step_time": 41.81870661897119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.18976393342018127, "epoch": 0.198, "frac_reward_zero_std": 0.0, "grad_norm": 1.5295933485031128, "kl": 0.023059934377670288, "learning_rate": 4.643149895162854e-06, "loss": 0.0298, "num_tokens": 559746.0, "reward": 0.19249999523162842, "reward_std": 0.5404550433158875, "rewards/reward_func/mean": 0.19249999523162842, "rewards/reward_func/std": 0.5404551029205322, "sampling/importance_sampling_ratio/max": 1.088395595550537, "sampling/importance_sampling_ratio/mean": 0.958138108253479, "sampling/importance_sampling_ratio/min": 0.790735125541687, "sampling/sampling_logp_difference/max": 0.25106096267700195, "sampling/sampling_logp_difference/mean": 0.010837765410542488, "step": 198, "step_time": 30.152644325979054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.18738023936748505, "epoch": 0.199, "frac_reward_zero_std": 0.0, "grad_norm": 0.9161056876182556, "kl": 0.0014563420554623008, "learning_rate": 4.6389697052407535e-06, "loss": -0.3027, "num_tokens": 562476.0, "reward": 0.7174999713897705, "reward_std": 0.5583532452583313, "rewards/reward_func/mean": 0.7174999713897705, "rewards/reward_func/std": 0.5583532452583313, "sampling/importance_sampling_ratio/max": 1.5502657890319824, "sampling/importance_sampling_ratio/mean": 1.200308084487915, "sampling/importance_sampling_ratio/min": 0.6852198839187622, "sampling/sampling_logp_difference/max": 0.29951971769332886, "sampling/sampling_logp_difference/mean": 0.014930280856788158, "step": 199, "step_time": 22.09022646897938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.16149453818798065, "epoch": 0.2, "frac_reward_zero_std": 0.0, "grad_norm": 1.4768141508102417, "kl": 0.006409245543181896, "learning_rate": 4.634767078538589e-06, "loss": 0.3176, "num_tokens": 565542.0, "reward": 0.7450000047683716, "reward_std": 0.5099999904632568, "rewards/reward_func/mean": 0.7450000047683716, "rewards/reward_func/std": 0.5099999904632568, "sampling/importance_sampling_ratio/max": 1.8615809679031372, "sampling/importance_sampling_ratio/mean": 1.1814024448394775, "sampling/importance_sampling_ratio/min": 0.5698959231376648, "sampling/sampling_logp_difference/max": 0.735806941986084, "sampling/sampling_logp_difference/mean": 0.01477879099547863, "step": 200, "step_time": 21.855877375986893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.18661177158355713, "epoch": 0.201, "frac_reward_zero_std": 0.0, "grad_norm": 1.5430704355239868, "kl": 0.003819567384198308, "learning_rate": 4.630542059139923e-06, "loss": 0.0171, "num_tokens": 568483.0, "reward": 0.4975000023841858, "reward_std": 0.5802513957023621, "rewards/reward_func/mean": 0.4975000023841858, "rewards/reward_func/std": 0.5802513957023621, "sampling/importance_sampling_ratio/max": 1.7871607542037964, "sampling/importance_sampling_ratio/mean": 1.1847898960113525, "sampling/importance_sampling_ratio/min": 0.6825270056724548, "sampling/sampling_logp_difference/max": 0.5733441114425659, "sampling/sampling_logp_difference/mean": 0.02175774984061718, "step": 201, "step_time": 24.261658603034448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.19293345510959625, "epoch": 0.202, "frac_reward_zero_std": 0.0, "grad_norm": 1.0841349363327026, "kl": 0.002205593977123499, "learning_rate": 4.626294691363213e-06, "loss": -0.1181, "num_tokens": 571447.0, "reward": 0.23749999701976776, "reward_std": 0.5088794827461243, "rewards/reward_func/mean": 0.23749999701976776, "rewards/reward_func/std": 0.5088794827461243, "sampling/importance_sampling_ratio/max": 1.65753173828125, "sampling/importance_sampling_ratio/mean": 0.9754144549369812, "sampling/importance_sampling_ratio/min": 0.5637417435646057, "sampling/sampling_logp_difference/max": 0.43741893768310547, "sampling/sampling_logp_difference/mean": 0.01739703305065632, "step": 202, "step_time": 26.847976484976243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.1924976110458374, "epoch": 0.203, "frac_reward_zero_std": 0.0, "grad_norm": 0.6357241272926331, "kl": 0.00545016024261713, "learning_rate": 4.622025019761336e-06, "loss": 0.0671, "num_tokens": 574574.0, "reward": 0.7425000071525574, "reward_std": 0.4952020049095154, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.495201975107193, "sampling/importance_sampling_ratio/max": 1.2182741165161133, "sampling/importance_sampling_ratio/mean": 0.6256334185600281, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6747913360595703, "sampling/sampling_logp_difference/mean": 0.019777601584792137, "step": 203, "step_time": 25.412911874009296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1870274841785431, "epoch": 0.204, "frac_reward_zero_std": 0.0, "grad_norm": 0.6026756763458252, "kl": 0.002690628869459033, "learning_rate": 4.617733089121127e-06, "loss": 0.0069, "num_tokens": 577251.0, "reward": -0.05250000208616257, "reward_std": 0.0684957504272461, "rewards/reward_func/mean": -0.05250000208616257, "rewards/reward_func/std": 0.0684957504272461, "sampling/importance_sampling_ratio/max": 0.8026514053344727, "sampling/importance_sampling_ratio/mean": 0.6897410154342651, "sampling/importance_sampling_ratio/min": 0.591428279876709, "sampling/sampling_logp_difference/max": 0.3059091567993164, "sampling/sampling_logp_difference/mean": 0.012346535921096802, "step": 204, "step_time": 35.360861964989454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.17740879952907562, "epoch": 0.205, "frac_reward_zero_std": 0.0, "grad_norm": 1.7944884300231934, "kl": 0.0068826377391815186, "learning_rate": 4.613418944462907e-06, "loss": -0.0543, "num_tokens": 580021.0, "reward": 0.7450000047683716, "reward_std": 0.4966890513896942, "rewards/reward_func/mean": 0.7450000047683716, "rewards/reward_func/std": 0.4966890215873718, "sampling/importance_sampling_ratio/max": 0.9970582723617554, "sampling/importance_sampling_ratio/mean": 0.7438902854919434, "sampling/importance_sampling_ratio/min": 0.34753066301345825, "sampling/sampling_logp_difference/max": 0.42323732376098633, "sampling/sampling_logp_difference/mean": 0.01754819042980671, "step": 205, "step_time": 24.926524802984204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.20659232139587402, "epoch": 0.206, "frac_reward_zero_std": 0.0, "grad_norm": 0.812511146068573, "kl": 0.005072076339274645, "learning_rate": 4.609082631040012e-06, "loss": -0.0632, "num_tokens": 582640.0, "reward": 0.7450000047683716, "reward_std": 0.5099999904632568, "rewards/reward_func/mean": 0.7450000047683716, "rewards/reward_func/std": 0.5099999904632568, "sampling/importance_sampling_ratio/max": 1.2154662609100342, "sampling/importance_sampling_ratio/mean": 0.7934609651565552, "sampling/importance_sampling_ratio/min": 0.5974061489105225, "sampling/sampling_logp_difference/max": 0.574169397354126, "sampling/sampling_logp_difference/mean": 0.020004652440547943, "step": 206, "step_time": 16.393548396008555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.17995257675647736, "epoch": 0.207, "frac_reward_zero_std": 0.0, "grad_norm": 1.166077971458435, "kl": 0.0034289222676306963, "learning_rate": 4.604724194338318e-06, "loss": -0.3517, "num_tokens": 585357.0, "reward": 0.7150000333786011, "reward_std": 0.5566866397857666, "rewards/reward_func/mean": 0.7150000333786011, "rewards/reward_func/std": 0.5566866397857666, "sampling/importance_sampling_ratio/max": 2.170780658721924, "sampling/importance_sampling_ratio/mean": 1.2304482460021973, "sampling/importance_sampling_ratio/min": 0.5083069205284119, "sampling/sampling_logp_difference/max": 0.3541746139526367, "sampling/sampling_logp_difference/mean": 0.01570459082722664, "step": 207, "step_time": 24.92964396701427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.1535055786371231, "epoch": 0.208, "frac_reward_zero_std": 0.0, "grad_norm": 2.279757499694824, "kl": 0.00849776342511177, "learning_rate": 4.600343680075764e-06, "loss": 0.2394, "num_tokens": 588493.0, "reward": 0.49000000953674316, "reward_std": 0.5831523537635803, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5831523537635803, "sampling/importance_sampling_ratio/max": 1.8564841747283936, "sampling/importance_sampling_ratio/mean": 1.1894092559814453, "sampling/importance_sampling_ratio/min": 0.8823007941246033, "sampling/sampling_logp_difference/max": 0.4677090644836426, "sampling/sampling_logp_difference/mean": 0.016920341178774834, "step": 208, "step_time": 23.995852635009214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.16516010463237762, "epoch": 0.209, "frac_reward_zero_std": 0.0, "grad_norm": 0.7805745601654053, "kl": 0.0012875512475147843, "learning_rate": 4.5959411342018715e-06, "loss": -0.0038, "num_tokens": 590939.0, "reward": 0.7274999618530273, "reward_std": 0.518676221370697, "rewards/reward_func/mean": 0.7274999618530273, "rewards/reward_func/std": 0.518676221370697, "sampling/importance_sampling_ratio/max": 0.8482518196105957, "sampling/importance_sampling_ratio/mean": 0.8030445575714111, "sampling/importance_sampling_ratio/min": 0.7333587408065796, "sampling/sampling_logp_difference/max": 0.2263803482055664, "sampling/sampling_logp_difference/mean": 0.010130737908184528, "step": 209, "step_time": 22.50380123400828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.172202005982399, "epoch": 0.21, "frac_reward_zero_std": 0.0, "grad_norm": 0.9613502025604248, "kl": 0.01898016221821308, "learning_rate": 4.591516602897263e-06, "loss": -0.1782, "num_tokens": 594189.0, "reward": 0.49000000953674316, "reward_std": 0.5888972282409668, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5888972282409668, "sampling/importance_sampling_ratio/max": 1.467721700668335, "sampling/importance_sampling_ratio/mean": 0.9098449349403381, "sampling/importance_sampling_ratio/min": 0.5286229252815247, "sampling/sampling_logp_difference/max": 0.4542117118835449, "sampling/sampling_logp_difference/mean": 0.020286964252591133, "step": 210, "step_time": 26.957335913029965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.21753226220607758, "epoch": 0.211, "frac_reward_zero_std": 0.0, "grad_norm": 1.086082935333252, "kl": 0.00403759116306901, "learning_rate": 4.587070132573178e-06, "loss": 0.4218, "num_tokens": 597323.0, "reward": 0.22749999165534973, "reward_std": 0.48182812333106995, "rewards/reward_func/mean": 0.22749999165534973, "rewards/reward_func/std": 0.48182812333106995, "sampling/importance_sampling_ratio/max": 2.053893566131592, "sampling/importance_sampling_ratio/mean": 1.1513149738311768, "sampling/importance_sampling_ratio/min": 0.4566267430782318, "sampling/sampling_logp_difference/max": 0.22193604707717896, "sampling/sampling_logp_difference/mean": 0.01675686240196228, "step": 211, "step_time": 32.167313658981584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.20602165162563324, "epoch": 0.212, "frac_reward_zero_std": 0.0, "grad_norm": 1.4180781841278076, "kl": 0.0050665331073105335, "learning_rate": 4.582601769870988e-06, "loss": 0.2205, "num_tokens": 599812.0, "reward": 0.45499998331069946, "reward_std": 0.6293118000030518, "rewards/reward_func/mean": 0.45499998331069946, "rewards/reward_func/std": 0.6293118000030518, "sampling/importance_sampling_ratio/max": 1.8629767894744873, "sampling/importance_sampling_ratio/mean": 0.9542645215988159, "sampling/importance_sampling_ratio/min": 0.39431333541870117, "sampling/sampling_logp_difference/max": 0.47431087493896484, "sampling/sampling_logp_difference/mean": 0.01967659778892994, "step": 212, "step_time": 22.65969598700758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 61.5, "completions/mean_terminated_length": 61.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.21304994821548462, "epoch": 0.213, "frac_reward_zero_std": 0.0, "grad_norm": 1.0070796012878418, "kl": 0.0032520401291549206, "learning_rate": 4.578111561661702e-06, "loss": 0.1768, "num_tokens": 602481.0, "reward": 0.4625000059604645, "reward_std": 0.6236652135848999, "rewards/reward_func/mean": 0.4625000059604645, "rewards/reward_func/std": 0.6236652135848999, "sampling/importance_sampling_ratio/max": 1.1486892700195312, "sampling/importance_sampling_ratio/mean": 0.9541930556297302, "sampling/importance_sampling_ratio/min": 0.8212027549743652, "sampling/sampling_logp_difference/max": 0.44177913665771484, "sampling/sampling_logp_difference/mean": 0.01883576065301895, "step": 213, "step_time": 25.508753468981013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.15160547196865082, "epoch": 0.214, "frac_reward_zero_std": 0.0, "grad_norm": 0.8835320472717285, "kl": 0.004682162310928106, "learning_rate": 4.57359955504548e-06, "loss": 0.1544, "num_tokens": 605362.0, "reward": 0.20499999821186066, "reward_std": 0.530440092086792, "rewards/reward_func/mean": 0.20499999821186066, "rewards/reward_func/std": 0.530440092086792, "sampling/importance_sampling_ratio/max": 1.624040126800537, "sampling/importance_sampling_ratio/mean": 0.956684947013855, "sampling/importance_sampling_ratio/min": 0.652014970779419, "sampling/sampling_logp_difference/max": 0.4743063449859619, "sampling/sampling_logp_difference/mean": 0.020044377073645592, "step": 214, "step_time": 33.32595620397478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.14977309107780457, "epoch": 0.215, "frac_reward_zero_std": 0.0, "grad_norm": 1.01729154586792, "kl": 0.0017707758815959096, "learning_rate": 4.569065797351135e-06, "loss": -0.0411, "num_tokens": 608406.0, "reward": 0.7300000190734863, "reward_std": 0.5333542227745056, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5333541631698608, "sampling/importance_sampling_ratio/max": 1.2385690212249756, "sampling/importance_sampling_ratio/mean": 0.9292113780975342, "sampling/importance_sampling_ratio/min": 0.7877894639968872, "sampling/sampling_logp_difference/max": 0.32393813133239746, "sampling/sampling_logp_difference/mean": 0.01315419003367424, "step": 215, "step_time": 23.77990508201765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.1729770451784134, "epoch": 0.216, "frac_reward_zero_std": 0.0, "grad_norm": 1.8886969089508057, "kl": 0.0054801651276648045, "learning_rate": 4.564510336135642e-06, "loss": 0.019, "num_tokens": 611498.0, "reward": 0.4775000214576721, "reward_std": 0.5984632968902588, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.5984632968902588, "sampling/importance_sampling_ratio/max": 2.3270490169525146, "sampling/importance_sampling_ratio/mean": 1.3164610862731934, "sampling/importance_sampling_ratio/min": 0.5813789367675781, "sampling/sampling_logp_difference/max": 0.6861288547515869, "sampling/sampling_logp_difference/mean": 0.02149096690118313, "step": 216, "step_time": 25.423955380974803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.17080888152122498, "epoch": 0.217, "frac_reward_zero_std": 0.0, "grad_norm": 2.486280918121338, "kl": 0.013949483633041382, "learning_rate": 4.559933219183631e-06, "loss": -0.063, "num_tokens": 614423.0, "reward": 0.23749999701976776, "reward_std": 0.508486270904541, "rewards/reward_func/mean": 0.23749999701976776, "rewards/reward_func/std": 0.5084863305091858, "sampling/importance_sampling_ratio/max": 1.5361467599868774, "sampling/importance_sampling_ratio/mean": 1.0354121923446655, "sampling/importance_sampling_ratio/min": 0.6014396548271179, "sampling/sampling_logp_difference/max": 0.5321464538574219, "sampling/sampling_logp_difference/mean": 0.01993136666715145, "step": 217, "step_time": 27.66601578099653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.14733242988586426, "epoch": 0.218, "frac_reward_zero_std": 0.0, "grad_norm": 0.9290664196014404, "kl": 0.0025747239124029875, "learning_rate": 4.555334494506895e-06, "loss": 0.4129, "num_tokens": 617128.0, "reward": 0.4699999988079071, "reward_std": 0.6013873219490051, "rewards/reward_func/mean": 0.4699999988079071, "rewards/reward_func/std": 0.6013872623443604, "sampling/importance_sampling_ratio/max": 1.9591519832611084, "sampling/importance_sampling_ratio/mean": 0.9865109920501709, "sampling/importance_sampling_ratio/min": 0.3235395550727844, "sampling/sampling_logp_difference/max": 0.9777097702026367, "sampling/sampling_logp_difference/mean": 0.01940268650650978, "step": 218, "step_time": 23.89865228300914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.17947235703468323, "epoch": 0.219, "frac_reward_zero_std": 0.0, "grad_norm": 0.7968807220458984, "kl": 0.0034287527669221163, "learning_rate": 4.550714210343879e-06, "loss": 0.1055, "num_tokens": 620218.0, "reward": 0.47999998927116394, "reward_std": 0.5948108434677124, "rewards/reward_func/mean": 0.47999998927116394, "rewards/reward_func/std": 0.5948109030723572, "sampling/importance_sampling_ratio/max": 1.2088016271591187, "sampling/importance_sampling_ratio/mean": 0.8323151469230652, "sampling/importance_sampling_ratio/min": 0.6398744583129883, "sampling/sampling_logp_difference/max": 0.286823034286499, "sampling/sampling_logp_difference/mean": 0.014719543978571892, "step": 219, "step_time": 23.61854365700856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1820184886455536, "epoch": 0.22, "frac_reward_zero_std": 0.0, "grad_norm": 2.801039695739746, "kl": 0.00774419866502285, "learning_rate": 4.546072415159179e-06, "loss": 0.1586, "num_tokens": 623237.0, "reward": 0.7200000286102295, "reward_std": 0.5076087713241577, "rewards/reward_func/mean": 0.7200000286102295, "rewards/reward_func/std": 0.5076087713241577, "sampling/importance_sampling_ratio/max": 2.0452206134796143, "sampling/importance_sampling_ratio/mean": 1.2801021337509155, "sampling/importance_sampling_ratio/min": 0.7474051713943481, "sampling/sampling_logp_difference/max": 0.3251032829284668, "sampling/sampling_logp_difference/mean": 0.020618204027414322, "step": 220, "step_time": 24.502415616007056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.1628129929304123, "epoch": 0.221, "frac_reward_zero_std": 0.0, "grad_norm": 1.1559869050979614, "kl": 0.0016305913450196385, "learning_rate": 4.541409157643027e-06, "loss": 0.0374, "num_tokens": 625738.0, "reward": 0.4775000214576721, "reward_std": 0.6040074825286865, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.6040074825286865, "sampling/importance_sampling_ratio/max": 1.414289951324463, "sampling/importance_sampling_ratio/mean": 1.120866298675537, "sampling/importance_sampling_ratio/min": 0.8656407594680786, "sampling/sampling_logp_difference/max": 0.33409690856933594, "sampling/sampling_logp_difference/mean": 0.013469605706632137, "step": 221, "step_time": 18.414888665021863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.15178748965263367, "epoch": 0.222, "frac_reward_zero_std": 0.0, "grad_norm": 1.117395043373108, "kl": 0.0029174734372645617, "learning_rate": 4.5367244867107905e-06, "loss": 0.0867, "num_tokens": 628482.0, "reward": 0.7300000190734863, "reward_std": 0.5399999618530273, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5399999618530273, "sampling/importance_sampling_ratio/max": 2.355330467224121, "sampling/importance_sampling_ratio/mean": 1.2426996231079102, "sampling/importance_sampling_ratio/min": 0.39689332246780396, "sampling/sampling_logp_difference/max": 0.5048971176147461, "sampling/sampling_logp_difference/mean": 0.014619938097894192, "step": 222, "step_time": 19.629085155960638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.21440953016281128, "epoch": 0.223, "frac_reward_zero_std": 0.0, "grad_norm": 1.0970300436019897, "kl": 0.002301981672644615, "learning_rate": 4.53201845150245e-06, "loss": -0.045, "num_tokens": 631167.0, "reward": -0.04749999940395355, "reward_std": 0.042720019817352295, "rewards/reward_func/mean": -0.04749999940395355, "rewards/reward_func/std": 0.042720019817352295, "sampling/importance_sampling_ratio/max": 1.2772808074951172, "sampling/importance_sampling_ratio/mean": 0.9095630049705505, "sampling/importance_sampling_ratio/min": 0.4679076075553894, "sampling/sampling_logp_difference/max": 0.3412916660308838, "sampling/sampling_logp_difference/mean": 0.019764376804232597, "step": 223, "step_time": 31.221598003990948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.17501311004161835, "epoch": 0.224, "frac_reward_zero_std": 0.0, "grad_norm": 0.9713287949562073, "kl": 0.0035262643359601498, "learning_rate": 4.527291101382088e-06, "loss": 0.1099, "num_tokens": 633819.0, "reward": 0.44749999046325684, "reward_std": 0.6364681124687195, "rewards/reward_func/mean": 0.44749999046325684, "rewards/reward_func/std": 0.6364681720733643, "sampling/importance_sampling_ratio/max": 0.9688714146614075, "sampling/importance_sampling_ratio/mean": 0.8728395700454712, "sampling/importance_sampling_ratio/min": 0.606150209903717, "sampling/sampling_logp_difference/max": 0.3571023941040039, "sampling/sampling_logp_difference/mean": 0.015612028539180756, "step": 224, "step_time": 25.103644379007164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.16089242696762085, "epoch": 0.225, "frac_reward_zero_std": 0.0, "grad_norm": 2.1169357299804688, "kl": 0.003555483417585492, "learning_rate": 4.522542485937369e-06, "loss": -0.1462, "num_tokens": 636742.0, "reward": 0.20250000059604645, "reward_std": 0.5330650806427002, "rewards/reward_func/mean": 0.20250000059604645, "rewards/reward_func/std": 0.5330650806427002, "sampling/importance_sampling_ratio/max": 1.6218316555023193, "sampling/importance_sampling_ratio/mean": 1.142683506011963, "sampling/importance_sampling_ratio/min": 0.7163317799568176, "sampling/sampling_logp_difference/max": 0.358301043510437, "sampling/sampling_logp_difference/mean": 0.01212917361408472, "step": 225, "step_time": 28.86196574702626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.1688849776983261, "epoch": 0.226, "frac_reward_zero_std": 0.0, "grad_norm": 1.5231505632400513, "kl": 0.0049278815276920795, "learning_rate": 4.517772654979024e-06, "loss": -0.378, "num_tokens": 639824.0, "reward": 0.2199999988079071, "reward_std": 0.5147815346717834, "rewards/reward_func/mean": 0.2199999988079071, "rewards/reward_func/std": 0.5147815346717834, "sampling/importance_sampling_ratio/max": 1.800282597541809, "sampling/importance_sampling_ratio/mean": 1.1005094051361084, "sampling/importance_sampling_ratio/min": 0.6027688980102539, "sampling/sampling_logp_difference/max": 0.30415111780166626, "sampling/sampling_logp_difference/mean": 0.016980208456516266, "step": 226, "step_time": 33.62905010796385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.1840108186006546, "epoch": 0.227, "frac_reward_zero_std": 0.0, "grad_norm": 1.5238986015319824, "kl": 0.0016073728911578655, "learning_rate": 4.512981658540321e-06, "loss": 0.0929, "num_tokens": 642189.0, "reward": 0.7325000166893005, "reward_std": 0.5350000262260437, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5349999666213989, "sampling/importance_sampling_ratio/max": 1.1417913436889648, "sampling/importance_sampling_ratio/mean": 0.8770938515663147, "sampling/importance_sampling_ratio/min": 0.33509719371795654, "sampling/sampling_logp_difference/max": 0.37438416481018066, "sampling/sampling_logp_difference/mean": 0.012336324900388718, "step": 227, "step_time": 12.747543913021218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 60.0, "completions/mean_terminated_length": 60.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.20544865727424622, "epoch": 0.228, "frac_reward_zero_std": 0.0, "grad_norm": 0.8147231936454773, "kl": 0.0060724313370883465, "learning_rate": 4.508169546876547e-06, "loss": 0.4258, "num_tokens": 644975.0, "reward": 0.49000000953674316, "reward_std": 0.5891236066818237, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5891236066818237, "sampling/importance_sampling_ratio/max": 1.1660664081573486, "sampling/importance_sampling_ratio/mean": 0.7710179090499878, "sampling/importance_sampling_ratio/min": 0.15357238054275513, "sampling/sampling_logp_difference/max": 0.6296253204345703, "sampling/sampling_logp_difference/mean": 0.02125202864408493, "step": 228, "step_time": 22.53876115300227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.15922018885612488, "epoch": 0.229, "frac_reward_zero_std": 0.0, "grad_norm": 1.0769132375717163, "kl": 0.004166284576058388, "learning_rate": 4.503336370464476e-06, "loss": 0.2407, "num_tokens": 648145.0, "reward": 0.7325000166893005, "reward_std": 0.5349999666213989, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5349999666213989, "sampling/importance_sampling_ratio/max": 1.227849006652832, "sampling/importance_sampling_ratio/mean": 0.8435319066047668, "sampling/importance_sampling_ratio/min": 0.6474818587303162, "sampling/sampling_logp_difference/max": 0.368394136428833, "sampling/sampling_logp_difference/mean": 0.01712263748049736, "step": 229, "step_time": 30.984656499989796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.14839787781238556, "epoch": 0.23, "frac_reward_zero_std": 0.0, "grad_norm": 0.9229027032852173, "kl": 0.0030242730863392353, "learning_rate": 4.49848218000184e-06, "loss": -0.1217, "num_tokens": 651425.0, "reward": 0.4650000035762787, "reward_std": 0.6126717329025269, "rewards/reward_func/mean": 0.4650000035762787, "rewards/reward_func/std": 0.6126717329025269, "sampling/importance_sampling_ratio/max": 1.0468376874923706, "sampling/importance_sampling_ratio/mean": 0.7879884839057922, "sampling/importance_sampling_ratio/min": 0.4326409697532654, "sampling/sampling_logp_difference/max": 0.47689372301101685, "sampling/sampling_logp_difference/mean": 0.015902047976851463, "step": 230, "step_time": 34.554520089994185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.17190957069396973, "epoch": 0.231, "frac_reward_zero_std": 0.0, "grad_norm": 0.692036509513855, "kl": 0.0019588011782616377, "learning_rate": 4.493607026406802e-06, "loss": 0.0892, "num_tokens": 653833.0, "reward": 0.45750001072883606, "reward_std": 0.6280326843261719, "rewards/reward_func/mean": 0.45750001072883606, "rewards/reward_func/std": 0.6280326843261719, "sampling/importance_sampling_ratio/max": 1.2550852298736572, "sampling/importance_sampling_ratio/mean": 0.7677662372589111, "sampling/importance_sampling_ratio/min": 0.4966028928756714, "sampling/sampling_logp_difference/max": 0.4629930853843689, "sampling/sampling_logp_difference/mean": 0.015202355571091175, "step": 231, "step_time": 22.114998120989185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.21091072261333466, "epoch": 0.232, "frac_reward_zero_std": 0.0, "grad_norm": 1.5379953384399414, "kl": 0.008772267960011959, "learning_rate": 4.488710960817416e-06, "loss": -0.1515, "num_tokens": 656498.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 1.8928613662719727, "sampling/importance_sampling_ratio/mean": 1.324841856956482, "sampling/importance_sampling_ratio/min": 0.7693072557449341, "sampling/sampling_logp_difference/max": 0.47295308113098145, "sampling/sampling_logp_difference/mean": 0.018158717080950737, "step": 232, "step_time": 19.566407561011147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.2370857149362564, "epoch": 0.233, "frac_reward_zero_std": 0.0, "grad_norm": 1.0172522068023682, "kl": 0.006720225792378187, "learning_rate": 4.483794034591092e-06, "loss": 0.2642, "num_tokens": 659291.0, "reward": 0.20250000059604645, "reward_std": 0.5361825227737427, "rewards/reward_func/mean": 0.20250000059604645, "rewards/reward_func/std": 0.5361825227737427, "sampling/importance_sampling_ratio/max": 1.9427385330200195, "sampling/importance_sampling_ratio/mean": 0.9721396565437317, "sampling/importance_sampling_ratio/min": 0.46251410245895386, "sampling/sampling_logp_difference/max": 0.6873619556427002, "sampling/sampling_logp_difference/mean": 0.027657004073262215, "step": 233, "step_time": 27.474261264957022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.19535812735557556, "epoch": 0.234, "frac_reward_zero_std": 0.0, "grad_norm": 1.2024794816970825, "kl": 0.0037248514126986265, "learning_rate": 4.4788562993040615e-06, "loss": 0.064, "num_tokens": 662091.0, "reward": 0.7350000143051147, "reward_std": 0.5036864280700684, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.5036864280700684, "sampling/importance_sampling_ratio/max": 1.1399562358856201, "sampling/importance_sampling_ratio/mean": 0.8556986451148987, "sampling/importance_sampling_ratio/min": 0.5738300085067749, "sampling/sampling_logp_difference/max": 0.5115323066711426, "sampling/sampling_logp_difference/mean": 0.018068833276629448, "step": 234, "step_time": 21.070288820948917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.16162559390068054, "epoch": 0.235, "frac_reward_zero_std": 0.0, "grad_norm": 0.7312474846839905, "kl": 0.0031602142844349146, "learning_rate": 4.473897806750829e-06, "loss": 0.18, "num_tokens": 664729.0, "reward": 0.7225000262260437, "reward_std": 0.5094032287597656, "rewards/reward_func/mean": 0.7225000262260437, "rewards/reward_func/std": 0.5094032883644104, "sampling/importance_sampling_ratio/max": 0.983009934425354, "sampling/importance_sampling_ratio/mean": 0.6252938508987427, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6728285551071167, "sampling/sampling_logp_difference/mean": 0.014609193429350853, "step": 235, "step_time": 26.038421535980888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1573459506034851, "epoch": 0.236, "frac_reward_zero_std": 0.0, "grad_norm": 0.9555028676986694, "kl": 0.005580625031143427, "learning_rate": 4.4689186089436365e-06, "loss": -0.1231, "num_tokens": 667303.0, "reward": 0.44999998807907104, "reward_std": 0.6299735307693481, "rewards/reward_func/mean": 0.44999998807907104, "rewards/reward_func/std": 0.6299735307693481, "sampling/importance_sampling_ratio/max": 1.1240020990371704, "sampling/importance_sampling_ratio/mean": 0.831385612487793, "sampling/importance_sampling_ratio/min": 0.5903722643852234, "sampling/sampling_logp_difference/max": 0.5004134178161621, "sampling/sampling_logp_difference/mean": 0.01297784224152565, "step": 236, "step_time": 22.75848210102413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.1554155796766281, "epoch": 0.237, "frac_reward_zero_std": 0.0, "grad_norm": 1.3145447969436646, "kl": 0.0049837431870400906, "learning_rate": 4.463918758111912e-06, "loss": -0.2598, "num_tokens": 670723.0, "reward": -0.019999999552965164, "reward_std": 0.027080126106739044, "rewards/reward_func/mean": -0.019999999552965164, "rewards/reward_func/std": 0.027080127969384193, "sampling/importance_sampling_ratio/max": 2.3555781841278076, "sampling/importance_sampling_ratio/mean": 1.6146156787872314, "sampling/importance_sampling_ratio/min": 0.9639477133750916, "sampling/sampling_logp_difference/max": 0.558695912361145, "sampling/sampling_logp_difference/mean": 0.01871698722243309, "step": 237, "step_time": 38.98968921700725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.1660567969083786, "epoch": 0.238, "frac_reward_zero_std": 0.0, "grad_norm": 1.6980713605880737, "kl": 0.0034061181358993053, "learning_rate": 4.4588983067017255e-06, "loss": -0.0333, "num_tokens": 673536.0, "reward": 0.45749998092651367, "reward_std": 0.6270765066146851, "rewards/reward_func/mean": 0.45749998092651367, "rewards/reward_func/std": 0.6270765662193298, "sampling/importance_sampling_ratio/max": 1.5242257118225098, "sampling/importance_sampling_ratio/mean": 1.0824337005615234, "sampling/importance_sampling_ratio/min": 0.7657027840614319, "sampling/sampling_logp_difference/max": 0.345442533493042, "sampling/sampling_logp_difference/mean": 0.01617368683218956, "step": 238, "step_time": 24.416691513033584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.16248099505901337, "epoch": 0.239, "frac_reward_zero_std": 0.0, "grad_norm": 0.7217922806739807, "kl": 0.0029372740536928177, "learning_rate": 4.4538573073752365e-06, "loss": 0.3969, "num_tokens": 675895.0, "reward": 0.09999999403953552, "reward_std": 0.5246586203575134, "rewards/reward_func/mean": 0.09999999403953552, "rewards/reward_func/std": 0.5246586203575134, "sampling/importance_sampling_ratio/max": 1.893911600112915, "sampling/importance_sampling_ratio/mean": 1.1577680110931396, "sampling/importance_sampling_ratio/min": 0.3794650435447693, "sampling/sampling_logp_difference/max": 0.7814117670059204, "sampling/sampling_logp_difference/mean": 0.016155004501342773, "step": 239, "step_time": 35.26459780102596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.20193207263946533, "epoch": 0.24, "frac_reward_zero_std": 0.0, "grad_norm": 2.1561241149902344, "kl": 0.0028088088147342205, "learning_rate": 4.448795813010142e-06, "loss": -0.0842, "num_tokens": 678756.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 1.5845848321914673, "sampling/importance_sampling_ratio/mean": 1.233544945716858, "sampling/importance_sampling_ratio/min": 0.6882336139678955, "sampling/sampling_logp_difference/max": 0.42262864112854004, "sampling/sampling_logp_difference/mean": 0.02130243554711342, "step": 240, "step_time": 21.420518664992414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.17727532982826233, "epoch": 0.241, "frac_reward_zero_std": 0.0, "grad_norm": 1.7023075819015503, "kl": 0.0036515984684228897, "learning_rate": 4.443713876699124e-06, "loss": -0.0706, "num_tokens": 681473.0, "reward": -0.03750000149011612, "reward_std": 0.04349329322576523, "rewards/reward_func/mean": -0.03750000149011612, "rewards/reward_func/std": 0.04349329695105553, "sampling/importance_sampling_ratio/max": 2.2336153984069824, "sampling/importance_sampling_ratio/mean": 1.3457293510437012, "sampling/importance_sampling_ratio/min": 0.6323131918907166, "sampling/sampling_logp_difference/max": 0.29899919033050537, "sampling/sampling_logp_difference/mean": 0.015179060399532318, "step": 241, "step_time": 27.12615438195644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.15212073922157288, "epoch": 0.242, "frac_reward_zero_std": 0.0, "grad_norm": 1.0509767532348633, "kl": 0.003847780404612422, "learning_rate": 4.438611551749288e-06, "loss": 0.0448, "num_tokens": 684563.0, "reward": 0.48750001192092896, "reward_std": 0.5860816240310669, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5860816240310669, "sampling/importance_sampling_ratio/max": 1.4215171337127686, "sampling/importance_sampling_ratio/mean": 1.1065583229064941, "sampling/importance_sampling_ratio/min": 0.915309488773346, "sampling/sampling_logp_difference/max": 0.2636893391609192, "sampling/sampling_logp_difference/mean": 0.011473672464489937, "step": 242, "step_time": 26.94395197898848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.2163042277097702, "epoch": 0.243, "frac_reward_zero_std": 0.0, "grad_norm": 1.3274853229522705, "kl": 0.006431460380554199, "learning_rate": 4.4334888916816096e-06, "loss": 0.0586, "num_tokens": 687070.0, "reward": 0.4649999737739563, "reward_std": 0.6123452186584473, "rewards/reward_func/mean": 0.4649999737739563, "rewards/reward_func/std": 0.6123452186584473, "sampling/importance_sampling_ratio/max": 1.4256460666656494, "sampling/importance_sampling_ratio/mean": 0.9783781170845032, "sampling/importance_sampling_ratio/min": 0.5138875842094421, "sampling/sampling_logp_difference/max": 0.5303188562393188, "sampling/sampling_logp_difference/mean": 0.02629111334681511, "step": 243, "step_time": 22.43969178403495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.17016561329364777, "epoch": 0.244, "frac_reward_zero_std": 0.0, "grad_norm": 0.7333023548126221, "kl": 0.0048171463422477245, "learning_rate": 4.42834595023037e-06, "loss": 0.0111, "num_tokens": 689561.0, "reward": 0.22499999403953552, "reward_std": 0.5188127756118774, "rewards/reward_func/mean": 0.22499999403953552, "rewards/reward_func/std": 0.5188127756118774, "sampling/importance_sampling_ratio/max": 0.7121997475624084, "sampling/importance_sampling_ratio/mean": 0.6534751057624817, "sampling/importance_sampling_ratio/min": 0.5646950006484985, "sampling/sampling_logp_difference/max": 0.41962337493896484, "sampling/sampling_logp_difference/mean": 0.020349925383925438, "step": 244, "step_time": 24.1915795810055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 57.5, "completions/mean_terminated_length": 57.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1496214121580124, "epoch": 0.245, "frac_reward_zero_std": 0.0, "grad_norm": 1.108378291130066, "kl": 0.0028258857782930136, "learning_rate": 4.423182781342589e-06, "loss": -0.2688, "num_tokens": 691982.0, "reward": 0.9925000071525574, "reward_std": 0.00957426242530346, "rewards/reward_func/mean": 0.9925000071525574, "rewards/reward_func/std": 0.00957426242530346, "sampling/importance_sampling_ratio/max": 1.8059526681900024, "sampling/importance_sampling_ratio/mean": 1.0737988948822021, "sampling/importance_sampling_ratio/min": 0.6583109498023987, "sampling/sampling_logp_difference/max": 0.34187817573547363, "sampling/sampling_logp_difference/mean": 0.014524086378514767, "step": 245, "step_time": 15.640448669029865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.17321377992630005, "epoch": 0.246, "frac_reward_zero_std": 0.0, "grad_norm": 1.1270389556884766, "kl": 0.005757591687142849, "learning_rate": 4.417999439177465e-06, "loss": -0.0684, "num_tokens": 694701.0, "reward": 0.1850000023841858, "reward_std": 0.5380520820617676, "rewards/reward_func/mean": 0.1850000023841858, "rewards/reward_func/std": 0.5380520224571228, "sampling/importance_sampling_ratio/max": 2.4126176834106445, "sampling/importance_sampling_ratio/mean": 1.2212178707122803, "sampling/importance_sampling_ratio/min": 0.3442741632461548, "sampling/sampling_logp_difference/max": 0.46993541717529297, "sampling/sampling_logp_difference/mean": 0.01431471761316061, "step": 246, "step_time": 33.33708826696966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1900184154510498, "epoch": 0.247, "frac_reward_zero_std": 0.0, "grad_norm": 0.991073489189148, "kl": 0.00824125949293375, "learning_rate": 4.412795978105807e-06, "loss": -0.1085, "num_tokens": 697487.0, "reward": 0.7325000166893005, "reward_std": 0.5150647163391113, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5150647163391113, "sampling/importance_sampling_ratio/max": 1.5174659490585327, "sampling/importance_sampling_ratio/mean": 0.8417618274688721, "sampling/importance_sampling_ratio/min": 0.3911638855934143, "sampling/sampling_logp_difference/max": 0.44056326150894165, "sampling/sampling_logp_difference/mean": 0.022896194830536842, "step": 247, "step_time": 28.42331211094279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.18074484169483185, "epoch": 0.248, "frac_reward_zero_std": 0.0, "grad_norm": 1.414807677268982, "kl": 0.007243963424116373, "learning_rate": 4.407572452709459e-06, "loss": 0.1367, "num_tokens": 700398.0, "reward": 0.48250001668930054, "reward_std": 0.5918543934822083, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.591854453086853, "sampling/importance_sampling_ratio/max": 1.1573386192321777, "sampling/importance_sampling_ratio/mean": 0.8814097046852112, "sampling/importance_sampling_ratio/min": 0.3181094527244568, "sampling/sampling_logp_difference/max": 0.8345720767974854, "sampling/sampling_logp_difference/mean": 0.021166425198316574, "step": 248, "step_time": 24.261309003981296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.21707120537757874, "epoch": 0.249, "frac_reward_zero_std": 0.0, "grad_norm": 1.0128535032272339, "kl": 0.004511576611548662, "learning_rate": 4.402328917780728e-06, "loss": 0.0774, "num_tokens": 702973.0, "reward": 0.44749999046325684, "reward_std": 0.600631594657898, "rewards/reward_func/mean": 0.44749999046325684, "rewards/reward_func/std": 0.600631594657898, "sampling/importance_sampling_ratio/max": 1.4061305522918701, "sampling/importance_sampling_ratio/mean": 0.8819220066070557, "sampling/importance_sampling_ratio/min": 0.31978657841682434, "sampling/sampling_logp_difference/max": 0.7459839582443237, "sampling/sampling_logp_difference/mean": 0.020309727638959885, "step": 249, "step_time": 26.176429501967505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.20041458308696747, "epoch": 0.25, "frac_reward_zero_std": 0.0, "grad_norm": 0.9125915169715881, "kl": 0.0036426770966500044, "learning_rate": 4.397065428321818e-06, "loss": -0.1118, "num_tokens": 705863.0, "reward": 0.49000000953674316, "reward_std": 0.5889538526535034, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5889538526535034, "sampling/importance_sampling_ratio/max": 1.032097578048706, "sampling/importance_sampling_ratio/mean": 0.8578061461448669, "sampling/importance_sampling_ratio/min": 0.7476117610931396, "sampling/sampling_logp_difference/max": 0.33843517303466797, "sampling/sampling_logp_difference/mean": 0.018863698467612267, "step": 250, "step_time": 28.240272235998418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 60.0, "completions/mean_terminated_length": 60.0, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.20030836760997772, "epoch": 0.251, "frac_reward_zero_std": 0.0, "grad_norm": 1.1295192241668701, "kl": 0.005140064284205437, "learning_rate": 4.391782039544239e-06, "loss": 0.089, "num_tokens": 709059.0, "reward": 0.45749998092651367, "reward_std": 0.6270765066146851, "rewards/reward_func/mean": 0.45749998092651367, "rewards/reward_func/std": 0.6270765662193298, "sampling/importance_sampling_ratio/max": 1.1278167963027954, "sampling/importance_sampling_ratio/mean": 0.8279645442962646, "sampling/importance_sampling_ratio/min": 0.5244529843330383, "sampling/sampling_logp_difference/max": 0.41276586055755615, "sampling/sampling_logp_difference/mean": 0.017191220074892044, "step": 251, "step_time": 27.095232755003963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.2057914137840271, "epoch": 0.252, "frac_reward_zero_std": 0.0, "grad_norm": 1.2285442352294922, "kl": 0.003582398174330592, "learning_rate": 4.386478806868242e-06, "loss": -0.0912, "num_tokens": 711445.0, "reward": 0.4375, "reward_std": 0.6498396992683411, "rewards/reward_func/mean": 0.4375, "rewards/reward_func/std": 0.6498397588729858, "sampling/importance_sampling_ratio/max": 1.137756586074829, "sampling/importance_sampling_ratio/mean": 0.8332376480102539, "sampling/importance_sampling_ratio/min": 0.5631604194641113, "sampling/sampling_logp_difference/max": 0.5276651382446289, "sampling/sampling_logp_difference/mean": 0.019915318116545677, "step": 252, "step_time": 22.086407112015877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1933041363954544, "epoch": 0.253, "frac_reward_zero_std": 0.0, "grad_norm": 1.0412102937698364, "kl": 0.006408913526684046, "learning_rate": 4.381155785922226e-06, "loss": 0.2416, "num_tokens": 714293.0, "reward": 0.48500001430511475, "reward_std": 0.5948948860168457, "rewards/reward_func/mean": 0.48500001430511475, "rewards/reward_func/std": 0.5948949456214905, "sampling/importance_sampling_ratio/max": 1.4360581636428833, "sampling/importance_sampling_ratio/mean": 0.8679232597351074, "sampling/importance_sampling_ratio/min": 0.39636853337287903, "sampling/sampling_logp_difference/max": 0.8181741237640381, "sampling/sampling_logp_difference/mean": 0.02443419210612774, "step": 253, "step_time": 26.770205716020428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.19776971638202667, "epoch": 0.254, "frac_reward_zero_std": 0.0, "grad_norm": 0.7572110891342163, "kl": 0.005734957754611969, "learning_rate": 4.375813032542164e-06, "loss": 0.1354, "num_tokens": 717471.0, "reward": 0.22499999403953552, "reward_std": 0.5170750617980957, "rewards/reward_func/mean": 0.22499999403953552, "rewards/reward_func/std": 0.5170750617980957, "sampling/importance_sampling_ratio/max": 1.523694634437561, "sampling/importance_sampling_ratio/mean": 0.8848665356636047, "sampling/importance_sampling_ratio/min": 0.567645788192749, "sampling/sampling_logp_difference/max": 0.31868433952331543, "sampling/sampling_logp_difference/mean": 0.014101257547736168, "step": 254, "step_time": 29.720171510009095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 68.25, "completions/mean_terminated_length": 68.25, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.18643206357955933, "epoch": 0.255, "frac_reward_zero_std": 0.0, "grad_norm": 1.1912782192230225, "kl": 0.0037159142084419727, "learning_rate": 4.37045060277101e-06, "loss": 0.2135, "num_tokens": 720222.0, "reward": 0.737500011920929, "reward_std": 0.5249999761581421, "rewards/reward_func/mean": 0.737500011920929, "rewards/reward_func/std": 0.5250000357627869, "sampling/importance_sampling_ratio/max": 1.1553459167480469, "sampling/importance_sampling_ratio/mean": 0.9451342225074768, "sampling/importance_sampling_ratio/min": 0.4631896913051605, "sampling/sampling_logp_difference/max": 0.32436370849609375, "sampling/sampling_logp_difference/mean": 0.016107618808746338, "step": 255, "step_time": 24.64952604199061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.22509580850601196, "epoch": 0.256, "frac_reward_zero_std": 0.0, "grad_norm": 1.5727344751358032, "kl": 0.008898830972611904, "learning_rate": 4.365068552858116e-06, "loss": 0.3247, "num_tokens": 722739.0, "reward": 0.7275000214576721, "reward_std": 0.5317502617835999, "rewards/reward_func/mean": 0.7275000214576721, "rewards/reward_func/std": 0.5317502617835999, "sampling/importance_sampling_ratio/max": 1.28960120677948, "sampling/importance_sampling_ratio/mean": 0.7401763796806335, "sampling/importance_sampling_ratio/min": 0.47913017868995667, "sampling/sampling_logp_difference/max": 0.35707414150238037, "sampling/sampling_logp_difference/mean": 0.020260661840438843, "step": 256, "step_time": 19.525142749014776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1881706714630127, "epoch": 0.257, "frac_reward_zero_std": 0.0, "grad_norm": 0.9456437826156616, "kl": 0.004819028545171022, "learning_rate": 4.359666939258637e-06, "loss": -0.0382, "num_tokens": 725613.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5773502588272095, "sampling/importance_sampling_ratio/max": 1.3900043964385986, "sampling/importance_sampling_ratio/mean": 0.972355842590332, "sampling/importance_sampling_ratio/min": 0.6663265228271484, "sampling/sampling_logp_difference/max": 0.29797863960266113, "sampling/sampling_logp_difference/mean": 0.015425936318933964, "step": 257, "step_time": 20.744930865999777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1881631314754486, "epoch": 0.258, "frac_reward_zero_std": 0.0, "grad_norm": 0.41945284605026245, "kl": 0.0073770261369645596, "learning_rate": 4.354245818632944e-06, "loss": -0.0538, "num_tokens": 727934.0, "reward": 0.9850000143051147, "reward_std": 0.023804759606719017, "rewards/reward_func/mean": 0.9850000143051147, "rewards/reward_func/std": 0.023804768919944763, "sampling/importance_sampling_ratio/max": 0.809240460395813, "sampling/importance_sampling_ratio/mean": 0.37712496519088745, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8455805778503418, "sampling/sampling_logp_difference/mean": 0.02925044670701027, "step": 258, "step_time": 17.990879310993478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.17907847464084625, "epoch": 0.259, "frac_reward_zero_std": 0.0, "grad_norm": 0.5851262807846069, "kl": 0.01236674189567566, "learning_rate": 4.348805247846027e-06, "loss": -0.2206, "num_tokens": 730758.0, "reward": -0.04500000178813934, "reward_std": 0.033166248351335526, "rewards/reward_func/mean": -0.04500000178813934, "rewards/reward_func/std": 0.033166248351335526, "sampling/importance_sampling_ratio/max": 1.3331166505813599, "sampling/importance_sampling_ratio/mean": 0.6799603700637817, "sampling/importance_sampling_ratio/min": 0.2507660984992981, "sampling/sampling_logp_difference/max": 0.8053848743438721, "sampling/sampling_logp_difference/mean": 0.025499863550066948, "step": 259, "step_time": 38.053798260050826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.1822432577610016, "epoch": 0.26, "frac_reward_zero_std": 0.0, "grad_norm": 0.943036675453186, "kl": 0.005222966894507408, "learning_rate": 4.343345283966901e-06, "loss": -0.1568, "num_tokens": 733132.0, "reward": 0.7200000286102295, "reward_std": 0.5336666107177734, "rewards/reward_func/mean": 0.7200000286102295, "rewards/reward_func/std": 0.5336665511131287, "sampling/importance_sampling_ratio/max": 1.7482022047042847, "sampling/importance_sampling_ratio/mean": 1.1221463680267334, "sampling/importance_sampling_ratio/min": 0.8477591872215271, "sampling/sampling_logp_difference/max": 0.4114915132522583, "sampling/sampling_logp_difference/mean": 0.02037693001329899, "step": 260, "step_time": 16.01960237097228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.17438703775405884, "epoch": 0.261, "frac_reward_zero_std": 0.0, "grad_norm": 1.3230925798416138, "kl": 0.006910612341016531, "learning_rate": 4.337865984268002e-06, "loss": -0.0799, "num_tokens": 735984.0, "reward": 0.17249999940395355, "reward_std": 0.5565593838691711, "rewards/reward_func/mean": 0.17249999940395355, "rewards/reward_func/std": 0.5565593838691711, "sampling/importance_sampling_ratio/max": 1.1228914260864258, "sampling/importance_sampling_ratio/mean": 0.9449234008789062, "sampling/importance_sampling_ratio/min": 0.5257958769798279, "sampling/sampling_logp_difference/max": 0.3406977653503418, "sampling/sampling_logp_difference/mean": 0.01464507170021534, "step": 261, "step_time": 30.31822886399459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.14493580162525177, "epoch": 0.262, "frac_reward_zero_std": 0.0, "grad_norm": 0.6737772226333618, "kl": 0.0028185821138322353, "learning_rate": 4.33236740622459e-06, "loss": -0.0929, "num_tokens": 739458.0, "reward": 0.21000000834465027, "reward_std": 0.5207687020301819, "rewards/reward_func/mean": 0.21000000834465027, "rewards/reward_func/std": 0.5207686424255371, "sampling/importance_sampling_ratio/max": 1.3010153770446777, "sampling/importance_sampling_ratio/mean": 0.7470679879188538, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.058201551437378, "sampling/sampling_logp_difference/mean": 0.0165545791387558, "step": 262, "step_time": 40.07091985194711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.22490911185741425, "epoch": 0.263, "frac_reward_zero_std": 0.0, "grad_norm": 1.2464733123779297, "kl": 0.010036076419055462, "learning_rate": 4.326849607514149e-06, "loss": 0.1017, "num_tokens": 741942.0, "reward": 0.2199999988079071, "reward_std": 0.5207686424255371, "rewards/reward_func/mean": 0.2199999988079071, "rewards/reward_func/std": 0.5207686424255371, "sampling/importance_sampling_ratio/max": 1.7852157354354858, "sampling/importance_sampling_ratio/mean": 1.0568002462387085, "sampling/importance_sampling_ratio/min": 0.5981595516204834, "sampling/sampling_logp_difference/max": 0.649437427520752, "sampling/sampling_logp_difference/mean": 0.02148485742509365, "step": 263, "step_time": 23.003530462970957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.19617676734924316, "epoch": 0.264, "frac_reward_zero_std": 0.0, "grad_norm": 0.8873128294944763, "kl": 0.005915424786508083, "learning_rate": 4.321312646015775e-06, "loss": -0.1147, "num_tokens": 744416.0, "reward": 0.4675000011920929, "reward_std": 0.6149999499320984, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.6150000095367432, "sampling/importance_sampling_ratio/max": 0.9767049551010132, "sampling/importance_sampling_ratio/mean": 0.7750425338745117, "sampling/importance_sampling_ratio/min": 0.6112923622131348, "sampling/sampling_logp_difference/max": 0.28504109382629395, "sampling/sampling_logp_difference/mean": 0.02051348052918911, "step": 264, "step_time": 23.24079003604129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.18105047941207886, "epoch": 0.265, "frac_reward_zero_std": 0.0, "grad_norm": 0.8302136659622192, "kl": 0.00661893468350172, "learning_rate": 4.315756579809575e-06, "loss": 0.0114, "num_tokens": 746830.0, "reward": 0.49000000953674316, "reward_std": 0.5889538526535034, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5889538526535034, "sampling/importance_sampling_ratio/max": 0.9202174544334412, "sampling/importance_sampling_ratio/mean": 0.7125985026359558, "sampling/importance_sampling_ratio/min": 0.44532686471939087, "sampling/sampling_logp_difference/max": 0.29930973052978516, "sampling/sampling_logp_difference/mean": 0.017515754327178, "step": 265, "step_time": 14.638449133024551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.1975841075181961, "epoch": 0.266, "frac_reward_zero_std": 0.0, "grad_norm": 0.8019160628318787, "kl": 0.01245148666203022, "learning_rate": 4.3101814671760546e-06, "loss": 0.151, "num_tokens": 749222.0, "reward": 0.4749999940395355, "reward_std": 0.6004719734191895, "rewards/reward_func/mean": 0.4749999940395355, "rewards/reward_func/std": 0.6004719734191895, "sampling/importance_sampling_ratio/max": 1.3367249965667725, "sampling/importance_sampling_ratio/mean": 0.6567279696464539, "sampling/importance_sampling_ratio/min": 0.3185805082321167, "sampling/sampling_logp_difference/max": 0.8763775825500488, "sampling/sampling_logp_difference/mean": 0.022529691457748413, "step": 266, "step_time": 18.077765353024006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 61.75, "completions/mean_terminated_length": 61.75, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.1953420341014862, "epoch": 0.267, "frac_reward_zero_std": 0.0, "grad_norm": 1.353758454322815, "kl": 0.005161132663488388, "learning_rate": 4.304587366595506e-06, "loss": -0.5241, "num_tokens": 752141.0, "reward": 0.4675000011920929, "reward_std": 0.6093370318412781, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.6093370914459229, "sampling/importance_sampling_ratio/max": 2.1259207725524902, "sampling/importance_sampling_ratio/mean": 1.0848398208618164, "sampling/importance_sampling_ratio/min": 0.444561243057251, "sampling/sampling_logp_difference/max": 0.34227943420410156, "sampling/sampling_logp_difference/mean": 0.021183475852012634, "step": 267, "step_time": 29.290160751959775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.18871867656707764, "epoch": 0.268, "frac_reward_zero_std": 0.0, "grad_norm": 2.049696445465088, "kl": 0.006005663890391588, "learning_rate": 4.298974336747397e-06, "loss": 0.0522, "num_tokens": 754396.0, "reward": 0.7150000333786011, "reward_std": 0.5567465424537659, "rewards/reward_func/mean": 0.7150000333786011, "rewards/reward_func/std": 0.5567465424537659, "sampling/importance_sampling_ratio/max": 1.865655541419983, "sampling/importance_sampling_ratio/mean": 1.3764516115188599, "sampling/importance_sampling_ratio/min": 0.7865685224533081, "sampling/sampling_logp_difference/max": 0.28329575061798096, "sampling/sampling_logp_difference/mean": 0.017431169748306274, "step": 268, "step_time": 10.731868006987497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.19366972148418427, "epoch": 0.269, "frac_reward_zero_std": 0.0, "grad_norm": 1.0040234327316284, "kl": 0.003363449824973941, "learning_rate": 4.2933424365097565e-06, "loss": -0.0892, "num_tokens": 757372.0, "reward": 0.22499999403953552, "reward_std": 0.5172684788703918, "rewards/reward_func/mean": 0.22499999403953552, "rewards/reward_func/std": 0.5172684788703918, "sampling/importance_sampling_ratio/max": 1.0346956253051758, "sampling/importance_sampling_ratio/mean": 0.8142103552818298, "sampling/importance_sampling_ratio/min": 0.5647660493850708, "sampling/sampling_logp_difference/max": 0.2397313117980957, "sampling/sampling_logp_difference/mean": 0.01780715398490429, "step": 269, "step_time": 31.292387477005832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 59.0, "completions/mean_terminated_length": 59.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.17705900967121124, "epoch": 0.27, "frac_reward_zero_std": 0.0, "grad_norm": 0.6349918246269226, "kl": 0.011828066781163216, "learning_rate": 4.287691724958551e-06, "loss": 0.105, "num_tokens": 760279.0, "reward": 0.2199999988079071, "reward_std": 0.5223664045333862, "rewards/reward_func/mean": 0.2199999988079071, "rewards/reward_func/std": 0.5223664045333862, "sampling/importance_sampling_ratio/max": 1.2974605560302734, "sampling/importance_sampling_ratio/mean": 0.8365160226821899, "sampling/importance_sampling_ratio/min": 0.37822747230529785, "sampling/sampling_logp_difference/max": 0.4485206604003906, "sampling/sampling_logp_difference/mean": 0.021975291892886162, "step": 270, "step_time": 28.168547167035285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 44.75, "completions/mean_terminated_length": 44.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.16666339337825775, "epoch": 0.271, "frac_reward_zero_std": 0.0, "grad_norm": 0.7390958666801453, "kl": 0.013754061423242092, "learning_rate": 4.282022261367074e-06, "loss": 0.006, "num_tokens": 763198.0, "reward": 0.18000000715255737, "reward_std": 0.5344155430793762, "rewards/reward_func/mean": 0.18000000715255737, "rewards/reward_func/std": 0.534415602684021, "sampling/importance_sampling_ratio/max": 0.8411187529563904, "sampling/importance_sampling_ratio/mean": 0.6068553924560547, "sampling/importance_sampling_ratio/min": 0.1923002451658249, "sampling/sampling_logp_difference/max": 0.5083863735198975, "sampling/sampling_logp_difference/mean": 0.022569026798009872, "step": 271, "step_time": 35.70359892799752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 60.5, "completions/mean_terminated_length": 60.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.16260796785354614, "epoch": 0.272, "frac_reward_zero_std": 0.0, "grad_norm": 0.9466172456741333, "kl": 0.0027168146334588528, "learning_rate": 4.276334105205312e-06, "loss": 0.1497, "num_tokens": 765549.0, "reward": 0.48249998688697815, "reward_std": 0.5976830124855042, "rewards/reward_func/mean": 0.48249998688697815, "rewards/reward_func/std": 0.5976830720901489, "sampling/importance_sampling_ratio/max": 1.2416424751281738, "sampling/importance_sampling_ratio/mean": 0.9922610521316528, "sampling/importance_sampling_ratio/min": 0.7148333191871643, "sampling/sampling_logp_difference/max": 0.26919054985046387, "sampling/sampling_logp_difference/mean": 0.01195564679801464, "step": 272, "step_time": 23.564375263056718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.23189429938793182, "epoch": 0.273, "frac_reward_zero_std": 0.0, "grad_norm": 0.998937726020813, "kl": 0.00828265119343996, "learning_rate": 4.270627316139333e-06, "loss": -0.1157, "num_tokens": 768135.0, "reward": 0.4650000035762787, "reward_std": 0.6186274886131287, "rewards/reward_func/mean": 0.4650000035762787, "rewards/reward_func/std": 0.6186274886131287, "sampling/importance_sampling_ratio/max": 1.0588619709014893, "sampling/importance_sampling_ratio/mean": 0.6840273141860962, "sampling/importance_sampling_ratio/min": 0.3518528938293457, "sampling/sampling_logp_difference/max": 0.535036563873291, "sampling/sampling_logp_difference/mean": 0.022393321618437767, "step": 273, "step_time": 26.729645722021814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.16174140572547913, "epoch": 0.274, "frac_reward_zero_std": 0.0, "grad_norm": 1.842128872871399, "kl": 0.017359547317028046, "learning_rate": 4.264901954030655e-06, "loss": -0.1095, "num_tokens": 771076.0, "reward": 0.49000000953674316, "reward_std": 0.5889538526535034, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5889538526535034, "sampling/importance_sampling_ratio/max": 1.4731436967849731, "sampling/importance_sampling_ratio/mean": 1.1341400146484375, "sampling/importance_sampling_ratio/min": 0.7386038303375244, "sampling/sampling_logp_difference/max": 0.376442551612854, "sampling/sampling_logp_difference/mean": 0.017150020226836205, "step": 274, "step_time": 20.607079999987036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1715756207704544, "epoch": 0.275, "frac_reward_zero_std": 0.0, "grad_norm": 1.150766372680664, "kl": 0.0064427792094647884, "learning_rate": 4.259158078935616e-06, "loss": 0.0212, "num_tokens": 773816.0, "reward": 0.49000000953674316, "reward_std": 0.5832666158676147, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5832666754722595, "sampling/importance_sampling_ratio/max": 1.166966438293457, "sampling/importance_sampling_ratio/mean": 0.8412402868270874, "sampling/importance_sampling_ratio/min": 0.6058522462844849, "sampling/sampling_logp_difference/max": 0.30178576707839966, "sampling/sampling_logp_difference/mean": 0.017852554097771645, "step": 275, "step_time": 22.63811416499084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.2126409113407135, "epoch": 0.276, "frac_reward_zero_std": 0.0, "grad_norm": 0.7494298219680786, "kl": 0.009623635560274124, "learning_rate": 4.2533957511047485e-06, "loss": 0.0313, "num_tokens": 776331.0, "reward": 0.19499999284744263, "reward_std": 0.5366873741149902, "rewards/reward_func/mean": 0.19499999284744263, "rewards/reward_func/std": 0.5366873741149902, "sampling/importance_sampling_ratio/max": 0.7916879057884216, "sampling/importance_sampling_ratio/mean": 0.5831737518310547, "sampling/importance_sampling_ratio/min": 0.31662437319755554, "sampling/sampling_logp_difference/max": 1.1284599304199219, "sampling/sampling_logp_difference/mean": 0.02790701575577259, "step": 276, "step_time": 19.741026120027527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.21963299810886383, "epoch": 0.277, "frac_reward_zero_std": 0.0, "grad_norm": 1.1093361377716064, "kl": 0.016089681535959244, "learning_rate": 4.247615030982144e-06, "loss": 0.0093, "num_tokens": 779064.0, "reward": 0.4724999964237213, "reward_std": 0.5867068767547607, "rewards/reward_func/mean": 0.4724999964237213, "rewards/reward_func/std": 0.5867069363594055, "sampling/importance_sampling_ratio/max": 1.3055884838104248, "sampling/importance_sampling_ratio/mean": 0.7731707096099854, "sampling/importance_sampling_ratio/min": 0.3470041751861572, "sampling/sampling_logp_difference/max": 0.7043886184692383, "sampling/sampling_logp_difference/mean": 0.02418813295662403, "step": 277, "step_time": 28.931750844989438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.15914718806743622, "epoch": 0.278, "frac_reward_zero_std": 0.0, "grad_norm": 1.6521419286727905, "kl": 0.0047762515023350716, "learning_rate": 4.241815979204822e-06, "loss": -0.0651, "num_tokens": 782065.0, "reward": 0.4925000071525574, "reward_std": 0.5861384868621826, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5861384868621826, "sampling/importance_sampling_ratio/max": 1.3694645166397095, "sampling/importance_sampling_ratio/mean": 1.0192992687225342, "sampling/importance_sampling_ratio/min": 0.7678786516189575, "sampling/sampling_logp_difference/max": 0.5276446342468262, "sampling/sampling_logp_difference/mean": 0.018691327422857285, "step": 278, "step_time": 26.474371277028695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.15368247032165527, "epoch": 0.279, "frac_reward_zero_std": 0.0, "grad_norm": 0.9143109917640686, "kl": 0.006546621210873127, "learning_rate": 4.235998656602091e-06, "loss": -0.0264, "num_tokens": 785123.0, "reward": 0.4650000035762787, "reward_std": 0.6204031109809875, "rewards/reward_func/mean": 0.4650000035762787, "rewards/reward_func/std": 0.6204031109809875, "sampling/importance_sampling_ratio/max": 1.103335976600647, "sampling/importance_sampling_ratio/mean": 0.7847365140914917, "sampling/importance_sampling_ratio/min": 0.43466076254844666, "sampling/sampling_logp_difference/max": 0.4671754837036133, "sampling/sampling_logp_difference/mean": 0.01675252430140972, "step": 279, "step_time": 28.23038095701486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.19246931374073029, "epoch": 0.28, "frac_reward_zero_std": 0.0, "grad_norm": 1.3832927942276, "kl": 0.0071713486686348915, "learning_rate": 4.230163124194913e-06, "loss": 0.0447, "num_tokens": 787841.0, "reward": 0.48500001430511475, "reward_std": 0.5834095478057861, "rewards/reward_func/mean": 0.48500001430511475, "rewards/reward_func/std": 0.5834095478057861, "sampling/importance_sampling_ratio/max": 1.384497880935669, "sampling/importance_sampling_ratio/mean": 1.012965440750122, "sampling/importance_sampling_ratio/min": 0.624067485332489, "sampling/sampling_logp_difference/max": 0.8023831844329834, "sampling/sampling_logp_difference/mean": 0.022642865777015686, "step": 280, "step_time": 23.616583047027234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.22725999355316162, "epoch": 0.281, "frac_reward_zero_std": 0.0, "grad_norm": 1.0041000843048096, "kl": 0.007762733846902847, "learning_rate": 4.224309443195261e-06, "loss": 0.2108, "num_tokens": 790365.0, "reward": 0.4449999928474426, "reward_std": 0.6139761209487915, "rewards/reward_func/mean": 0.4449999928474426, "rewards/reward_func/std": 0.6139761209487915, "sampling/importance_sampling_ratio/max": 1.2132846117019653, "sampling/importance_sampling_ratio/mean": 0.7643641829490662, "sampling/importance_sampling_ratio/min": 0.5451329350471497, "sampling/sampling_logp_difference/max": 0.6096268892288208, "sampling/sampling_logp_difference/mean": 0.020223373547196388, "step": 281, "step_time": 16.90137097798288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.20043538510799408, "epoch": 0.282, "frac_reward_zero_std": 0.0, "grad_norm": 1.3638219833374023, "kl": 0.013254034332931042, "learning_rate": 4.218437675005479e-06, "loss": 0.0224, "num_tokens": 792895.0, "reward": 0.7425000071525574, "reward_std": 0.4952020049095154, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.495201975107193, "sampling/importance_sampling_ratio/max": 1.0346201658248901, "sampling/importance_sampling_ratio/mean": 0.7741513252258301, "sampling/importance_sampling_ratio/min": 0.6141830086708069, "sampling/sampling_logp_difference/max": 0.4573390483856201, "sampling/sampling_logp_difference/mean": 0.02056916058063507, "step": 282, "step_time": 22.094105703989044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.1776740700006485, "epoch": 0.283, "frac_reward_zero_std": 0.0, "grad_norm": 1.3536454439163208, "kl": 0.014500267803668976, "learning_rate": 4.212547881217637e-06, "loss": -0.0654, "num_tokens": 795724.0, "reward": 0.7350000143051147, "reward_std": 0.49027201533317566, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.49027204513549805, "sampling/importance_sampling_ratio/max": 1.3580315113067627, "sampling/importance_sampling_ratio/mean": 1.0029497146606445, "sampling/importance_sampling_ratio/min": 0.6571022868156433, "sampling/sampling_logp_difference/max": 0.47780632972717285, "sampling/sampling_logp_difference/mean": 0.020190872251987457, "step": 283, "step_time": 24.135174111987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.1572485715150833, "epoch": 0.284, "frac_reward_zero_std": 0.0, "grad_norm": 1.2983592748641968, "kl": 0.014417567290365696, "learning_rate": 4.206640123612885e-06, "loss": -0.0056, "num_tokens": 798552.0, "reward": 0.7300000190734863, "reward_std": 0.5399999618530273, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5399999618530273, "sampling/importance_sampling_ratio/max": 1.5939992666244507, "sampling/importance_sampling_ratio/mean": 1.2404561042785645, "sampling/importance_sampling_ratio/min": 0.4974382519721985, "sampling/sampling_logp_difference/max": 0.6747608184814453, "sampling/sampling_logp_difference/mean": 0.023639293387532234, "step": 284, "step_time": 20.81338968500495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.16033831238746643, "epoch": 0.285, "frac_reward_zero_std": 0.0, "grad_norm": 1.5701148509979248, "kl": 0.005200207699090242, "learning_rate": 4.2007144641608035e-06, "loss": 0.3791, "num_tokens": 801243.0, "reward": 0.7250000238418579, "reward_std": 0.5300629138946533, "rewards/reward_func/mean": 0.7250000238418579, "rewards/reward_func/std": 0.5300628542900085, "sampling/importance_sampling_ratio/max": 1.3495690822601318, "sampling/importance_sampling_ratio/mean": 0.7657822370529175, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7519104480743408, "sampling/sampling_logp_difference/mean": 0.024758273735642433, "step": 285, "step_time": 25.39879522600677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.17625954747200012, "epoch": 0.286, "frac_reward_zero_std": 0.0, "grad_norm": 1.7910927534103394, "kl": 0.011552062816917896, "learning_rate": 4.194770965018758e-06, "loss": -0.2197, "num_tokens": 804340.0, "reward": 0.25, "reward_std": 0.5, "rewards/reward_func/mean": 0.25, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 1.253133773803711, "sampling/importance_sampling_ratio/mean": 1.1235811710357666, "sampling/importance_sampling_ratio/min": 0.8812562227249146, "sampling/sampling_logp_difference/max": 0.2838616967201233, "sampling/sampling_logp_difference/mean": 0.022335588932037354, "step": 286, "step_time": 28.819354197010398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.19039280712604523, "epoch": 0.287, "frac_reward_zero_std": 0.0, "grad_norm": 1.0310262441635132, "kl": 0.00873173400759697, "learning_rate": 4.188809688531241e-06, "loss": -0.0195, "num_tokens": 807358.0, "reward": 0.45249998569488525, "reward_std": 0.6351574659347534, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.6351574659347534, "sampling/importance_sampling_ratio/max": 0.981810986995697, "sampling/importance_sampling_ratio/mean": 0.7609665393829346, "sampling/importance_sampling_ratio/min": 0.5114357471466064, "sampling/sampling_logp_difference/max": 0.5009238719940186, "sampling/sampling_logp_difference/mean": 0.026809850707650185, "step": 287, "step_time": 26.23984911298612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.20077866315841675, "epoch": 0.288, "frac_reward_zero_std": 0.0, "grad_norm": 0.9160837531089783, "kl": 0.009624039754271507, "learning_rate": 4.182830697229223e-06, "loss": -0.0841, "num_tokens": 810070.0, "reward": 0.4700000286102295, "reward_std": 0.6007218360900879, "rewards/reward_func/mean": 0.4700000286102295, "rewards/reward_func/std": 0.6007217764854431, "sampling/importance_sampling_ratio/max": 0.8712549209594727, "sampling/importance_sampling_ratio/mean": 0.6799152493476868, "sampling/importance_sampling_ratio/min": 0.3031260669231415, "sampling/sampling_logp_difference/max": 0.5424580574035645, "sampling/sampling_logp_difference/mean": 0.020591598004102707, "step": 288, "step_time": 26.02415558602661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 63.5, "completions/mean_terminated_length": 63.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.24311241507530212, "epoch": 0.289, "frac_reward_zero_std": 0.0, "grad_norm": 1.8792979717254639, "kl": 0.00862202886492014, "learning_rate": 4.176834053829492e-06, "loss": -0.5516, "num_tokens": 813046.0, "reward": 0.46000000834465027, "reward_std": 0.608002245426178, "rewards/reward_func/mean": 0.46000000834465027, "rewards/reward_func/std": 0.6080021858215332, "sampling/importance_sampling_ratio/max": 2.8263542652130127, "sampling/importance_sampling_ratio/mean": 1.1652201414108276, "sampling/importance_sampling_ratio/min": 0.49277186393737793, "sampling/sampling_logp_difference/max": 0.36234772205352783, "sampling/sampling_logp_difference/mean": 0.023499049246311188, "step": 289, "step_time": 29.258544155978598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 43.5, "completions/mean_terminated_length": 43.5, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.20949023962020874, "epoch": 0.29, "frac_reward_zero_std": 0.0, "grad_norm": 1.3677107095718384, "kl": 0.005677942652255297, "learning_rate": 4.170819821234001e-06, "loss": -0.2062, "num_tokens": 815410.0, "reward": 0.1850000023841858, "reward_std": 0.5451911091804504, "rewards/reward_func/mean": 0.1850000023841858, "rewards/reward_func/std": 0.5451911091804504, "sampling/importance_sampling_ratio/max": 1.1046886444091797, "sampling/importance_sampling_ratio/mean": 0.9451396465301514, "sampling/importance_sampling_ratio/min": 0.8439013957977295, "sampling/sampling_logp_difference/max": 0.27832645177841187, "sampling/sampling_logp_difference/mean": 0.01512377243489027, "step": 290, "step_time": 34.21004091197392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 60.0, "completions/mean_terminated_length": 60.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.19340306520462036, "epoch": 0.291, "frac_reward_zero_std": 0.0, "grad_norm": 2.240518808364868, "kl": 0.009020933881402016, "learning_rate": 4.164788062529203e-06, "loss": -0.128, "num_tokens": 818240.0, "reward": 0.48750001192092896, "reward_std": 0.5919107794761658, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5919107794761658, "sampling/importance_sampling_ratio/max": 1.561298131942749, "sampling/importance_sampling_ratio/mean": 0.9305005073547363, "sampling/importance_sampling_ratio/min": 0.456493079662323, "sampling/sampling_logp_difference/max": 0.6589646339416504, "sampling/sampling_logp_difference/mean": 0.025264868512749672, "step": 291, "step_time": 19.011539578961674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.15618827939033508, "epoch": 0.292, "frac_reward_zero_std": 0.0, "grad_norm": 3.2577297687530518, "kl": 0.013795577920973301, "learning_rate": 4.158738840985393e-06, "loss": 0.6144, "num_tokens": 820947.0, "reward": -0.042500000447034836, "reward_std": 0.04272002354264259, "rewards/reward_func/mean": -0.042500000447034836, "rewards/reward_func/std": 0.042720019817352295, "sampling/importance_sampling_ratio/max": 2.786485433578491, "sampling/importance_sampling_ratio/mean": 1.4379103183746338, "sampling/importance_sampling_ratio/min": 0.800940215587616, "sampling/sampling_logp_difference/max": 0.3482666015625, "sampling/sampling_logp_difference/mean": 0.015532279387116432, "step": 292, "step_time": 28.34935155295534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.22635218501091003, "epoch": 0.293, "frac_reward_zero_std": 0.0, "grad_norm": 0.770022988319397, "kl": 0.0159556046128273, "learning_rate": 4.1526722200560445e-06, "loss": -0.062, "num_tokens": 823697.0, "reward": 0.49000000953674316, "reward_std": 0.5889538526535034, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5889538526535034, "sampling/importance_sampling_ratio/max": 1.0683104991912842, "sampling/importance_sampling_ratio/mean": 0.5945674777030945, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.37361443042755127, "sampling/sampling_logp_difference/mean": 0.022427037358283997, "step": 293, "step_time": 21.212856367987115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.197050079703331, "epoch": 0.294, "frac_reward_zero_std": 0.0, "grad_norm": 1.072265863418579, "kl": 0.004137760028243065, "learning_rate": 4.146588263377137e-06, "loss": -0.208, "num_tokens": 826452.0, "reward": 0.45249998569488525, "reward_std": 0.6266511082649231, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.6266511678695679, "sampling/importance_sampling_ratio/max": 1.5648176670074463, "sampling/importance_sampling_ratio/mean": 0.9401169419288635, "sampling/importance_sampling_ratio/min": 0.5629730820655823, "sampling/sampling_logp_difference/max": 0.3453207015991211, "sampling/sampling_logp_difference/mean": 0.017655933275818825, "step": 294, "step_time": 28.316711787017994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.2030697762966156, "epoch": 0.295, "frac_reward_zero_std": 0.0, "grad_norm": 1.3082462549209595, "kl": 0.019615227356553078, "learning_rate": 4.140487034766499e-06, "loss": 0.0493, "num_tokens": 829598.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 1.1989033222198486, "sampling/importance_sampling_ratio/mean": 0.8297949433326721, "sampling/importance_sampling_ratio/min": 0.6502844095230103, "sampling/sampling_logp_difference/max": 0.4132622480392456, "sampling/sampling_logp_difference/mean": 0.03036736324429512, "step": 295, "step_time": 27.12130633898778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 60.75, "completions/mean_terminated_length": 60.75, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.20656226575374603, "epoch": 0.296, "frac_reward_zero_std": 0.0, "grad_norm": 0.8728966116905212, "kl": 0.003473206888884306, "learning_rate": 4.134368598223132e-06, "loss": -0.0161, "num_tokens": 831998.0, "reward": 0.45500001311302185, "reward_std": 0.6319018602371216, "rewards/reward_func/mean": 0.45500001311302185, "rewards/reward_func/std": 0.6319018602371216, "sampling/importance_sampling_ratio/max": 0.852838397026062, "sampling/importance_sampling_ratio/mean": 0.7476969957351685, "sampling/importance_sampling_ratio/min": 0.6645308136940002, "sampling/sampling_logp_difference/max": 0.28294897079467773, "sampling/sampling_logp_difference/mean": 0.013870280236005783, "step": 296, "step_time": 20.166604634025134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.16951675713062286, "epoch": 0.297, "frac_reward_zero_std": 0.0, "grad_norm": 1.4472236633300781, "kl": 0.015550295822322369, "learning_rate": 4.128233017926538e-06, "loss": -0.2316, "num_tokens": 834529.0, "reward": 0.6775000095367432, "reward_std": 0.6051653027534485, "rewards/reward_func/mean": 0.6775000095367432, "rewards/reward_func/std": 0.6051652431488037, "sampling/importance_sampling_ratio/max": 1.9800890684127808, "sampling/importance_sampling_ratio/mean": 1.1235166788101196, "sampling/importance_sampling_ratio/min": 0.7053938508033752, "sampling/sampling_logp_difference/max": 0.7558305263519287, "sampling/sampling_logp_difference/mean": 0.021329833194613457, "step": 297, "step_time": 20.45540096901823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 59.0, "completions/mean_terminated_length": 59.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.18112340569496155, "epoch": 0.298, "frac_reward_zero_std": 0.0, "grad_norm": 0.8436924815177917, "kl": 0.013439967297017574, "learning_rate": 4.1220803582360545e-06, "loss": -0.2926, "num_tokens": 837503.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 1.5407968759536743, "sampling/importance_sampling_ratio/mean": 0.961435079574585, "sampling/importance_sampling_ratio/min": 0.45320987701416016, "sampling/sampling_logp_difference/max": 0.3260352611541748, "sampling/sampling_logp_difference/mean": 0.01948177069425583, "step": 298, "step_time": 23.726718263002113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 61.25, "completions/mean_terminated_length": 61.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.21395805478096008, "epoch": 0.299, "frac_reward_zero_std": 1.0, "grad_norm": 0.004934549797326326, "kl": 0.005729455500841141, "learning_rate": 4.115910683690167e-06, "loss": 0.0001, "num_tokens": 839862.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.13666570186615, "sampling/importance_sampling_ratio/mean": 0.7800756692886353, "sampling/importance_sampling_ratio/min": 0.5447342991828918, "sampling/sampling_logp_difference/max": 0.5907832384109497, "sampling/sampling_logp_difference/mean": 0.019901322200894356, "step": 299, "step_time": 5.962714196997695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.16754566133022308, "epoch": 0.3, "frac_reward_zero_std": 0.0, "grad_norm": 1.338972806930542, "kl": 0.005489873234182596, "learning_rate": 4.109724059005844e-06, "loss": -0.0752, "num_tokens": 842908.0, "reward": -0.06750000268220901, "reward_std": 0.0403112918138504, "rewards/reward_func/mean": -0.06750000268220901, "rewards/reward_func/std": 0.0403112918138504, "sampling/importance_sampling_ratio/max": 1.0867990255355835, "sampling/importance_sampling_ratio/mean": 1.0129293203353882, "sampling/importance_sampling_ratio/min": 0.885906994342804, "sampling/sampling_logp_difference/max": 0.2247610092163086, "sampling/sampling_logp_difference/mean": 0.013455437496304512, "step": 300, "step_time": 37.255950010963716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.2165483683347702, "epoch": 0.301, "frac_reward_zero_std": 0.0, "grad_norm": 1.354928970336914, "kl": 0.007270726375281811, "learning_rate": 4.1035205490778505e-06, "loss": 0.086, "num_tokens": 845757.0, "reward": 0.4950000047683716, "reward_std": 0.5715767741203308, "rewards/reward_func/mean": 0.4950000047683716, "rewards/reward_func/std": 0.5715767741203308, "sampling/importance_sampling_ratio/max": 1.7322866916656494, "sampling/importance_sampling_ratio/mean": 1.0112117528915405, "sampling/importance_sampling_ratio/min": 0.5857402086257935, "sampling/sampling_logp_difference/max": 0.5875649452209473, "sampling/sampling_logp_difference/mean": 0.021048210561275482, "step": 301, "step_time": 22.708721707982477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.16461658477783203, "epoch": 0.302, "frac_reward_zero_std": 0.0, "grad_norm": 0.8669045567512512, "kl": 0.015467801131308079, "learning_rate": 4.09730021897807e-06, "loss": -0.3312, "num_tokens": 848723.0, "reward": 0.7450000047683716, "reward_std": 0.5033554434776306, "rewards/reward_func/mean": 0.7450000047683716, "rewards/reward_func/std": 0.5033553838729858, "sampling/importance_sampling_ratio/max": 1.6999658346176147, "sampling/importance_sampling_ratio/mean": 0.9837483167648315, "sampling/importance_sampling_ratio/min": 0.2985522747039795, "sampling/sampling_logp_difference/max": 0.720937967300415, "sampling/sampling_logp_difference/mean": 0.024597885087132454, "step": 302, "step_time": 20.930386556021404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.2379549890756607, "epoch": 0.303, "frac_reward_zero_std": 0.0, "grad_norm": 1.7637532949447632, "kl": 0.010248987935483456, "learning_rate": 4.091063133954821e-06, "loss": -0.1237, "num_tokens": 851899.0, "reward": 0.4599999785423279, "reward_std": 0.624019205570221, "rewards/reward_func/mean": 0.4599999785423279, "rewards/reward_func/std": 0.624019205570221, "sampling/importance_sampling_ratio/max": 1.4683917760849, "sampling/importance_sampling_ratio/mean": 1.0287961959838867, "sampling/importance_sampling_ratio/min": 0.4632247984409332, "sampling/sampling_logp_difference/max": 0.5381326079368591, "sampling/sampling_logp_difference/mean": 0.03329369053244591, "step": 303, "step_time": 25.040667850000318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 58.75, "completions/mean_terminated_length": 58.75, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.14339658617973328, "epoch": 0.304, "frac_reward_zero_std": 0.0, "grad_norm": 1.8714348077774048, "kl": 0.008155702613294125, "learning_rate": 4.084809359432175e-06, "loss": -0.05, "num_tokens": 855101.0, "reward": -0.022499999031424522, "reward_std": 0.012583056464791298, "rewards/reward_func/mean": -0.022499999031424522, "rewards/reward_func/std": 0.012583057396113873, "sampling/importance_sampling_ratio/max": 1.7282090187072754, "sampling/importance_sampling_ratio/mean": 1.223402500152588, "sampling/importance_sampling_ratio/min": 0.9619302749633789, "sampling/sampling_logp_difference/max": 0.5329961776733398, "sampling/sampling_logp_difference/mean": 0.01699461229145527, "step": 304, "step_time": 37.72068432898959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.16901369392871857, "epoch": 0.305, "frac_reward_zero_std": 0.0, "grad_norm": 0.6511167883872986, "kl": 0.007796689867973328, "learning_rate": 4.0785389610092684e-06, "loss": -0.1083, "num_tokens": 857720.0, "reward": 0.7450000047683716, "reward_std": 0.5033554434776306, "rewards/reward_func/mean": 0.7450000047683716, "rewards/reward_func/std": 0.5033553838729858, "sampling/importance_sampling_ratio/max": 1.0465255975723267, "sampling/importance_sampling_ratio/mean": 0.6912955045700073, "sampling/importance_sampling_ratio/min": 0.3082582354545593, "sampling/sampling_logp_difference/max": 0.6349806785583496, "sampling/sampling_logp_difference/mean": 0.021492263302206993, "step": 305, "step_time": 17.989883826987352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.17556984722614288, "epoch": 0.306, "frac_reward_zero_std": 0.0, "grad_norm": 1.2873533964157104, "kl": 0.00885076355189085, "learning_rate": 4.072252004459612e-06, "loss": 0.1341, "num_tokens": 860432.0, "reward": 0.4625000059604645, "reward_std": 0.6153793334960938, "rewards/reward_func/mean": 0.4625000059604645, "rewards/reward_func/std": 0.615379273891449, "sampling/importance_sampling_ratio/max": 1.1377438306808472, "sampling/importance_sampling_ratio/mean": 0.8565549850463867, "sampling/importance_sampling_ratio/min": 0.6356294751167297, "sampling/sampling_logp_difference/max": 0.4788249731063843, "sampling/sampling_logp_difference/mean": 0.01974719949066639, "step": 306, "step_time": 25.479528513969854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.20250898599624634, "epoch": 0.307, "frac_reward_zero_std": 0.0, "grad_norm": 1.522504448890686, "kl": 0.009249414317309856, "learning_rate": 4.065948555730405e-06, "loss": -0.2131, "num_tokens": 863376.0, "reward": 0.19750000536441803, "reward_std": 0.5406400561332703, "rewards/reward_func/mean": 0.19750000536441803, "rewards/reward_func/std": 0.5406400561332703, "sampling/importance_sampling_ratio/max": 1.8201909065246582, "sampling/importance_sampling_ratio/mean": 1.0534489154815674, "sampling/importance_sampling_ratio/min": 0.38565030694007874, "sampling/sampling_logp_difference/max": 0.5085287094116211, "sampling/sampling_logp_difference/mean": 0.020834771916270256, "step": 307, "step_time": 26.842261792975478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.1906985491514206, "epoch": 0.308, "frac_reward_zero_std": 0.0, "grad_norm": 0.933506429195404, "kl": 0.0072281635366380215, "learning_rate": 4.059628680941843e-06, "loss": 0.0721, "num_tokens": 865669.0, "reward": 0.19999998807907104, "reward_std": 0.5338538885116577, "rewards/reward_func/mean": 0.19999998807907104, "rewards/reward_func/std": 0.5338539481163025, "sampling/importance_sampling_ratio/max": 1.0822560787200928, "sampling/importance_sampling_ratio/mean": 0.8242489099502563, "sampling/importance_sampling_ratio/min": 0.5063313841819763, "sampling/sampling_logp_difference/max": 0.5612988471984863, "sampling/sampling_logp_difference/mean": 0.01824266090989113, "step": 308, "step_time": 27.762431093957275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.1771543174982071, "epoch": 0.309, "frac_reward_zero_std": 0.0, "grad_norm": 0.8803468942642212, "kl": 0.012751858681440353, "learning_rate": 4.053292446386422e-06, "loss": -0.0671, "num_tokens": 868033.0, "reward": 0.2175000011920929, "reward_std": 0.5224542021751404, "rewards/reward_func/mean": 0.2175000011920929, "rewards/reward_func/std": 0.5224542021751404, "sampling/importance_sampling_ratio/max": 1.3624162673950195, "sampling/importance_sampling_ratio/mean": 1.001397967338562, "sampling/importance_sampling_ratio/min": 0.7807343006134033, "sampling/sampling_logp_difference/max": 0.4967763423919678, "sampling/sampling_logp_difference/mean": 0.018794281408190727, "step": 309, "step_time": 26.882163838017732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.17031854391098022, "epoch": 0.31, "frac_reward_zero_std": 0.0, "grad_norm": 1.2438889741897583, "kl": 0.007974425330758095, "learning_rate": 4.046939918528243e-06, "loss": -0.278, "num_tokens": 871180.0, "reward": 0.45749998092651367, "reward_std": 0.6264383792877197, "rewards/reward_func/mean": 0.45749998092651367, "rewards/reward_func/std": 0.6264383792877197, "sampling/importance_sampling_ratio/max": 2.1466610431671143, "sampling/importance_sampling_ratio/mean": 1.1853176355361938, "sampling/importance_sampling_ratio/min": 0.636775553226471, "sampling/sampling_logp_difference/max": 0.37122952938079834, "sampling/sampling_logp_difference/mean": 0.017471516504883766, "step": 310, "step_time": 28.060560956946574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.15581251680850983, "epoch": 0.311, "frac_reward_zero_std": 0.0, "grad_norm": 2.4202959537506104, "kl": 0.01499974075704813, "learning_rate": 4.040571164002319e-06, "loss": -0.1401, "num_tokens": 874589.0, "reward": 0.7350000143051147, "reward_std": 0.5233545899391174, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.5233545899391174, "sampling/importance_sampling_ratio/max": 2.878161668777466, "sampling/importance_sampling_ratio/mean": 1.5831506252288818, "sampling/importance_sampling_ratio/min": 0.7393553853034973, "sampling/sampling_logp_difference/max": 0.6463634967803955, "sampling/sampling_logp_difference/mean": 0.023818520829081535, "step": 311, "step_time": 29.021404444007203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 86.0, "completions/max_terminated_length": 86.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.22328732907772064, "epoch": 0.312, "frac_reward_zero_std": 0.0, "grad_norm": 2.0664865970611572, "kl": 0.01062964741140604, "learning_rate": 4.034186249613869e-06, "loss": -0.1823, "num_tokens": 877199.0, "reward": 0.4449999928474426, "reward_std": 0.6306874752044678, "rewards/reward_func/mean": 0.4449999928474426, "rewards/reward_func/std": 0.6306874752044678, "sampling/importance_sampling_ratio/max": 1.6430108547210693, "sampling/importance_sampling_ratio/mean": 0.9111121892929077, "sampling/importance_sampling_ratio/min": 0.25723037123680115, "sampling/sampling_logp_difference/max": 0.465049684047699, "sampling/sampling_logp_difference/mean": 0.0224713534116745, "step": 312, "step_time": 25.91834179701982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 44.75, "completions/mean_terminated_length": 44.75, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.18686476349830627, "epoch": 0.313, "frac_reward_zero_std": 0.0, "grad_norm": 4.161800384521484, "kl": 0.016447437927126884, "learning_rate": 4.027785242337626e-06, "loss": 0.1773, "num_tokens": 880100.0, "reward": 0.45249998569488525, "reward_std": 0.6359966397285461, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.6359965801239014, "sampling/importance_sampling_ratio/max": 1.3400695323944092, "sampling/importance_sampling_ratio/mean": 1.008766770362854, "sampling/importance_sampling_ratio/min": 0.7905641198158264, "sampling/sampling_logp_difference/max": 0.28570711612701416, "sampling/sampling_logp_difference/mean": 0.021306177601218224, "step": 313, "step_time": 26.2007266949513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.17860497534275055, "epoch": 0.314, "frac_reward_zero_std": 0.0, "grad_norm": 0.7877838015556335, "kl": 0.01939004845917225, "learning_rate": 4.021368209317126e-06, "loss": 0.0079, "num_tokens": 882636.0, "reward": 0.45750001072883606, "reward_std": 0.628669261932373, "rewards/reward_func/mean": 0.45750001072883606, "rewards/reward_func/std": 0.628669261932373, "sampling/importance_sampling_ratio/max": 0.5461062788963318, "sampling/importance_sampling_ratio/mean": 0.4604978859424591, "sampling/importance_sampling_ratio/min": 0.34500694274902344, "sampling/sampling_logp_difference/max": 0.8048954010009766, "sampling/sampling_logp_difference/mean": 0.02993612550199032, "step": 314, "step_time": 21.142497588007245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.14769034087657928, "epoch": 0.315, "frac_reward_zero_std": 0.0, "grad_norm": 1.9626437425613403, "kl": 0.008432296104729176, "learning_rate": 4.014935217864009e-06, "loss": -0.1798, "num_tokens": 885779.0, "reward": 0.48750001192092896, "reward_std": 0.5689390897750854, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5689390897750854, "sampling/importance_sampling_ratio/max": 2.081059217453003, "sampling/importance_sampling_ratio/mean": 1.4404575824737549, "sampling/importance_sampling_ratio/min": 0.8488056063652039, "sampling/sampling_logp_difference/max": 0.5091421604156494, "sampling/sampling_logp_difference/mean": 0.018797120079398155, "step": 315, "step_time": 27.078205574012827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.19298554956912994, "epoch": 0.316, "frac_reward_zero_std": 0.0, "grad_norm": 2.7235498428344727, "kl": 0.015732839703559875, "learning_rate": 4.008486335457312e-06, "loss": 0.0072, "num_tokens": 888824.0, "reward": 0.44749999046325684, "reward_std": 0.5804237127304077, "rewards/reward_func/mean": 0.44749999046325684, "rewards/reward_func/std": 0.5804236531257629, "sampling/importance_sampling_ratio/max": 2.487926721572876, "sampling/importance_sampling_ratio/mean": 1.4004504680633545, "sampling/importance_sampling_ratio/min": 0.5610126852989197, "sampling/sampling_logp_difference/max": 0.5306744575500488, "sampling/sampling_logp_difference/mean": 0.020576931536197662, "step": 316, "step_time": 34.30429990199627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.17905010282993317, "epoch": 0.317, "frac_reward_zero_std": 0.0, "grad_norm": 1.2237578630447388, "kl": 0.00807053130120039, "learning_rate": 4.002021629742759e-06, "loss": 0.152, "num_tokens": 891696.0, "reward": 0.20999999344348907, "reward_std": 0.5282676219940186, "rewards/reward_func/mean": 0.20999999344348907, "rewards/reward_func/std": 0.5282676219940186, "sampling/importance_sampling_ratio/max": 1.2832632064819336, "sampling/importance_sampling_ratio/mean": 1.0614540576934814, "sampling/importance_sampling_ratio/min": 0.8507044315338135, "sampling/sampling_logp_difference/max": 0.3338606357574463, "sampling/sampling_logp_difference/mean": 0.01767737790942192, "step": 317, "step_time": 25.71861167100724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1323891580104828, "epoch": 0.318, "frac_reward_zero_std": 0.0, "grad_norm": 0.6477075219154358, "kl": 0.0036014558281749487, "learning_rate": 3.995541168532055e-06, "loss": -0.1226, "num_tokens": 894428.0, "reward": 0.7400000095367432, "reward_std": 0.5199999809265137, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5200000405311584, "sampling/importance_sampling_ratio/max": 1.0933988094329834, "sampling/importance_sampling_ratio/mean": 0.9546777009963989, "sampling/importance_sampling_ratio/min": 0.6472349166870117, "sampling/sampling_logp_difference/max": 0.19443655014038086, "sampling/sampling_logp_difference/mean": 0.010395800694823265, "step": 318, "step_time": 17.722675739962142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.1850094050168991, "epoch": 0.319, "frac_reward_zero_std": 0.0, "grad_norm": 3.085007667541504, "kl": 0.029916824772953987, "learning_rate": 3.989045019802171e-06, "loss": -0.8953, "num_tokens": 897595.0, "reward": 0.2475000023841858, "reward_std": 0.5016888380050659, "rewards/reward_func/mean": 0.2475000023841858, "rewards/reward_func/std": 0.5016888380050659, "sampling/importance_sampling_ratio/max": 2.8503739833831787, "sampling/importance_sampling_ratio/mean": 1.121381163597107, "sampling/importance_sampling_ratio/min": 0.25432464480400085, "sampling/sampling_logp_difference/max": 1.1472411155700684, "sampling/sampling_logp_difference/mean": 0.03603522479534149, "step": 319, "step_time": 26.476199185010046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 43.25, "completions/mean_terminated_length": 43.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.19836486876010895, "epoch": 0.32, "frac_reward_zero_std": 0.0, "grad_norm": 2.180964708328247, "kl": 0.015342935919761658, "learning_rate": 3.982533251694632e-06, "loss": 0.0575, "num_tokens": 900997.0, "reward": 0.4925000071525574, "reward_std": 0.5803088545799255, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5803088545799255, "sampling/importance_sampling_ratio/max": 1.6309031248092651, "sampling/importance_sampling_ratio/mean": 1.0582911968231201, "sampling/importance_sampling_ratio/min": 0.4553375840187073, "sampling/sampling_logp_difference/max": 0.5026814937591553, "sampling/sampling_logp_difference/mean": 0.02851291000843048, "step": 320, "step_time": 35.190176750998944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.1988164633512497, "epoch": 0.321, "frac_reward_zero_std": 0.0, "grad_norm": 1.265689730644226, "kl": 0.0112705547362566, "learning_rate": 3.976005932514807e-06, "loss": 0.0931, "num_tokens": 903462.0, "reward": 0.20500001311302185, "reward_std": 0.531005322933197, "rewards/reward_func/mean": 0.20500001311302185, "rewards/reward_func/std": 0.5310053825378418, "sampling/importance_sampling_ratio/max": 1.689989686012268, "sampling/importance_sampling_ratio/mean": 1.1673799753189087, "sampling/importance_sampling_ratio/min": 0.8731977343559265, "sampling/sampling_logp_difference/max": 0.4486992359161377, "sampling/sampling_logp_difference/mean": 0.01723732240498066, "step": 321, "step_time": 24.45267485198565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.18294958770275116, "epoch": 0.322, "frac_reward_zero_std": 0.0, "grad_norm": 1.2053050994873047, "kl": 0.013773796148598194, "learning_rate": 3.969463130731183e-06, "loss": 0.3301, "num_tokens": 906928.0, "reward": 0.24000000953674316, "reward_std": 0.5068201422691345, "rewards/reward_func/mean": 0.24000000953674316, "rewards/reward_func/std": 0.5068202018737793, "sampling/importance_sampling_ratio/max": 2.753206253051758, "sampling/importance_sampling_ratio/mean": 1.0970711708068848, "sampling/importance_sampling_ratio/min": 0.37015625834465027, "sampling/sampling_logp_difference/max": 0.6594007015228271, "sampling/sampling_logp_difference/mean": 0.025990234687924385, "step": 322, "step_time": 36.18112298700726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.1791200041770935, "epoch": 0.323, "frac_reward_zero_std": 0.0, "grad_norm": 1.2937581539154053, "kl": 0.011144736781716347, "learning_rate": 3.962904914974656e-06, "loss": 0.1015, "num_tokens": 910037.0, "reward": 0.7450000047683716, "reward_std": 0.4967561364173889, "rewards/reward_func/mean": 0.7450000047683716, "rewards/reward_func/std": 0.4967561364173889, "sampling/importance_sampling_ratio/max": 1.6385554075241089, "sampling/importance_sampling_ratio/mean": 1.3996423482894897, "sampling/importance_sampling_ratio/min": 1.0398919582366943, "sampling/sampling_logp_difference/max": 0.4586447477340698, "sampling/sampling_logp_difference/mean": 0.0197463221848011, "step": 323, "step_time": 23.473180503002368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.16650740802288055, "epoch": 0.324, "frac_reward_zero_std": 0.0, "grad_norm": 0.609142541885376, "kl": 0.03226905316114426, "learning_rate": 3.956331354037805e-06, "loss": -0.0885, "num_tokens": 912694.0, "reward": 0.7425000071525574, "reward_std": 0.4950673580169678, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.4950673282146454, "sampling/importance_sampling_ratio/max": 1.1368002891540527, "sampling/importance_sampling_ratio/mean": 0.5865644216537476, "sampling/importance_sampling_ratio/min": 0.2975062131881714, "sampling/sampling_logp_difference/max": 0.6102359294891357, "sampling/sampling_logp_difference/mean": 0.020232977345585823, "step": 324, "step_time": 19.133039970009122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 58.25, "completions/mean_terminated_length": 58.25, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.22847072780132294, "epoch": 0.325, "frac_reward_zero_std": 0.0, "grad_norm": 1.287922739982605, "kl": 0.00280725653283298, "learning_rate": 3.949742516874175e-06, "loss": 0.2455, "num_tokens": 915589.0, "reward": 0.7425000071525574, "reward_std": 0.5149999856948853, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.5149999856948853, "sampling/importance_sampling_ratio/max": 1.2986663579940796, "sampling/importance_sampling_ratio/mean": 0.863052248954773, "sampling/importance_sampling_ratio/min": 0.5840887427330017, "sampling/sampling_logp_difference/max": 0.24478614330291748, "sampling/sampling_logp_difference/mean": 0.015452906489372253, "step": 325, "step_time": 19.80979007499991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 59.75, "completions/mean_terminated_length": 59.75, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.1659536361694336, "epoch": 0.326, "frac_reward_zero_std": 0.0, "grad_norm": 0.9871028065681458, "kl": 0.014796778559684753, "learning_rate": 3.943138472597549e-06, "loss": 0.0943, "num_tokens": 918374.0, "reward": 0.4724999964237213, "reward_std": 0.6091181635856628, "rewards/reward_func/mean": 0.4724999964237213, "rewards/reward_func/std": 0.6091182231903076, "sampling/importance_sampling_ratio/max": 1.016363501548767, "sampling/importance_sampling_ratio/mean": 0.8258405923843384, "sampling/importance_sampling_ratio/min": 0.6133086681365967, "sampling/sampling_logp_difference/max": 0.5127270221710205, "sampling/sampling_logp_difference/mean": 0.019324785098433495, "step": 326, "step_time": 20.519930072012357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.2158389538526535, "epoch": 0.327, "frac_reward_zero_std": 0.0, "grad_norm": 0.8764495849609375, "kl": 0.01843287982046604, "learning_rate": 3.936519290481226e-06, "loss": -0.3639, "num_tokens": 921264.0, "reward": 0.7275000214576721, "reward_std": 0.4987568259239197, "rewards/reward_func/mean": 0.7275000214576721, "rewards/reward_func/std": 0.4987567663192749, "sampling/importance_sampling_ratio/max": 1.5433945655822754, "sampling/importance_sampling_ratio/mean": 0.9397084712982178, "sampling/importance_sampling_ratio/min": 0.26126623153686523, "sampling/sampling_logp_difference/max": 0.6747951507568359, "sampling/sampling_logp_difference/mean": 0.03063812665641308, "step": 327, "step_time": 25.105901809001807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.15249447524547577, "epoch": 0.328, "frac_reward_zero_std": 0.0, "grad_norm": 0.6270763874053955, "kl": 0.006014922168105841, "learning_rate": 3.929885039957296e-06, "loss": -0.0719, "num_tokens": 923752.0, "reward": 0.44749999046325684, "reward_std": 0.6216309070587158, "rewards/reward_func/mean": 0.44749999046325684, "rewards/reward_func/std": 0.6216309070587158, "sampling/importance_sampling_ratio/max": 0.80250483751297, "sampling/importance_sampling_ratio/mean": 0.7386065721511841, "sampling/importance_sampling_ratio/min": 0.6346631050109863, "sampling/sampling_logp_difference/max": 0.359513521194458, "sampling/sampling_logp_difference/mean": 0.015164329670369625, "step": 328, "step_time": 26.279648306954186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 62.25, "completions/mean_terminated_length": 62.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.18893060088157654, "epoch": 0.329, "frac_reward_zero_std": 0.0, "grad_norm": 0.6806035041809082, "kl": 0.01311151310801506, "learning_rate": 3.923235790615907e-06, "loss": 0.0155, "num_tokens": 926319.0, "reward": 0.737500011920929, "reward_std": 0.5249999761581421, "rewards/reward_func/mean": 0.737500011920929, "rewards/reward_func/std": 0.5250000357627869, "sampling/importance_sampling_ratio/max": 0.8080926537513733, "sampling/importance_sampling_ratio/mean": 0.4545941948890686, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5767067670822144, "sampling/sampling_logp_difference/mean": 0.0246772188693285, "step": 329, "step_time": 18.280938295996748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 63.0, "completions/mean_terminated_length": 63.0, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "entropy": 0.2027558982372284, "epoch": 0.33, "frac_reward_zero_std": 0.0, "grad_norm": 1.1840717792510986, "kl": 0.008667624555528164, "learning_rate": 3.916571612204538e-06, "loss": -0.0631, "num_tokens": 928999.0, "reward": 0.4375, "reward_std": 0.6505574584007263, "rewards/reward_func/mean": 0.4375, "rewards/reward_func/std": 0.6505574584007263, "sampling/importance_sampling_ratio/max": 1.1569744348526, "sampling/importance_sampling_ratio/mean": 1.016008973121643, "sampling/importance_sampling_ratio/min": 0.7677184343338013, "sampling/sampling_logp_difference/max": 0.2774064540863037, "sampling/sampling_logp_difference/mean": 0.021065648645162582, "step": 330, "step_time": 27.227331248985138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.19148050248622894, "epoch": 0.331, "frac_reward_zero_std": 0.0, "grad_norm": 1.3376356363296509, "kl": 0.013826675713062286, "learning_rate": 3.909892574627267e-06, "loss": 0.1824, "num_tokens": 932047.0, "reward": 0.4599999785423279, "reward_std": 0.5838378667831421, "rewards/reward_func/mean": 0.4599999785423279, "rewards/reward_func/std": 0.5838378667831421, "sampling/importance_sampling_ratio/max": 1.3473550081253052, "sampling/importance_sampling_ratio/mean": 1.0255568027496338, "sampling/importance_sampling_ratio/min": 0.5907083749771118, "sampling/sampling_logp_difference/max": 0.48423266410827637, "sampling/sampling_logp_difference/mean": 0.02318631298840046, "step": 331, "step_time": 28.01457067701267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.18365478515625, "epoch": 0.332, "frac_reward_zero_std": 0.0, "grad_norm": 0.9068941473960876, "kl": 0.008917930535972118, "learning_rate": 3.903198747944037e-06, "loss": -0.1221, "num_tokens": 934610.0, "reward": 0.7150000333786011, "reward_std": 0.5567464828491211, "rewards/reward_func/mean": 0.7150000333786011, "rewards/reward_func/std": 0.5567465424537659, "sampling/importance_sampling_ratio/max": 1.3261334896087646, "sampling/importance_sampling_ratio/mean": 0.9809004664421082, "sampling/importance_sampling_ratio/min": 0.7198947072029114, "sampling/sampling_logp_difference/max": 0.4912610650062561, "sampling/sampling_logp_difference/mean": 0.016315879300236702, "step": 332, "step_time": 15.043208496994339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 61.75, "completions/mean_terminated_length": 61.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.18460683524608612, "epoch": 0.333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0409892313182354, "kl": 0.024226641282439232, "learning_rate": 3.896490202369924e-06, "loss": 0.0002, "num_tokens": 937292.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.546746850013733, "sampling/importance_sampling_ratio/mean": 0.7852298021316528, "sampling/importance_sampling_ratio/min": 0.13446657359600067, "sampling/sampling_logp_difference/max": 0.7890514135360718, "sampling/sampling_logp_difference/mean": 0.02319558709859848, "step": 333, "step_time": 16.15961219801102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.17790281772613525, "epoch": 0.334, "frac_reward_zero_std": 0.0, "grad_norm": 0.796797513961792, "kl": 0.023831887170672417, "learning_rate": 3.889767008274396e-06, "loss": 0.2569, "num_tokens": 940141.0, "reward": 0.19249999523162842, "reward_std": 0.532752275466919, "rewards/reward_func/mean": 0.19249999523162842, "rewards/reward_func/std": 0.532752275466919, "sampling/importance_sampling_ratio/max": 1.0283513069152832, "sampling/importance_sampling_ratio/mean": 0.7008533477783203, "sampling/importance_sampling_ratio/min": 0.17393198609352112, "sampling/sampling_logp_difference/max": 1.0065237283706665, "sampling/sampling_logp_difference/mean": 0.025002971291542053, "step": 334, "step_time": 31.485712825029623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.17776799201965332, "epoch": 0.335, "frac_reward_zero_std": 0.0, "grad_norm": 0.46931275725364685, "kl": 0.007756600622087717, "learning_rate": 3.883029236180577e-06, "loss": 0.0419, "num_tokens": 943119.0, "reward": 0.49000000953674316, "reward_std": 0.5888972282409668, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5888972282409668, "sampling/importance_sampling_ratio/max": 0.6710522770881653, "sampling/importance_sampling_ratio/mean": 0.5299115777015686, "sampling/importance_sampling_ratio/min": 0.29155147075653076, "sampling/sampling_logp_difference/max": 0.5155463218688965, "sampling/sampling_logp_difference/mean": 0.02160179801285267, "step": 335, "step_time": 29.258832390012685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.16258485615253448, "epoch": 0.336, "frac_reward_zero_std": 0.0, "grad_norm": 0.49670156836509705, "kl": 0.010599338449537754, "learning_rate": 3.876276956764509e-06, "loss": -0.1452, "num_tokens": 945692.0, "reward": 0.7100000381469727, "reward_std": 0.5600595474243164, "rewards/reward_func/mean": 0.7100000381469727, "rewards/reward_func/std": 0.5600595474243164, "sampling/importance_sampling_ratio/max": 1.045454740524292, "sampling/importance_sampling_ratio/mean": 0.7635714411735535, "sampling/importance_sampling_ratio/min": 0.5580934882164001, "sampling/sampling_logp_difference/max": 0.6071094274520874, "sampling/sampling_logp_difference/mean": 0.01582520455121994, "step": 336, "step_time": 22.399734677979723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 60.75, "completions/mean_terminated_length": 60.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.25067830085754395, "epoch": 0.337, "frac_reward_zero_std": 0.0, "grad_norm": 1.339911937713623, "kl": 0.004537747707217932, "learning_rate": 3.869510240854408e-06, "loss": -0.2085, "num_tokens": 948193.0, "reward": 0.7174999713897705, "reward_std": 0.5319382548332214, "rewards/reward_func/mean": 0.7174999713897705, "rewards/reward_func/std": 0.5319382548332214, "sampling/importance_sampling_ratio/max": 2.0598645210266113, "sampling/importance_sampling_ratio/mean": 1.0936967134475708, "sampling/importance_sampling_ratio/min": 0.6571621894836426, "sampling/sampling_logp_difference/max": 0.3324289321899414, "sampling/sampling_logp_difference/mean": 0.0188552625477314, "step": 337, "step_time": 20.51769036904443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.16378864645957947, "epoch": 0.338, "frac_reward_zero_std": 0.0, "grad_norm": 0.7914707064628601, "kl": 0.0069785104133188725, "learning_rate": 3.862729159429921e-06, "loss": -0.0101, "num_tokens": 950773.0, "reward": 0.4724999964237213, "reward_std": 0.6097745299339294, "rewards/reward_func/mean": 0.4724999964237213, "rewards/reward_func/std": 0.6097745895385742, "sampling/importance_sampling_ratio/max": 1.2649530172348022, "sampling/importance_sampling_ratio/mean": 0.8402005434036255, "sampling/importance_sampling_ratio/min": 0.31406426429748535, "sampling/sampling_logp_difference/max": 0.6651067733764648, "sampling/sampling_logp_difference/mean": 0.018476329743862152, "step": 338, "step_time": 17.648877100029495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.16403910517692566, "epoch": 0.339, "frac_reward_zero_std": 0.0, "grad_norm": 0.9339858889579773, "kl": 0.01450288575142622, "learning_rate": 3.855933783621384e-06, "loss": 0.0245, "num_tokens": 953794.0, "reward": 0.7325000166893005, "reward_std": 0.5283543467521667, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5283544063568115, "sampling/importance_sampling_ratio/max": 1.0750203132629395, "sampling/importance_sampling_ratio/mean": 0.9471150636672974, "sampling/importance_sampling_ratio/min": 0.7434753775596619, "sampling/sampling_logp_difference/max": 0.297252893447876, "sampling/sampling_logp_difference/mean": 0.016641445457935333, "step": 339, "step_time": 25.475070370011963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.19940239191055298, "epoch": 0.34, "frac_reward_zero_std": 0.0, "grad_norm": 1.5934970378875732, "kl": 0.04297367483377457, "learning_rate": 3.849124184709073e-06, "loss": 0.0562, "num_tokens": 956918.0, "reward": 0.48000001907348633, "reward_std": 0.6006662249565125, "rewards/reward_func/mean": 0.48000001907348633, "rewards/reward_func/std": 0.600666344165802, "sampling/importance_sampling_ratio/max": 1.5451816320419312, "sampling/importance_sampling_ratio/mean": 1.0539028644561768, "sampling/importance_sampling_ratio/min": 0.32391858100891113, "sampling/sampling_logp_difference/max": 0.3123375177383423, "sampling/sampling_logp_difference/mean": 0.02347414940595627, "step": 340, "step_time": 29.539071800012607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.19785621762275696, "epoch": 0.341, "frac_reward_zero_std": 0.0, "grad_norm": 1.1970665454864502, "kl": 0.011496574617922306, "learning_rate": 3.84230043412246e-06, "loss": 0.0679, "num_tokens": 959174.0, "reward": 0.6899999976158142, "reward_std": 0.5343531966209412, "rewards/reward_func/mean": 0.6899999976158142, "rewards/reward_func/std": 0.5343531966209412, "sampling/importance_sampling_ratio/max": 1.1067477464675903, "sampling/importance_sampling_ratio/mean": 0.9331328868865967, "sampling/importance_sampling_ratio/min": 0.5557433366775513, "sampling/sampling_logp_difference/max": 0.4405163526535034, "sampling/sampling_logp_difference/mean": 0.016990629956126213, "step": 341, "step_time": 21.30721773498226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.15399383008480072, "epoch": 0.342, "frac_reward_zero_std": 0.0, "grad_norm": 1.4224368333816528, "kl": 0.006188969127833843, "learning_rate": 3.835462603439458e-06, "loss": 0.0462, "num_tokens": 961706.0, "reward": 0.7274999618530273, "reward_std": 0.5450000166893005, "rewards/reward_func/mean": 0.7274999618530273, "rewards/reward_func/std": 0.5450000166893005, "sampling/importance_sampling_ratio/max": 1.5189361572265625, "sampling/importance_sampling_ratio/mean": 0.9896402955055237, "sampling/importance_sampling_ratio/min": 0.6185334920883179, "sampling/sampling_logp_difference/max": 0.48210835456848145, "sampling/sampling_logp_difference/mean": 0.011365511454641819, "step": 342, "step_time": 13.123410012980457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 57.75, "completions/mean_terminated_length": 57.75, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.17660373449325562, "epoch": 0.343, "frac_reward_zero_std": 0.0, "grad_norm": 1.2085705995559692, "kl": 0.016034504398703575, "learning_rate": 3.828610764385676e-06, "loss": 0.0986, "num_tokens": 964887.0, "reward": 0.737500011920929, "reward_std": 0.5051979422569275, "rewards/reward_func/mean": 0.737500011920929, "rewards/reward_func/std": 0.5051980018615723, "sampling/importance_sampling_ratio/max": 0.978752613067627, "sampling/importance_sampling_ratio/mean": 0.7919660806655884, "sampling/importance_sampling_ratio/min": 0.2847706079483032, "sampling/sampling_logp_difference/max": 0.5747048854827881, "sampling/sampling_logp_difference/mean": 0.02400084026157856, "step": 343, "step_time": 26.27474796102615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.16883505880832672, "epoch": 0.344, "frac_reward_zero_std": 0.0, "grad_norm": 1.02289617061615, "kl": 0.013173779472708702, "learning_rate": 3.821744988833664e-06, "loss": -0.0744, "num_tokens": 967386.0, "reward": 0.7350000143051147, "reward_std": 0.5300000309944153, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.5300000309944153, "sampling/importance_sampling_ratio/max": 1.5552880764007568, "sampling/importance_sampling_ratio/mean": 1.0475304126739502, "sampling/importance_sampling_ratio/min": 0.8000168800354004, "sampling/sampling_logp_difference/max": 0.41827690601348877, "sampling/sampling_logp_difference/mean": 0.01849531941115856, "step": 344, "step_time": 20.683368442987558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.23533396422863007, "epoch": 0.345, "frac_reward_zero_std": 0.0, "grad_norm": 1.0094482898712158, "kl": 0.01087096892297268, "learning_rate": 3.814865348802157e-06, "loss": -0.2382, "num_tokens": 969620.0, "reward": 0.48750001192092896, "reward_std": 0.5921359658241272, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.592136025428772, "sampling/importance_sampling_ratio/max": 1.5260144472122192, "sampling/importance_sampling_ratio/mean": 0.8424215316772461, "sampling/importance_sampling_ratio/min": 0.4631038308143616, "sampling/sampling_logp_difference/max": 0.3892955780029297, "sampling/sampling_logp_difference/mean": 0.02018243819475174, "step": 345, "step_time": 13.577111947000958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 59.75, "completions/mean_terminated_length": 59.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.15041658282279968, "epoch": 0.346, "frac_reward_zero_std": 0.0, "grad_norm": 0.6183080077171326, "kl": 0.006258783396333456, "learning_rate": 3.807971916455325e-06, "loss": 0.2827, "num_tokens": 972216.0, "reward": -0.05250000208616257, "reward_std": 0.03862210363149643, "rewards/reward_func/mean": -0.05250000208616257, "rewards/reward_func/std": 0.03862210363149643, "sampling/importance_sampling_ratio/max": 1.4885988235473633, "sampling/importance_sampling_ratio/mean": 0.802808403968811, "sampling/importance_sampling_ratio/min": 0.24331799149513245, "sampling/sampling_logp_difference/max": 0.5820979475975037, "sampling/sampling_logp_difference/mean": 0.018627675250172615, "step": 346, "step_time": 32.57653158402536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.2260751873254776, "epoch": 0.347, "frac_reward_zero_std": 0.0, "grad_norm": 2.1435744762420654, "kl": 0.05988778918981552, "learning_rate": 3.8010647641020116e-06, "loss": 0.2221, "num_tokens": 975199.0, "reward": -0.04749999940395355, "reward_std": 0.059090323746204376, "rewards/reward_func/mean": -0.04749999940395355, "rewards/reward_func/std": 0.059090327471494675, "sampling/importance_sampling_ratio/max": 1.448785424232483, "sampling/importance_sampling_ratio/mean": 1.0044033527374268, "sampling/importance_sampling_ratio/min": 0.5217226147651672, "sampling/sampling_logp_difference/max": 0.6781806945800781, "sampling/sampling_logp_difference/mean": 0.026839422062039375, "step": 347, "step_time": 35.182972672977485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1741323322057724, "epoch": 0.348, "frac_reward_zero_std": 0.0, "grad_norm": 1.0567734241485596, "kl": 0.008789798244833946, "learning_rate": 3.794143964194976e-06, "loss": 0.0406, "num_tokens": 977969.0, "reward": 0.6800000071525574, "reward_std": 0.5666862726211548, "rewards/reward_func/mean": 0.6800000071525574, "rewards/reward_func/std": 0.5666862726211548, "sampling/importance_sampling_ratio/max": 1.2841132879257202, "sampling/importance_sampling_ratio/mean": 0.9939666390419006, "sampling/importance_sampling_ratio/min": 0.7173458933830261, "sampling/sampling_logp_difference/max": 0.2553447484970093, "sampling/sampling_logp_difference/mean": 0.01658765599131584, "step": 348, "step_time": 32.1732480530045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 59.25, "completions/mean_terminated_length": 59.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.16321228444576263, "epoch": 0.349, "frac_reward_zero_std": 0.0, "grad_norm": 1.0157686471939087, "kl": 0.006961624603718519, "learning_rate": 3.7872095893301344e-06, "loss": -0.2533, "num_tokens": 980337.0, "reward": 0.4325000047683716, "reward_std": 0.6497371196746826, "rewards/reward_func/mean": 0.4325000047683716, "rewards/reward_func/std": 0.6497371196746826, "sampling/importance_sampling_ratio/max": 1.5574259757995605, "sampling/importance_sampling_ratio/mean": 1.062833547592163, "sampling/importance_sampling_ratio/min": 0.5259334444999695, "sampling/sampling_logp_difference/max": 0.35424327850341797, "sampling/sampling_logp_difference/mean": 0.01507798582315445, "step": 349, "step_time": 21.289733862970024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.17826926708221436, "epoch": 0.35, "frac_reward_zero_std": 0.0, "grad_norm": 1.1914362907409668, "kl": 0.0142469871789217, "learning_rate": 3.7802617122457976e-06, "loss": 0.3066, "num_tokens": 983414.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 1.8274776935577393, "sampling/importance_sampling_ratio/mean": 1.1029167175292969, "sampling/importance_sampling_ratio/min": 0.6254013180732727, "sampling/sampling_logp_difference/max": 0.5443787574768066, "sampling/sampling_logp_difference/mean": 0.024480953812599182, "step": 350, "step_time": 26.361375631997362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.17679348587989807, "epoch": 0.351, "frac_reward_zero_std": 0.0, "grad_norm": 1.6374858617782593, "kl": 0.03299008309841156, "learning_rate": 3.773300405821908e-06, "loss": 0.0433, "num_tokens": 986573.0, "reward": 0.48250001668930054, "reward_std": 0.5864227414131165, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5864228010177612, "sampling/importance_sampling_ratio/max": 1.4335743188858032, "sampling/importance_sampling_ratio/mean": 1.0593678951263428, "sampling/importance_sampling_ratio/min": 0.6837874054908752, "sampling/sampling_logp_difference/max": 0.6966762542724609, "sampling/sampling_logp_difference/mean": 0.022292258217930794, "step": 351, "step_time": 32.720512653002515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.16347146034240723, "epoch": 0.352, "frac_reward_zero_std": 0.0, "grad_norm": 0.6886407732963562, "kl": 0.010092093609273434, "learning_rate": 3.766325743079277e-06, "loss": -0.1097, "num_tokens": 989246.0, "reward": 0.4399999976158142, "reward_std": 0.6470960974693298, "rewards/reward_func/mean": 0.4399999976158142, "rewards/reward_func/std": 0.6470960974693298, "sampling/importance_sampling_ratio/max": 0.8785107135772705, "sampling/importance_sampling_ratio/mean": 0.645267128944397, "sampling/importance_sampling_ratio/min": 0.2759000360965729, "sampling/sampling_logp_difference/max": 0.5170090198516846, "sampling/sampling_logp_difference/mean": 0.023559367284178734, "step": 352, "step_time": 17.21799072698923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.18451102077960968, "epoch": 0.353, "frac_reward_zero_std": 0.0, "grad_norm": 2.3069911003112793, "kl": 0.012927538715302944, "learning_rate": 3.7593377971788162e-06, "loss": -0.0172, "num_tokens": 991705.0, "reward": 0.9700000286102295, "reward_std": 0.04760953411459923, "rewards/reward_func/mean": 0.9700000286102295, "rewards/reward_func/std": 0.047609537839889526, "sampling/importance_sampling_ratio/max": 2.8937880992889404, "sampling/importance_sampling_ratio/mean": 1.6569774150848389, "sampling/importance_sampling_ratio/min": 0.9218763113021851, "sampling/sampling_logp_difference/max": 0.8311319351196289, "sampling/sampling_logp_difference/mean": 0.02265118435025215, "step": 353, "step_time": 20.25130614900263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.19052478671073914, "epoch": 0.354, "frac_reward_zero_std": 0.0, "grad_norm": 1.2124743461608887, "kl": 0.02176245115697384, "learning_rate": 3.752336641420772e-06, "loss": -0.0881, "num_tokens": 994093.0, "reward": 0.7224999666213989, "reward_std": 0.5483536124229431, "rewards/reward_func/mean": 0.7224999666213989, "rewards/reward_func/std": 0.5483536124229431, "sampling/importance_sampling_ratio/max": 1.4200595617294312, "sampling/importance_sampling_ratio/mean": 0.913448691368103, "sampling/importance_sampling_ratio/min": 0.6244259476661682, "sampling/sampling_logp_difference/max": 0.33083927631378174, "sampling/sampling_logp_difference/mean": 0.022579513490200043, "step": 354, "step_time": 16.410235597984865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.19823837280273438, "epoch": 0.355, "frac_reward_zero_std": 0.0, "grad_norm": 2.740354537963867, "kl": 0.011836385354399681, "learning_rate": 3.7453223492439544e-06, "loss": -0.0749, "num_tokens": 997160.0, "reward": 0.4975000023841858, "reward_std": 0.5802513957023621, "rewards/reward_func/mean": 0.4975000023841858, "rewards/reward_func/std": 0.5802513957023621, "sampling/importance_sampling_ratio/max": 2.2212026119232178, "sampling/importance_sampling_ratio/mean": 1.6361935138702393, "sampling/importance_sampling_ratio/min": 0.5486956238746643, "sampling/sampling_logp_difference/max": 0.42912137508392334, "sampling/sampling_logp_difference/mean": 0.032044801861047745, "step": 355, "step_time": 27.215389285993297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.22500787675380707, "epoch": 0.356, "frac_reward_zero_std": 0.0, "grad_norm": 1.1363515853881836, "kl": 0.038025617599487305, "learning_rate": 3.7382949942249695e-06, "loss": -0.2381, "num_tokens": 999712.0, "reward": 0.9950000047683716, "reward_std": 0.009999990463256836, "rewards/reward_func/mean": 0.9950000047683716, "rewards/reward_func/std": 0.009999990463256836, "sampling/importance_sampling_ratio/max": 1.738869071006775, "sampling/importance_sampling_ratio/mean": 1.1264631748199463, "sampling/importance_sampling_ratio/min": 0.7589366436004639, "sampling/sampling_logp_difference/max": 0.3271911144256592, "sampling/sampling_logp_difference/mean": 0.023825498297810555, "step": 356, "step_time": 11.289376422995701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.19238880276679993, "epoch": 0.357, "frac_reward_zero_std": 0.0, "grad_norm": 1.8837995529174805, "kl": 0.024569693952798843, "learning_rate": 3.731254650077446e-06, "loss": 0.3417, "num_tokens": 1002497.0, "reward": 0.4724999964237213, "reward_std": 0.604228138923645, "rewards/reward_func/mean": 0.4724999964237213, "rewards/reward_func/std": 0.6042281985282898, "sampling/importance_sampling_ratio/max": 1.9373708963394165, "sampling/importance_sampling_ratio/mean": 0.8590580224990845, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4783909320831299, "sampling/sampling_logp_difference/mean": 0.02700084075331688, "step": 357, "step_time": 24.55734916200163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.18510816991329193, "epoch": 0.358, "frac_reward_zero_std": 0.0, "grad_norm": 0.8063596487045288, "kl": 0.00983469933271408, "learning_rate": 3.724201390651263e-06, "loss": -0.0492, "num_tokens": 1004957.0, "reward": 0.46000000834465027, "reward_std": 0.6177917718887329, "rewards/reward_func/mean": 0.46000000834465027, "rewards/reward_func/std": 0.6177917718887329, "sampling/importance_sampling_ratio/max": 1.0804027318954468, "sampling/importance_sampling_ratio/mean": 0.8439720869064331, "sampling/importance_sampling_ratio/min": 0.5704425573348999, "sampling/sampling_logp_difference/max": 0.5955214500427246, "sampling/sampling_logp_difference/mean": 0.015848539769649506, "step": 358, "step_time": 20.172869423986413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 82.0, "completions/max_terminated_length": 82.0, "completions/mean_length": 61.5, "completions/mean_terminated_length": 61.5, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.16663743555545807, "epoch": 0.359, "frac_reward_zero_std": 0.0, "grad_norm": 1.336969017982483, "kl": 0.005679582245647907, "learning_rate": 3.7171352899317743e-06, "loss": 0.0105, "num_tokens": 1007819.0, "reward": 0.21250000596046448, "reward_std": 0.5254442691802979, "rewards/reward_func/mean": 0.21250000596046448, "rewards/reward_func/std": 0.5254442691802979, "sampling/importance_sampling_ratio/max": 1.7619911432266235, "sampling/importance_sampling_ratio/mean": 1.4409316778182983, "sampling/importance_sampling_ratio/min": 0.9566137790679932, "sampling/sampling_logp_difference/max": 0.5972033739089966, "sampling/sampling_logp_difference/mean": 0.0182740930467844, "step": 359, "step_time": 32.22705738199875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 58.25, "completions/mean_terminated_length": 58.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1910375952720642, "epoch": 0.36, "frac_reward_zero_std": 0.0, "grad_norm": 0.588450014591217, "kl": 0.007703287061303854, "learning_rate": 3.710056422039033e-06, "loss": 0.1409, "num_tokens": 1010642.0, "reward": 0.20499998331069946, "reward_std": 0.5048101544380188, "rewards/reward_func/mean": 0.20499998331069946, "rewards/reward_func/std": 0.5048101544380188, "sampling/importance_sampling_ratio/max": 1.3246843814849854, "sampling/importance_sampling_ratio/mean": 0.6796797513961792, "sampling/importance_sampling_ratio/min": 0.29863858222961426, "sampling/sampling_logp_difference/max": 0.3399052619934082, "sampling/sampling_logp_difference/mean": 0.02161012403666973, "step": 360, "step_time": 29.609306400001515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.13602198660373688, "epoch": 0.361, "frac_reward_zero_std": 0.0, "grad_norm": 0.7196847200393677, "kl": 0.03245507925748825, "learning_rate": 3.702964861227013e-06, "loss": 0.0008, "num_tokens": 1013481.0, "reward": -0.04749999940395355, "reward_std": 0.07544313371181488, "rewards/reward_func/mean": -0.04749999940395355, "rewards/reward_func/std": 0.07544313371181488, "sampling/importance_sampling_ratio/max": 0.7759369015693665, "sampling/importance_sampling_ratio/mean": 0.48067164421081543, "sampling/importance_sampling_ratio/min": 0.21579115092754364, "sampling/sampling_logp_difference/max": 0.6043939590454102, "sampling/sampling_logp_difference/mean": 0.02018584869801998, "step": 361, "step_time": 32.27657372900285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.13641466200351715, "epoch": 0.362, "frac_reward_zero_std": 0.0, "grad_norm": 0.6214380860328674, "kl": 0.018835650756955147, "learning_rate": 3.695860681882832e-06, "loss": 0.0356, "num_tokens": 1016930.0, "reward": 0.23750001192092896, "reward_std": 0.4950673580169678, "rewards/reward_func/mean": 0.23750001192092896, "rewards/reward_func/std": 0.4950673580169678, "sampling/importance_sampling_ratio/max": 0.713340699672699, "sampling/importance_sampling_ratio/mean": 0.45140209794044495, "sampling/importance_sampling_ratio/min": 0.26424604654312134, "sampling/sampling_logp_difference/max": 0.6167413592338562, "sampling/sampling_logp_difference/mean": 0.028661642223596573, "step": 362, "step_time": 34.351921004999895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 57.5, "completions/mean_terminated_length": 57.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.18236687779426575, "epoch": 0.363, "frac_reward_zero_std": 0.0, "grad_norm": 0.7312202453613281, "kl": 0.022743677720427513, "learning_rate": 3.6887439585259693e-06, "loss": -0.1132, "num_tokens": 1019365.0, "reward": 0.7150000333786011, "reward_std": 0.5633530616760254, "rewards/reward_func/mean": 0.7150000333786011, "rewards/reward_func/std": 0.5633530616760254, "sampling/importance_sampling_ratio/max": 1.2302157878875732, "sampling/importance_sampling_ratio/mean": 0.7505608201026917, "sampling/importance_sampling_ratio/min": 0.5181044936180115, "sampling/sampling_logp_difference/max": 0.6649560928344727, "sampling/sampling_logp_difference/mean": 0.025182969868183136, "step": 363, "step_time": 21.408829763007816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.17376121878623962, "epoch": 0.364, "frac_reward_zero_std": 0.0, "grad_norm": 0.9325841665267944, "kl": 0.010413718409836292, "learning_rate": 3.6816147658074864e-06, "loss": -0.3429, "num_tokens": 1022131.0, "reward": 0.7350000143051147, "reward_std": 0.5233545303344727, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.5233545899391174, "sampling/importance_sampling_ratio/max": 1.8628607988357544, "sampling/importance_sampling_ratio/mean": 1.1008493900299072, "sampling/importance_sampling_ratio/min": 0.5260623097419739, "sampling/sampling_logp_difference/max": 0.35680532455444336, "sampling/sampling_logp_difference/mean": 0.01927400752902031, "step": 364, "step_time": 17.39398234098917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.1751767247915268, "epoch": 0.365, "frac_reward_zero_std": 0.0, "grad_norm": 1.2364414930343628, "kl": 0.01140661258250475, "learning_rate": 3.6744731785092396e-06, "loss": -0.4871, "num_tokens": 1024859.0, "reward": 0.4925000071525574, "reward_std": 0.5861384868621826, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5861384868621826, "sampling/importance_sampling_ratio/max": 1.88137948513031, "sampling/importance_sampling_ratio/mean": 1.180488109588623, "sampling/importance_sampling_ratio/min": 0.5428536534309387, "sampling/sampling_logp_difference/max": 0.3909285068511963, "sampling/sampling_logp_difference/mean": 0.019178345799446106, "step": 365, "step_time": 22.96234353003092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.17336700856685638, "epoch": 0.366, "frac_reward_zero_std": 0.0, "grad_norm": 1.5716477632522583, "kl": 0.012173735536634922, "learning_rate": 3.6673192715431016e-06, "loss": 0.1415, "num_tokens": 1027360.0, "reward": 0.48250001668930054, "reward_std": 0.5975714921951294, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5975714921951294, "sampling/importance_sampling_ratio/max": 1.6835219860076904, "sampling/importance_sampling_ratio/mean": 1.2127034664154053, "sampling/importance_sampling_ratio/min": 0.8520066142082214, "sampling/sampling_logp_difference/max": 0.4784733057022095, "sampling/sampling_logp_difference/mean": 0.018504593521356583, "step": 366, "step_time": 22.57081652805209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.1550433486700058, "epoch": 0.367, "frac_reward_zero_std": 0.0, "grad_norm": 0.9739995002746582, "kl": 0.017274178564548492, "learning_rate": 3.6601531199501715e-06, "loss": -0.2568, "num_tokens": 1030170.0, "reward": 0.48750001192092896, "reward_std": 0.5917980670928955, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5917980670928955, "sampling/importance_sampling_ratio/max": 1.552931785583496, "sampling/importance_sampling_ratio/mean": 0.7558615207672119, "sampling/importance_sampling_ratio/min": 0.3065848648548126, "sampling/sampling_logp_difference/max": 0.4559915065765381, "sampling/sampling_logp_difference/mean": 0.020614411681890488, "step": 367, "step_time": 24.205471395980567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.18777520954608917, "epoch": 0.368, "frac_reward_zero_std": 0.0, "grad_norm": 1.1970232725143433, "kl": 0.008067840710282326, "learning_rate": 3.652974798899988e-06, "loss": 0.1733, "num_tokens": 1032972.0, "reward": 0.23000000417232513, "reward_std": 0.5070831775665283, "rewards/reward_func/mean": 0.23000000417232513, "rewards/reward_func/std": 0.5070831775665283, "sampling/importance_sampling_ratio/max": 1.7239021062850952, "sampling/importance_sampling_ratio/mean": 1.2376397848129272, "sampling/importance_sampling_ratio/min": 0.6731739044189453, "sampling/sampling_logp_difference/max": 0.5554044246673584, "sampling/sampling_logp_difference/mean": 0.022544674575328827, "step": 368, "step_time": 27.654578586982097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.21973448991775513, "epoch": 0.369, "frac_reward_zero_std": 0.0, "grad_norm": 0.9220748543739319, "kl": 0.006503449287265539, "learning_rate": 3.645784383689742e-06, "loss": 0.0775, "num_tokens": 1035517.0, "reward": 0.45249998569488525, "reward_std": 0.6266511678695679, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.6266511678695679, "sampling/importance_sampling_ratio/max": 0.9215123057365417, "sampling/importance_sampling_ratio/mean": 0.8444769382476807, "sampling/importance_sampling_ratio/min": 0.7371353507041931, "sampling/sampling_logp_difference/max": 0.27382606267929077, "sampling/sampling_logp_difference/mean": 0.01445186696946621, "step": 369, "step_time": 26.782864376029465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.21189478039741516, "epoch": 0.37, "frac_reward_zero_std": 0.0, "grad_norm": 1.154143214225769, "kl": 0.008169838227331638, "learning_rate": 3.6385819497434877e-06, "loss": 0.0011, "num_tokens": 1037883.0, "reward": 0.47749999165534973, "reward_std": 0.6044487953186035, "rewards/reward_func/mean": 0.47749999165534973, "rewards/reward_func/std": 0.6044487953186035, "sampling/importance_sampling_ratio/max": 1.4487707614898682, "sampling/importance_sampling_ratio/mean": 1.1775838136672974, "sampling/importance_sampling_ratio/min": 0.8134413361549377, "sampling/sampling_logp_difference/max": 0.44367146492004395, "sampling/sampling_logp_difference/mean": 0.02093052677810192, "step": 370, "step_time": 19.5043215820333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.18765220046043396, "epoch": 0.371, "frac_reward_zero_std": 0.0, "grad_norm": 2.4134814739227295, "kl": 0.01851447857916355, "learning_rate": 3.631367572611348e-06, "loss": 0.3159, "num_tokens": 1041220.0, "reward": 0.4975000023841858, "reward_std": 0.5802513957023621, "rewards/reward_func/mean": 0.4975000023841858, "rewards/reward_func/std": 0.5802513957023621, "sampling/importance_sampling_ratio/max": 1.956591010093689, "sampling/importance_sampling_ratio/mean": 1.1217083930969238, "sampling/importance_sampling_ratio/min": 0.633980393409729, "sampling/sampling_logp_difference/max": 0.6677395701408386, "sampling/sampling_logp_difference/mean": 0.0246714036911726, "step": 371, "step_time": 29.689375291985925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.1634160876274109, "epoch": 0.372, "frac_reward_zero_std": 0.0, "grad_norm": 0.8839187026023865, "kl": 0.016868669539690018, "learning_rate": 3.6241413279687256e-06, "loss": -0.0216, "num_tokens": 1044403.0, "reward": 0.4650000035762787, "reward_std": 0.6204031109809875, "rewards/reward_func/mean": 0.4650000035762787, "rewards/reward_func/std": 0.6204031109809875, "sampling/importance_sampling_ratio/max": 0.9768995642662048, "sampling/importance_sampling_ratio/mean": 0.7806432843208313, "sampling/importance_sampling_ratio/min": 0.5837256908416748, "sampling/sampling_logp_difference/max": 0.5378838181495667, "sampling/sampling_logp_difference/mean": 0.023061394691467285, "step": 372, "step_time": 30.69561639201129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.18320967257022858, "epoch": 0.373, "frac_reward_zero_std": 0.0, "grad_norm": 0.8849806785583496, "kl": 0.0037198010832071304, "learning_rate": 3.616903291615506e-06, "loss": 0.0098, "num_tokens": 1046755.0, "reward": 0.42750000953674316, "reward_std": 0.6666020750999451, "rewards/reward_func/mean": 0.42750000953674316, "rewards/reward_func/std": 0.6666020750999451, "sampling/importance_sampling_ratio/max": 1.3146392107009888, "sampling/importance_sampling_ratio/mean": 0.840124249458313, "sampling/importance_sampling_ratio/min": 0.3749515414237976, "sampling/sampling_logp_difference/max": 0.3202456831932068, "sampling/sampling_logp_difference/mean": 0.017809590324759483, "step": 373, "step_time": 27.592172019998543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.1951667219400406, "epoch": 0.374, "frac_reward_zero_std": 0.0, "grad_norm": 0.9001742005348206, "kl": 0.021216701716184616, "learning_rate": 3.609653539475268e-06, "loss": 0.0563, "num_tokens": 1049627.0, "reward": 0.1875, "reward_std": 0.49189257621765137, "rewards/reward_func/mean": 0.1875, "rewards/reward_func/std": 0.49189260601997375, "sampling/importance_sampling_ratio/max": 1.798539400100708, "sampling/importance_sampling_ratio/mean": 1.033005952835083, "sampling/importance_sampling_ratio/min": 0.7205654978752136, "sampling/sampling_logp_difference/max": 0.5329791307449341, "sampling/sampling_logp_difference/mean": 0.02314153127372265, "step": 374, "step_time": 33.20785316900583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.16902080178260803, "epoch": 0.375, "frac_reward_zero_std": 0.0, "grad_norm": 1.4866374731063843, "kl": 0.015396352857351303, "learning_rate": 3.6023921475944795e-06, "loss": 0.0584, "num_tokens": 1052379.0, "reward": 0.20000000298023224, "reward_std": 0.5348520278930664, "rewards/reward_func/mean": 0.20000000298023224, "rewards/reward_func/std": 0.5348520278930664, "sampling/importance_sampling_ratio/max": 1.63822340965271, "sampling/importance_sampling_ratio/mean": 1.0075761079788208, "sampling/importance_sampling_ratio/min": 0.5067777037620544, "sampling/sampling_logp_difference/max": 0.5632400512695312, "sampling/sampling_logp_difference/mean": 0.02449890226125717, "step": 375, "step_time": 28.247903392999433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.2049083560705185, "epoch": 0.376, "frac_reward_zero_std": 0.0, "grad_norm": 1.3860288858413696, "kl": 0.014334267936646938, "learning_rate": 3.5951191921417063e-06, "loss": -0.2906, "num_tokens": 1055033.0, "reward": 0.9950000047683716, "reward_std": 0.005773497279733419, "rewards/reward_func/mean": 0.9950000047683716, "rewards/reward_func/std": 0.005773497279733419, "sampling/importance_sampling_ratio/max": 1.5544980764389038, "sampling/importance_sampling_ratio/mean": 0.832476019859314, "sampling/importance_sampling_ratio/min": 0.44439753890037537, "sampling/sampling_logp_difference/max": 0.5425586700439453, "sampling/sampling_logp_difference/mean": 0.021196428686380386, "step": 376, "step_time": 22.002049055008683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.11741887032985687, "epoch": 0.377, "frac_reward_zero_std": 0.0, "grad_norm": 1.3070216178894043, "kl": 0.018845703452825546, "learning_rate": 3.5878347494068083e-06, "loss": -0.0461, "num_tokens": 1057767.0, "reward": 0.4449999928474426, "reward_std": 0.6410667896270752, "rewards/reward_func/mean": 0.4449999928474426, "rewards/reward_func/std": 0.6410667896270752, "sampling/importance_sampling_ratio/max": 2.246612071990967, "sampling/importance_sampling_ratio/mean": 1.3774360418319702, "sampling/importance_sampling_ratio/min": 0.6010515689849854, "sampling/sampling_logp_difference/max": 0.6281629800796509, "sampling/sampling_logp_difference/mean": 0.01632842794060707, "step": 377, "step_time": 24.821758064965252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.17253008484840393, "epoch": 0.378, "frac_reward_zero_std": 0.0, "grad_norm": 1.5289995670318604, "kl": 0.011049321852624416, "learning_rate": 3.580538895800144e-06, "loss": -0.2151, "num_tokens": 1060366.0, "reward": 0.45499998331069946, "reward_std": 0.6326926350593567, "rewards/reward_func/mean": 0.45499998331069946, "rewards/reward_func/std": 0.6326926946640015, "sampling/importance_sampling_ratio/max": 1.3653011322021484, "sampling/importance_sampling_ratio/mean": 0.9994121789932251, "sampling/importance_sampling_ratio/min": 0.7179602384567261, "sampling/sampling_logp_difference/max": 0.43517541885375977, "sampling/sampling_logp_difference/mean": 0.017930272966623306, "step": 378, "step_time": 26.169304884038866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.17355261743068695, "epoch": 0.379, "frac_reward_zero_std": 0.0, "grad_norm": 1.3942126035690308, "kl": 0.019073203206062317, "learning_rate": 3.573231707851765e-06, "loss": -0.1511, "num_tokens": 1063441.0, "reward": 0.48750001192092896, "reward_std": 0.5917980670928955, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5917980670928955, "sampling/importance_sampling_ratio/max": 0.9193845391273499, "sampling/importance_sampling_ratio/mean": 0.7706226110458374, "sampling/importance_sampling_ratio/min": 0.5126433968544006, "sampling/sampling_logp_difference/max": 0.5120062828063965, "sampling/sampling_logp_difference/mean": 0.022825224325060844, "step": 379, "step_time": 26.87055314698955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.19090497493743896, "epoch": 0.38, "frac_reward_zero_std": 0.0, "grad_norm": 1.549770712852478, "kl": 0.007455130573362112, "learning_rate": 3.5659132622106152e-06, "loss": 0.2314, "num_tokens": 1066567.0, "reward": 0.48500001430511475, "reward_std": 0.59517502784729, "rewards/reward_func/mean": 0.48500001430511475, "rewards/reward_func/std": 0.5951750874519348, "sampling/importance_sampling_ratio/max": 1.6396021842956543, "sampling/importance_sampling_ratio/mean": 1.0665512084960938, "sampling/importance_sampling_ratio/min": 0.7346222400665283, "sampling/sampling_logp_difference/max": 0.4571352005004883, "sampling/sampling_logp_difference/mean": 0.019577497616410255, "step": 380, "step_time": 24.84273273200961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.18461297452449799, "epoch": 0.381, "frac_reward_zero_std": 0.0, "grad_norm": 0.9871307611465454, "kl": 0.01335266511887312, "learning_rate": 3.5585836356437266e-06, "loss": 0.2114, "num_tokens": 1069429.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5773502588272095, "sampling/importance_sampling_ratio/max": 1.549399971961975, "sampling/importance_sampling_ratio/mean": 0.9277427196502686, "sampling/importance_sampling_ratio/min": 0.4782666563987732, "sampling/sampling_logp_difference/max": 0.5258088111877441, "sampling/sampling_logp_difference/mean": 0.020237673074007034, "step": 381, "step_time": 30.47787551395595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.1702517867088318, "epoch": 0.382, "frac_reward_zero_std": 0.0, "grad_norm": 1.105722427368164, "kl": 0.012948703020811081, "learning_rate": 3.551242905035412e-06, "loss": 0.0566, "num_tokens": 1073020.0, "reward": 0.22500000894069672, "reward_std": 0.5107837319374084, "rewards/reward_func/mean": 0.22500000894069672, "rewards/reward_func/std": 0.5107837319374084, "sampling/importance_sampling_ratio/max": 1.3269617557525635, "sampling/importance_sampling_ratio/mean": 0.8236912488937378, "sampling/importance_sampling_ratio/min": 0.49761515855789185, "sampling/sampling_logp_difference/max": 0.3609501123428345, "sampling/sampling_logp_difference/mean": 0.021472128108143806, "step": 382, "step_time": 35.570017318998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.1873074620962143, "epoch": 0.383, "frac_reward_zero_std": 0.0, "grad_norm": 0.9811713099479675, "kl": 0.021067099645733833, "learning_rate": 3.5438911473864633e-06, "loss": -0.1157, "num_tokens": 1076462.0, "reward": 0.4925000071525574, "reward_std": 0.5860247611999512, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5860247015953064, "sampling/importance_sampling_ratio/max": 1.6275314092636108, "sampling/importance_sampling_ratio/mean": 0.8247594237327576, "sampling/importance_sampling_ratio/min": 0.4828380346298218, "sampling/sampling_logp_difference/max": 0.4131711721420288, "sampling/sampling_logp_difference/mean": 0.0262280460447073, "step": 383, "step_time": 35.00671487901127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.16057370603084564, "epoch": 0.384, "frac_reward_zero_std": 0.0, "grad_norm": 1.1424998044967651, "kl": 0.06058909744024277, "learning_rate": 3.5365284398133404e-06, "loss": -0.0974, "num_tokens": 1079196.0, "reward": 0.7124999761581421, "reward_std": 0.574999988079071, "rewards/reward_func/mean": 0.7124999761581421, "rewards/reward_func/std": 0.574999988079071, "sampling/importance_sampling_ratio/max": 0.8824173212051392, "sampling/importance_sampling_ratio/mean": 0.8071755170822144, "sampling/importance_sampling_ratio/min": 0.744614839553833, "sampling/sampling_logp_difference/max": 0.3392900824546814, "sampling/sampling_logp_difference/mean": 0.013692907057702541, "step": 384, "step_time": 16.42252398596611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 57.75, "completions/mean_terminated_length": 57.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.15960726141929626, "epoch": 0.385, "frac_reward_zero_std": 0.0, "grad_norm": 0.528042197227478, "kl": 0.013345170766115189, "learning_rate": 3.52915485954736e-06, "loss": -0.229, "num_tokens": 1081850.0, "reward": 0.7475000023841858, "reward_std": 0.4983556270599365, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.4983556270599365, "sampling/importance_sampling_ratio/max": 1.2556705474853516, "sampling/importance_sampling_ratio/mean": 0.8194347620010376, "sampling/importance_sampling_ratio/min": 0.4233543574810028, "sampling/sampling_logp_difference/max": 0.6411216259002686, "sampling/sampling_logp_difference/mean": 0.01721111685037613, "step": 385, "step_time": 19.69826082699001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.14118431508541107, "epoch": 0.386, "frac_reward_zero_std": 0.0, "grad_norm": 1.089905858039856, "kl": 0.013371966779232025, "learning_rate": 3.521770483933891e-06, "loss": 0.2191, "num_tokens": 1084349.0, "reward": 0.4650000035762787, "reward_std": 0.6126717329025269, "rewards/reward_func/mean": 0.4650000035762787, "rewards/reward_func/std": 0.6126717329025269, "sampling/importance_sampling_ratio/max": 1.530951738357544, "sampling/importance_sampling_ratio/mean": 1.0204105377197266, "sampling/importance_sampling_ratio/min": 0.553114652633667, "sampling/sampling_logp_difference/max": 0.4593997001647949, "sampling/sampling_logp_difference/mean": 0.015425493009388447, "step": 386, "step_time": 19.046076595957857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 63.5, "completions/mean_terminated_length": 63.5, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.2142651528120041, "epoch": 0.387, "frac_reward_zero_std": 0.0, "grad_norm": 0.8082420825958252, "kl": 0.012878898531198502, "learning_rate": 3.514375390431539e-06, "loss": -0.173, "num_tokens": 1087533.0, "reward": 0.9549999833106995, "reward_std": 0.08346657454967499, "rewards/reward_func/mean": 0.9549999833106995, "rewards/reward_func/std": 0.08346656709909439, "sampling/importance_sampling_ratio/max": 1.4651248455047607, "sampling/importance_sampling_ratio/mean": 0.8976165056228638, "sampling/importance_sampling_ratio/min": 0.5406602025032043, "sampling/sampling_logp_difference/max": 0.38579654693603516, "sampling/sampling_logp_difference/mean": 0.024349762126803398, "step": 387, "step_time": 27.138454746978823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.19480647146701813, "epoch": 0.388, "frac_reward_zero_std": 0.0, "grad_norm": 0.7227767705917358, "kl": 0.01165311224758625, "learning_rate": 3.5069696566113347e-06, "loss": -0.2149, "num_tokens": 1090439.0, "reward": 0.7174999713897705, "reward_std": 0.4814128577709198, "rewards/reward_func/mean": 0.7174999713897705, "rewards/reward_func/std": 0.4814128577709198, "sampling/importance_sampling_ratio/max": 1.2608762979507446, "sampling/importance_sampling_ratio/mean": 0.8780190348625183, "sampling/importance_sampling_ratio/min": 0.5550334453582764, "sampling/sampling_logp_difference/max": 0.42597687244415283, "sampling/sampling_logp_difference/mean": 0.016529759392142296, "step": 388, "step_time": 30.15476765600033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.18826434016227722, "epoch": 0.389, "frac_reward_zero_std": 0.0, "grad_norm": 2.7948060035705566, "kl": 0.014595668762922287, "learning_rate": 3.499553360155923e-06, "loss": -0.8413, "num_tokens": 1093498.0, "reward": 0.24000000953674316, "reward_std": 0.5066885948181152, "rewards/reward_func/mean": 0.24000000953674316, "rewards/reward_func/std": 0.5066885948181152, "sampling/importance_sampling_ratio/max": 2.6444642543792725, "sampling/importance_sampling_ratio/mean": 1.3944354057312012, "sampling/importance_sampling_ratio/min": 0.5688008069992065, "sampling/sampling_logp_difference/max": 0.5762366056442261, "sampling/sampling_logp_difference/mean": 0.021283309906721115, "step": 389, "step_time": 25.115495408012066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.19184885919094086, "epoch": 0.39, "frac_reward_zero_std": 0.0, "grad_norm": 1.076230764389038, "kl": 0.01152780931442976, "learning_rate": 3.4921265788587432e-06, "loss": 0.1135, "num_tokens": 1096202.0, "reward": 0.45499998331069946, "reward_std": 0.6238856911659241, "rewards/reward_func/mean": 0.45499998331069946, "rewards/reward_func/std": 0.6238856911659241, "sampling/importance_sampling_ratio/max": 1.183936595916748, "sampling/importance_sampling_ratio/mean": 0.8780079483985901, "sampling/importance_sampling_ratio/min": 0.6814122200012207, "sampling/sampling_logp_difference/max": 0.4658985137939453, "sampling/sampling_logp_difference/mean": 0.024866536259651184, "step": 390, "step_time": 24.32982387096854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.19940578937530518, "epoch": 0.391, "frac_reward_zero_std": 0.0, "grad_norm": 0.8423588871955872, "kl": 0.008173727430403233, "learning_rate": 3.484689390623218e-06, "loss": 0.0342, "num_tokens": 1099343.0, "reward": 0.2199999988079071, "reward_std": 0.5201922655105591, "rewards/reward_func/mean": 0.2199999988079071, "rewards/reward_func/std": 0.5201922655105591, "sampling/importance_sampling_ratio/max": 1.2359317541122437, "sampling/importance_sampling_ratio/mean": 0.7610199451446533, "sampling/importance_sampling_ratio/min": 0.42562541365623474, "sampling/sampling_logp_difference/max": 0.6401360034942627, "sampling/sampling_logp_difference/mean": 0.022527404129505157, "step": 391, "step_time": 34.160252500965726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.21434739232063293, "epoch": 0.392, "frac_reward_zero_std": 0.0, "grad_norm": 1.1383732557296753, "kl": 0.01180187612771988, "learning_rate": 3.4772418734619325e-06, "loss": 0.1871, "num_tokens": 1101774.0, "reward": 0.1550000011920929, "reward_std": 0.5435991287231445, "rewards/reward_func/mean": 0.1550000011920929, "rewards/reward_func/std": 0.5435991287231445, "sampling/importance_sampling_ratio/max": 1.429150938987732, "sampling/importance_sampling_ratio/mean": 0.9047324657440186, "sampling/importance_sampling_ratio/min": 0.6350069046020508, "sampling/sampling_logp_difference/max": 0.35371971130371094, "sampling/sampling_logp_difference/mean": 0.019119998440146446, "step": 392, "step_time": 28.64637309202226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.20441976189613342, "epoch": 0.393, "frac_reward_zero_std": 0.0, "grad_norm": 1.4917439222335815, "kl": 0.007794365752488375, "learning_rate": 3.4697841054958163e-06, "loss": 0.0572, "num_tokens": 1104646.0, "reward": 0.7124999761581421, "reward_std": 0.5483536124229431, "rewards/reward_func/mean": 0.7124999761581421, "rewards/reward_func/std": 0.5483536124229431, "sampling/importance_sampling_ratio/max": 1.4354469776153564, "sampling/importance_sampling_ratio/mean": 1.0428463220596313, "sampling/importance_sampling_ratio/min": 0.725904107093811, "sampling/sampling_logp_difference/max": 0.2969186305999756, "sampling/sampling_logp_difference/mean": 0.016633160412311554, "step": 393, "step_time": 24.66445926501183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.16488778591156006, "epoch": 0.394, "frac_reward_zero_std": 0.0, "grad_norm": 1.8049037456512451, "kl": 0.01617872528731823, "learning_rate": 3.4623161649533284e-06, "loss": 0.1404, "num_tokens": 1107482.0, "reward": 0.7400000095367432, "reward_std": 0.5199999809265137, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5200000405311584, "sampling/importance_sampling_ratio/max": 1.3772903680801392, "sampling/importance_sampling_ratio/mean": 0.9450259208679199, "sampling/importance_sampling_ratio/min": 0.3407703638076782, "sampling/sampling_logp_difference/max": 0.610370397567749, "sampling/sampling_logp_difference/mean": 0.02506442368030548, "step": 394, "step_time": 12.062467945972458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 58.25, "completions/mean_terminated_length": 58.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.22006820142269135, "epoch": 0.395, "frac_reward_zero_std": 0.0, "grad_norm": 2.0660171508789062, "kl": 0.00955228228121996, "learning_rate": 3.4548381301696298e-06, "loss": 0.6532, "num_tokens": 1110017.0, "reward": 0.4674999713897705, "reward_std": 0.6152167916297913, "rewards/reward_func/mean": 0.4674999713897705, "rewards/reward_func/std": 0.6152167916297913, "sampling/importance_sampling_ratio/max": 2.7660470008850098, "sampling/importance_sampling_ratio/mean": 1.2794352769851685, "sampling/importance_sampling_ratio/min": 0.5871570706367493, "sampling/sampling_logp_difference/max": 0.6263835430145264, "sampling/sampling_logp_difference/mean": 0.02425001747906208, "step": 395, "step_time": 21.532297872006893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.15717606246471405, "epoch": 0.396, "frac_reward_zero_std": 0.0, "grad_norm": 1.4786607027053833, "kl": 0.011599971912801266, "learning_rate": 3.4473500795857674e-06, "loss": 0.0856, "num_tokens": 1112529.0, "reward": 0.4925000071525574, "reward_std": 0.5861384868621826, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5861384868621826, "sampling/importance_sampling_ratio/max": 1.4786831140518188, "sampling/importance_sampling_ratio/mean": 1.084773302078247, "sampling/importance_sampling_ratio/min": 0.7607637643814087, "sampling/sampling_logp_difference/max": 0.2931593060493469, "sampling/sampling_logp_difference/mean": 0.01693398877978325, "step": 396, "step_time": 23.326487703016028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 57.5, "completions/mean_terminated_length": 57.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.19288864731788635, "epoch": 0.397, "frac_reward_zero_std": 0.0, "grad_norm": 0.9499870538711548, "kl": 0.008061782456934452, "learning_rate": 3.4398520917478478e-06, "loss": 0.0209, "num_tokens": 1115557.0, "reward": 0.19500000774860382, "reward_std": 0.5404011011123657, "rewards/reward_func/mean": 0.19500000774860382, "rewards/reward_func/std": 0.5404011011123657, "sampling/importance_sampling_ratio/max": 1.0636721849441528, "sampling/importance_sampling_ratio/mean": 0.9281097650527954, "sampling/importance_sampling_ratio/min": 0.7292148470878601, "sampling/sampling_logp_difference/max": 0.6591765880584717, "sampling/sampling_logp_difference/mean": 0.02257619798183441, "step": 397, "step_time": 35.80613448203076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.17203356325626373, "epoch": 0.398, "frac_reward_zero_std": 1.0, "grad_norm": 0.01310775801539421, "kl": 0.013430298306047916, "learning_rate": 3.4323442453062173e-06, "loss": 0.0001, "num_tokens": 1118311.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7286064624786377, "sampling/importance_sampling_ratio/mean": 1.196413278579712, "sampling/importance_sampling_ratio/min": 0.8802234530448914, "sampling/sampling_logp_difference/max": 0.32708340883255005, "sampling/sampling_logp_difference/mean": 0.01674327626824379, "step": 398, "step_time": 14.224776286981069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.20256562530994415, "epoch": 0.399, "frac_reward_zero_std": 0.0, "grad_norm": 1.0486704111099243, "kl": 0.013046537525951862, "learning_rate": 3.4248266190146307e-06, "loss": 0.1286, "num_tokens": 1121021.0, "reward": 0.7050000429153442, "reward_std": 0.5701754093170166, "rewards/reward_func/mean": 0.7050000429153442, "rewards/reward_func/std": 0.5701754093170166, "sampling/importance_sampling_ratio/max": 1.1218959093093872, "sampling/importance_sampling_ratio/mean": 0.7170436382293701, "sampling/importance_sampling_ratio/min": 0.3158906400203705, "sampling/sampling_logp_difference/max": 0.5914814472198486, "sampling/sampling_logp_difference/mean": 0.02777245081961155, "step": 399, "step_time": 24.702410023019183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.184691920876503, "epoch": 0.4, "frac_reward_zero_std": 0.0, "grad_norm": 1.0994385480880737, "kl": 0.008174250833690166, "learning_rate": 3.417299291729431e-06, "loss": -0.1262, "num_tokens": 1123417.0, "reward": 0.18000000715255737, "reward_std": 0.5404319167137146, "rewards/reward_func/mean": 0.18000000715255737, "rewards/reward_func/std": 0.5404319167137146, "sampling/importance_sampling_ratio/max": 1.2939773797988892, "sampling/importance_sampling_ratio/mean": 0.883842408657074, "sampling/importance_sampling_ratio/min": 0.5669020414352417, "sampling/sampling_logp_difference/max": 0.43274223804473877, "sampling/sampling_logp_difference/mean": 0.016977887600660324, "step": 400, "step_time": 25.887663036002778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.17350856959819794, "epoch": 0.401, "frac_reward_zero_std": 0.0, "grad_norm": 1.981282353401184, "kl": 0.013754883781075478, "learning_rate": 3.4097623424087196e-06, "loss": 0.8352, "num_tokens": 1126853.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5773502588272095, "sampling/importance_sampling_ratio/max": 2.9755170345306396, "sampling/importance_sampling_ratio/mean": 1.5757747888565063, "sampling/importance_sampling_ratio/min": 0.6507756114006042, "sampling/sampling_logp_difference/max": 0.6103277206420898, "sampling/sampling_logp_difference/mean": 0.02276819571852684, "step": 401, "step_time": 29.58999710099306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.2085791677236557, "epoch": 0.402, "frac_reward_zero_std": 0.0, "grad_norm": 1.696266531944275, "kl": 0.01851225085556507, "learning_rate": 3.4022158501115283e-06, "loss": -0.0018, "num_tokens": 1129949.0, "reward": 0.7325000166893005, "reward_std": 0.4952692985534668, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.4952692985534668, "sampling/importance_sampling_ratio/max": 1.9577833414077759, "sampling/importance_sampling_ratio/mean": 1.2906630039215088, "sampling/importance_sampling_ratio/min": 0.38272997736930847, "sampling/sampling_logp_difference/max": 0.9501994848251343, "sampling/sampling_logp_difference/mean": 0.026440078392624855, "step": 402, "step_time": 28.12618389102863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.18392504751682281, "epoch": 0.403, "frac_reward_zero_std": 0.0, "grad_norm": 2.2606561183929443, "kl": 0.013010360300540924, "learning_rate": 3.39465989399699e-06, "loss": -0.5321, "num_tokens": 1132338.0, "reward": 0.4424999952316284, "reward_std": 0.6440690755844116, "rewards/reward_func/mean": 0.4424999952316284, "rewards/reward_func/std": 0.6440691351890564, "sampling/importance_sampling_ratio/max": 2.6831576824188232, "sampling/importance_sampling_ratio/mean": 1.5712738037109375, "sampling/importance_sampling_ratio/min": 0.8517704010009766, "sampling/sampling_logp_difference/max": 0.42365944385528564, "sampling/sampling_logp_difference/mean": 0.023412903770804405, "step": 403, "step_time": 21.7189172670478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.14150793850421906, "epoch": 0.404, "frac_reward_zero_std": 0.0, "grad_norm": 1.8395705223083496, "kl": 0.02101302146911621, "learning_rate": 3.3870945533235104e-06, "loss": 0.0726, "num_tokens": 1135113.0, "reward": 0.7325000166893005, "reward_std": 0.5350000262260437, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5349999666213989, "sampling/importance_sampling_ratio/max": 2.175739049911499, "sampling/importance_sampling_ratio/mean": 1.2809014320373535, "sampling/importance_sampling_ratio/min": 0.6664741635322571, "sampling/sampling_logp_difference/max": 0.4918487071990967, "sampling/sampling_logp_difference/mean": 0.01728738285601139, "step": 404, "step_time": 25.933103664021473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 58.0, "completions/mean_terminated_length": 58.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.17913362383842468, "epoch": 0.405, "frac_reward_zero_std": 0.0, "grad_norm": 1.1731756925582886, "kl": 0.01274901907891035, "learning_rate": 3.3795199074479312e-06, "loss": 0.0648, "num_tokens": 1137463.0, "reward": 0.4325000047683716, "reward_std": 0.6554069519042969, "rewards/reward_func/mean": 0.4325000047683716, "rewards/reward_func/std": 0.6554070115089417, "sampling/importance_sampling_ratio/max": 1.1376131772994995, "sampling/importance_sampling_ratio/mean": 0.9263861179351807, "sampling/importance_sampling_ratio/min": 0.749922513961792, "sampling/sampling_logp_difference/max": 0.4268850088119507, "sampling/sampling_logp_difference/mean": 0.01855890452861786, "step": 405, "step_time": 21.311677350953687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.17304542660713196, "epoch": 0.406, "frac_reward_zero_std": 0.0, "grad_norm": 1.1238586902618408, "kl": 0.009351055137813091, "learning_rate": 3.3719360358247054e-06, "loss": -0.0771, "num_tokens": 1140079.0, "reward": 0.7400000095367432, "reward_std": 0.5199999809265137, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5200000405311584, "sampling/importance_sampling_ratio/max": 1.5342347621917725, "sampling/importance_sampling_ratio/mean": 1.0851131677627563, "sampling/importance_sampling_ratio/min": 0.5945015549659729, "sampling/sampling_logp_difference/max": 0.3694108724594116, "sampling/sampling_logp_difference/mean": 0.018196124583482742, "step": 406, "step_time": 19.72663107304834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.18855470418930054, "epoch": 0.407, "frac_reward_zero_std": 0.0, "grad_norm": 1.2877403497695923, "kl": 0.020471690222620964, "learning_rate": 3.3643430180050573e-06, "loss": -0.4103, "num_tokens": 1142666.0, "reward": 0.7200000286102295, "reward_std": 0.5336666107177734, "rewards/reward_func/mean": 0.7200000286102295, "rewards/reward_func/std": 0.5336665511131287, "sampling/importance_sampling_ratio/max": 2.560563564300537, "sampling/importance_sampling_ratio/mean": 1.0040044784545898, "sampling/importance_sampling_ratio/min": 0.33224251866340637, "sampling/sampling_logp_difference/max": 1.067708969116211, "sampling/sampling_logp_difference/mean": 0.03308543935418129, "step": 407, "step_time": 15.52750147599727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1751941293478012, "epoch": 0.408, "frac_reward_zero_std": 0.0, "grad_norm": 1.568233847618103, "kl": 0.013900411315262318, "learning_rate": 3.3567409336361502e-06, "loss": -0.6409, "num_tokens": 1145343.0, "reward": 0.4975000023841858, "reward_std": 0.5802513957023621, "rewards/reward_func/mean": 0.4975000023841858, "rewards/reward_func/std": 0.5802513957023621, "sampling/importance_sampling_ratio/max": 2.25968861579895, "sampling/importance_sampling_ratio/mean": 1.3579943180084229, "sampling/importance_sampling_ratio/min": 0.45420321822166443, "sampling/sampling_logp_difference/max": 0.9019727110862732, "sampling/sampling_logp_difference/mean": 0.02441788837313652, "step": 408, "step_time": 19.71322906302521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.17237667739391327, "epoch": 0.409, "frac_reward_zero_std": 0.0, "grad_norm": 1.574562907218933, "kl": 0.009185725823044777, "learning_rate": 3.3491298624602514e-06, "loss": -0.3125, "num_tokens": 1148125.0, "reward": 0.17000000178813934, "reward_std": 0.5412947535514832, "rewards/reward_func/mean": 0.17000000178813934, "rewards/reward_func/std": 0.5412947535514832, "sampling/importance_sampling_ratio/max": 1.678153395652771, "sampling/importance_sampling_ratio/mean": 1.1256051063537598, "sampling/importance_sampling_ratio/min": 0.5961788296699524, "sampling/sampling_logp_difference/max": 0.2204127311706543, "sampling/sampling_logp_difference/mean": 0.014151177369058132, "step": 409, "step_time": 34.661781869013794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.18380093574523926, "epoch": 0.41, "frac_reward_zero_std": 0.0, "grad_norm": 2.313466787338257, "kl": 0.01990172453224659, "learning_rate": 3.3415098843138972e-06, "loss": 0.5663, "num_tokens": 1151378.0, "reward": 0.4599999785423279, "reward_std": 0.6237520575523376, "rewards/reward_func/mean": 0.4599999785423279, "rewards/reward_func/std": 0.6237521171569824, "sampling/importance_sampling_ratio/max": 2.9580395221710205, "sampling/importance_sampling_ratio/mean": 1.1903181076049805, "sampling/importance_sampling_ratio/min": 0.5987442135810852, "sampling/sampling_logp_difference/max": 0.6924982070922852, "sampling/sampling_logp_difference/mean": 0.01830255426466465, "step": 410, "step_time": 31.639083153975662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.20008501410484314, "epoch": 0.411, "frac_reward_zero_std": 0.0, "grad_norm": 1.0733983516693115, "kl": 0.01456481497734785, "learning_rate": 3.333881079127052e-06, "loss": -0.4213, "num_tokens": 1154190.0, "reward": 0.4724999964237213, "reward_std": 0.6047244071960449, "rewards/reward_func/mean": 0.4724999964237213, "rewards/reward_func/std": 0.6047244668006897, "sampling/importance_sampling_ratio/max": 1.6890958547592163, "sampling/importance_sampling_ratio/mean": 0.8984994888305664, "sampling/importance_sampling_ratio/min": 0.3304322063922882, "sampling/sampling_logp_difference/max": 0.6848804950714111, "sampling/sampling_logp_difference/mean": 0.02761130966246128, "step": 411, "step_time": 20.2441977999988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.1885094791650772, "epoch": 0.412, "frac_reward_zero_std": 0.0, "grad_norm": 0.9934810400009155, "kl": 0.014622784219682217, "learning_rate": 3.326243526922272e-06, "loss": -0.1651, "num_tokens": 1157353.0, "reward": 0.987500011920929, "reward_std": 0.02499999664723873, "rewards/reward_func/mean": 0.987500011920929, "rewards/reward_func/std": 0.025000005960464478, "sampling/importance_sampling_ratio/max": 1.6981385946273804, "sampling/importance_sampling_ratio/mean": 0.9352574348449707, "sampling/importance_sampling_ratio/min": 0.6347556710243225, "sampling/sampling_logp_difference/max": 0.5635101795196533, "sampling/sampling_logp_difference/mean": 0.021851930767297745, "step": 412, "step_time": 24.14433729200391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.21593183279037476, "epoch": 0.413, "frac_reward_zero_std": 0.0, "grad_norm": 0.7647023797035217, "kl": 0.022160032764077187, "learning_rate": 3.3185973078138665e-06, "loss": -0.1909, "num_tokens": 1160611.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 1.1115139722824097, "sampling/importance_sampling_ratio/mean": 0.6738185882568359, "sampling/importance_sampling_ratio/min": 0.23636533319950104, "sampling/sampling_logp_difference/max": 0.7055172920227051, "sampling/sampling_logp_difference/mean": 0.02442113868892193, "step": 413, "step_time": 25.213386025978252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1816244125366211, "epoch": 0.414, "frac_reward_zero_std": 0.0, "grad_norm": 1.295897126197815, "kl": 0.013486105017364025, "learning_rate": 3.3109425020070564e-06, "loss": 0.1982, "num_tokens": 1163243.0, "reward": 0.22749999165534973, "reward_std": 0.5086829662322998, "rewards/reward_func/mean": 0.22749999165534973, "rewards/reward_func/std": 0.5086829662322998, "sampling/importance_sampling_ratio/max": 1.416931390762329, "sampling/importance_sampling_ratio/mean": 0.9034465551376343, "sampling/importance_sampling_ratio/min": 0.522864818572998, "sampling/sampling_logp_difference/max": 0.41960668563842773, "sampling/sampling_logp_difference/mean": 0.021167315542697906, "step": 414, "step_time": 33.43888196098851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.14779776334762573, "epoch": 0.415, "frac_reward_zero_std": 0.0, "grad_norm": 1.3686352968215942, "kl": 0.034022629261016846, "learning_rate": 3.3032791897971313e-06, "loss": -0.0724, "num_tokens": 1166110.0, "reward": 0.7274999618530273, "reward_std": 0.5450000166893005, "rewards/reward_func/mean": 0.7274999618530273, "rewards/reward_func/std": 0.5450000166893005, "sampling/importance_sampling_ratio/max": 1.2460366487503052, "sampling/importance_sampling_ratio/mean": 1.0311167240142822, "sampling/importance_sampling_ratio/min": 0.7128075361251831, "sampling/sampling_logp_difference/max": 0.4975733757019043, "sampling/sampling_logp_difference/mean": 0.02013254165649414, "step": 415, "step_time": 16.53204043698497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 58.25, "completions/mean_terminated_length": 58.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.1835378259420395, "epoch": 0.416, "frac_reward_zero_std": 0.0, "grad_norm": 0.9966428875923157, "kl": 0.01061837375164032, "learning_rate": 3.2956074515686105e-06, "loss": -0.3726, "num_tokens": 1168924.0, "reward": -0.08249999582767487, "reward_std": 0.08098354190587997, "rewards/reward_func/mean": -0.08249999582767487, "rewards/reward_func/std": 0.08098353445529938, "sampling/importance_sampling_ratio/max": 1.373403787612915, "sampling/importance_sampling_ratio/mean": 0.834275484085083, "sampling/importance_sampling_ratio/min": 0.31483566761016846, "sampling/sampling_logp_difference/max": 0.6974654197692871, "sampling/sampling_logp_difference/mean": 0.026441434398293495, "step": 416, "step_time": 37.5055824269657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.14321918785572052, "epoch": 0.417, "frac_reward_zero_std": 0.0, "grad_norm": 1.3877036571502686, "kl": 0.018968133255839348, "learning_rate": 3.2879273677943972e-06, "loss": -0.3618, "num_tokens": 1172026.0, "reward": 0.4975000023841858, "reward_std": 0.5802513957023621, "rewards/reward_func/mean": 0.4975000023841858, "rewards/reward_func/std": 0.5802513957023621, "sampling/importance_sampling_ratio/max": 2.2938640117645264, "sampling/importance_sampling_ratio/mean": 1.0610339641571045, "sampling/importance_sampling_ratio/min": 0.4484946131706238, "sampling/sampling_logp_difference/max": 0.9790163040161133, "sampling/sampling_logp_difference/mean": 0.02530120313167572, "step": 417, "step_time": 31.41495476401178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.14992672204971313, "epoch": 0.418, "frac_reward_zero_std": 0.0, "grad_norm": 0.753036618232727, "kl": 0.02462541125714779, "learning_rate": 3.2802390190349364e-06, "loss": 0.0759, "num_tokens": 1175137.0, "reward": 0.7275000214576721, "reward_std": 0.5317502617835999, "rewards/reward_func/mean": 0.7275000214576721, "rewards/reward_func/std": 0.5317502617835999, "sampling/importance_sampling_ratio/max": 1.2481024265289307, "sampling/importance_sampling_ratio/mean": 0.9410421252250671, "sampling/importance_sampling_ratio/min": 0.5725758671760559, "sampling/sampling_logp_difference/max": 0.659196138381958, "sampling/sampling_logp_difference/mean": 0.014091772958636284, "step": 418, "step_time": 24.08223919098964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.24218858778476715, "epoch": 0.419, "frac_reward_zero_std": 0.0, "grad_norm": 2.1772663593292236, "kl": 0.01279524713754654, "learning_rate": 3.272542485937369e-06, "loss": -0.2383, "num_tokens": 1178245.0, "reward": 0.45250001549720764, "reward_std": 0.6063758730888367, "rewards/reward_func/mean": 0.45250001549720764, "rewards/reward_func/std": 0.6063758730888367, "sampling/importance_sampling_ratio/max": 2.499350070953369, "sampling/importance_sampling_ratio/mean": 1.4136929512023926, "sampling/importance_sampling_ratio/min": 0.7487847208976746, "sampling/sampling_logp_difference/max": 0.44135117530822754, "sampling/sampling_logp_difference/mean": 0.027951395139098167, "step": 419, "step_time": 31.583855513017625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.18253064155578613, "epoch": 0.42, "frac_reward_zero_std": 0.0, "grad_norm": 2.5164177417755127, "kl": 0.011706927791237831, "learning_rate": 3.264837849234685e-06, "loss": 0.4744, "num_tokens": 1181450.0, "reward": 0.7425000071525574, "reward_std": 0.5083552002906799, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.5083552002906799, "sampling/importance_sampling_ratio/max": 2.3121345043182373, "sampling/importance_sampling_ratio/mean": 1.3182141780853271, "sampling/importance_sampling_ratio/min": 0.616348385810852, "sampling/sampling_logp_difference/max": 0.5747531652450562, "sampling/sampling_logp_difference/mean": 0.018289485946297646, "step": 420, "step_time": 28.404757851036265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 57.5, "completions/mean_terminated_length": 57.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.2051432728767395, "epoch": 0.421, "frac_reward_zero_std": 0.0, "grad_norm": 0.5034323334693909, "kl": 0.008445397019386292, "learning_rate": 3.257125189744877e-06, "loss": -0.1071, "num_tokens": 1184161.0, "reward": 0.7150000333786011, "reward_std": 0.5633530616760254, "rewards/reward_func/mean": 0.7150000333786011, "rewards/reward_func/std": 0.5633530616760254, "sampling/importance_sampling_ratio/max": 0.9404759407043457, "sampling/importance_sampling_ratio/mean": 0.6898060441017151, "sampling/importance_sampling_ratio/min": 0.4551492929458618, "sampling/sampling_logp_difference/max": 0.5505709648132324, "sampling/sampling_logp_difference/mean": 0.019149456173181534, "step": 421, "step_time": 21.92337343201507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.1616276502609253, "epoch": 0.422, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671867251396179, "kl": 0.012954740785062313, "learning_rate": 3.249404588370095e-06, "loss": -0.334, "num_tokens": 1186484.0, "reward": 0.7400000095367432, "reward_std": 0.5199999809265137, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5200000405311584, "sampling/importance_sampling_ratio/max": 1.643309473991394, "sampling/importance_sampling_ratio/mean": 0.9076956510543823, "sampling/importance_sampling_ratio/min": 0.36759328842163086, "sampling/sampling_logp_difference/max": 0.5657436847686768, "sampling/sampling_logp_difference/mean": 0.022725669667124748, "step": 422, "step_time": 14.13331703300355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.14682143926620483, "epoch": 0.423, "frac_reward_zero_std": 0.0, "grad_norm": 1.3182878494262695, "kl": 0.02331027202308178, "learning_rate": 3.2416761260957925e-06, "loss": 0.334, "num_tokens": 1189429.0, "reward": 0.737500011920929, "reward_std": 0.5050659775733948, "rewards/reward_func/mean": 0.737500011920929, "rewards/reward_func/std": 0.5050659775733948, "sampling/importance_sampling_ratio/max": 1.4722726345062256, "sampling/importance_sampling_ratio/mean": 0.8051702976226807, "sampling/importance_sampling_ratio/min": 0.27855026721954346, "sampling/sampling_logp_difference/max": 0.7614901661872864, "sampling/sampling_logp_difference/mean": 0.02573915384709835, "step": 423, "step_time": 27.09744582301937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1699029952287674, "epoch": 0.424, "frac_reward_zero_std": 0.0, "grad_norm": 0.6649200916290283, "kl": 0.01759985461831093, "learning_rate": 3.233939883989882e-06, "loss": -0.058, "num_tokens": 1192233.0, "reward": 0.7024999856948853, "reward_std": 0.5883522629737854, "rewards/reward_func/mean": 0.7024999856948853, "rewards/reward_func/std": 0.5883522629737854, "sampling/importance_sampling_ratio/max": 0.9705274105072021, "sampling/importance_sampling_ratio/mean": 0.7915598154067993, "sampling/importance_sampling_ratio/min": 0.6766921877861023, "sampling/sampling_logp_difference/max": 0.5734982490539551, "sampling/sampling_logp_difference/mean": 0.01643470674753189, "step": 424, "step_time": 21.18876576900948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.1905723661184311, "epoch": 0.425, "frac_reward_zero_std": 0.0, "grad_norm": 0.8266260623931885, "kl": 0.010311496444046497, "learning_rate": 3.2261959432018834e-06, "loss": -0.2075, "num_tokens": 1194976.0, "reward": 0.9975000023841858, "reward_std": 0.004999995231628418, "rewards/reward_func/mean": 0.9975000023841858, "rewards/reward_func/std": 0.004999995231628418, "sampling/importance_sampling_ratio/max": 1.6429001092910767, "sampling/importance_sampling_ratio/mean": 0.9485486745834351, "sampling/importance_sampling_ratio/min": 0.4286785125732422, "sampling/sampling_logp_difference/max": 0.34793710708618164, "sampling/sampling_logp_difference/mean": 0.022727351635694504, "step": 425, "step_time": 19.505853572976775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1744275987148285, "epoch": 0.426, "frac_reward_zero_std": 0.0, "grad_norm": 1.0066845417022705, "kl": 0.020252861082553864, "learning_rate": 3.218444384962071e-06, "loss": -0.1989, "num_tokens": 1197841.0, "reward": 0.23000000417232513, "reward_std": 0.5133549571037292, "rewards/reward_func/mean": 0.23000000417232513, "rewards/reward_func/std": 0.513355016708374, "sampling/importance_sampling_ratio/max": 1.1440238952636719, "sampling/importance_sampling_ratio/mean": 0.8823443055152893, "sampling/importance_sampling_ratio/min": 0.3196568787097931, "sampling/sampling_logp_difference/max": 0.5132656097412109, "sampling/sampling_logp_difference/mean": 0.020208690315485, "step": 426, "step_time": 29.83726480603218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 60.0, "completions/mean_terminated_length": 60.0, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.23464569449424744, "epoch": 0.427, "frac_reward_zero_std": 0.0, "grad_norm": 1.4916280508041382, "kl": 0.011275283060967922, "learning_rate": 3.210685290580622e-06, "loss": 0.0234, "num_tokens": 1200325.0, "reward": 0.7175000309944153, "reward_std": 0.4786352217197418, "rewards/reward_func/mean": 0.7175000309944153, "rewards/reward_func/std": 0.4786352217197418, "sampling/importance_sampling_ratio/max": 1.640309453010559, "sampling/importance_sampling_ratio/mean": 1.1747007369995117, "sampling/importance_sampling_ratio/min": 0.7040851712226868, "sampling/sampling_logp_difference/max": 0.36908257007598877, "sampling/sampling_logp_difference/mean": 0.024262448772788048, "step": 427, "step_time": 21.56584767898312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.1757877916097641, "epoch": 0.428, "frac_reward_zero_std": 0.0, "grad_norm": 0.959442675113678, "kl": 0.009840963408350945, "learning_rate": 3.2029187414467645e-06, "loss": 0.05, "num_tokens": 1203239.0, "reward": 0.22749999165534973, "reward_std": 0.5154528617858887, "rewards/reward_func/mean": 0.22749999165534973, "rewards/reward_func/std": 0.5154529213905334, "sampling/importance_sampling_ratio/max": 1.8460829257965088, "sampling/importance_sampling_ratio/mean": 1.0033972263336182, "sampling/importance_sampling_ratio/min": 0.6472733020782471, "sampling/sampling_logp_difference/max": 0.4040954113006592, "sampling/sampling_logp_difference/mean": 0.014188762754201889, "step": 428, "step_time": 30.86460370401619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.1875738501548767, "epoch": 0.429, "frac_reward_zero_std": 0.0, "grad_norm": 1.4470330476760864, "kl": 0.014530408196151257, "learning_rate": 3.1951448190279256e-06, "loss": 0.1976, "num_tokens": 1205863.0, "reward": 0.4675000011920929, "reward_std": 0.6091181635856628, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.6091182231903076, "sampling/importance_sampling_ratio/max": 1.6635842323303223, "sampling/importance_sampling_ratio/mean": 1.14644193649292, "sampling/importance_sampling_ratio/min": 0.5210554003715515, "sampling/sampling_logp_difference/max": 0.5488607883453369, "sampling/sampling_logp_difference/mean": 0.026224778965115547, "step": 429, "step_time": 21.628061090013944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.15557315945625305, "epoch": 0.43, "frac_reward_zero_std": 0.0, "grad_norm": 0.8926362991333008, "kl": 0.017851298674941063, "learning_rate": 3.1873636048688714e-06, "loss": 0.0712, "num_tokens": 1208689.0, "reward": 0.48750001192092896, "reward_std": 0.568763256072998, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5687633156776428, "sampling/importance_sampling_ratio/max": 1.1969101428985596, "sampling/importance_sampling_ratio/mean": 0.790440559387207, "sampling/importance_sampling_ratio/min": 0.5136198401451111, "sampling/sampling_logp_difference/max": 0.3933427333831787, "sampling/sampling_logp_difference/mean": 0.01579180359840393, "step": 430, "step_time": 29.38150699500693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.2304280698299408, "epoch": 0.431, "frac_reward_zero_std": 0.0, "grad_norm": 1.0460988283157349, "kl": 0.006227742414921522, "learning_rate": 3.1795751805908578e-06, "loss": -0.2066, "num_tokens": 1211434.0, "reward": 0.45249998569488525, "reward_std": 0.632844090461731, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.6328441500663757, "sampling/importance_sampling_ratio/max": 1.550429344177246, "sampling/importance_sampling_ratio/mean": 1.0153253078460693, "sampling/importance_sampling_ratio/min": 0.555022656917572, "sampling/sampling_logp_difference/max": 0.2690218687057495, "sampling/sampling_logp_difference/mean": 0.018388638272881508, "step": 431, "step_time": 25.620207694999408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.18220646679401398, "epoch": 0.432, "frac_reward_zero_std": 0.0, "grad_norm": 1.509669303894043, "kl": 0.005572461523115635, "learning_rate": 3.171779627890769e-06, "loss": -0.4208, "num_tokens": 1213619.0, "reward": 0.19249999523162842, "reward_std": 0.5384777784347534, "rewards/reward_func/mean": 0.19249999523162842, "rewards/reward_func/std": 0.5384778380393982, "sampling/importance_sampling_ratio/max": 1.647985577583313, "sampling/importance_sampling_ratio/mean": 1.0814270973205566, "sampling/importance_sampling_ratio/min": 0.6796345710754395, "sampling/sampling_logp_difference/max": 0.33365297317504883, "sampling/sampling_logp_difference/mean": 0.016511041671037674, "step": 432, "step_time": 25.500898238969967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 60.5, "completions/mean_terminated_length": 60.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.2133781909942627, "epoch": 0.433, "frac_reward_zero_std": 0.0, "grad_norm": 2.2491116523742676, "kl": 0.012676224112510681, "learning_rate": 3.1639770285402632e-06, "loss": -0.2224, "num_tokens": 1216289.0, "reward": 0.47749999165534973, "reward_std": 0.6036761999130249, "rewards/reward_func/mean": 0.47749999165534973, "rewards/reward_func/std": 0.6036762595176697, "sampling/importance_sampling_ratio/max": 1.739120364189148, "sampling/importance_sampling_ratio/mean": 1.4229412078857422, "sampling/importance_sampling_ratio/min": 1.1510462760925293, "sampling/sampling_logp_difference/max": 0.5488195419311523, "sampling/sampling_logp_difference/mean": 0.03019793890416622, "step": 433, "step_time": 25.49132546200417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.21396461129188538, "epoch": 0.434, "frac_reward_zero_std": 0.0, "grad_norm": 1.4936244487762451, "kl": 0.011119002476334572, "learning_rate": 3.1561674643849173e-06, "loss": 0.0824, "num_tokens": 1219147.0, "reward": 0.45249998569488525, "reward_std": 0.6322117447853088, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.6322117447853088, "sampling/importance_sampling_ratio/max": 1.5655364990234375, "sampling/importance_sampling_ratio/mean": 1.161675214767456, "sampling/importance_sampling_ratio/min": 0.6416527032852173, "sampling/sampling_logp_difference/max": 0.610335111618042, "sampling/sampling_logp_difference/mean": 0.01973019726574421, "step": 434, "step_time": 24.910961086978205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.11842536926269531, "epoch": 0.435, "frac_reward_zero_std": 0.0, "grad_norm": 0.9198082685470581, "kl": 0.015619131736457348, "learning_rate": 3.148351017343363e-06, "loss": 0.079, "num_tokens": 1222527.0, "reward": 0.23750001192092896, "reward_std": 0.508486270904541, "rewards/reward_func/mean": 0.23750001192092896, "rewards/reward_func/std": 0.5084863305091858, "sampling/importance_sampling_ratio/max": 1.3733412027359009, "sampling/importance_sampling_ratio/mean": 0.997661828994751, "sampling/importance_sampling_ratio/min": 0.7153551578521729, "sampling/sampling_logp_difference/max": 0.5008001327514648, "sampling/sampling_logp_difference/mean": 0.01729048602283001, "step": 435, "step_time": 31.660875989997294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 58.0, "completions/mean_terminated_length": 58.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.16725823283195496, "epoch": 0.436, "frac_reward_zero_std": 0.0, "grad_norm": 3.419487237930298, "kl": 0.014778782613575459, "learning_rate": 3.1405277694064306e-06, "loss": 0.5752, "num_tokens": 1225663.0, "reward": -0.07249999791383743, "reward_std": 0.07544313371181488, "rewards/reward_func/mean": -0.07249999791383743, "rewards/reward_func/std": 0.07544313371181488, "sampling/importance_sampling_ratio/max": 1.8051235675811768, "sampling/importance_sampling_ratio/mean": 1.0339508056640625, "sampling/importance_sampling_ratio/min": 0.44785377383232117, "sampling/sampling_logp_difference/max": 0.6251858472824097, "sampling/sampling_logp_difference/mean": 0.02647537738084793, "step": 436, "step_time": 40.4218522849842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.17419789731502533, "epoch": 0.437, "frac_reward_zero_std": 0.0, "grad_norm": 0.8690032958984375, "kl": 0.01883860118687153, "learning_rate": 3.1326978026362907e-06, "loss": -0.0871, "num_tokens": 1229076.0, "reward": 0.4975000023841858, "reward_std": 0.5802513957023621, "rewards/reward_func/mean": 0.4975000023841858, "rewards/reward_func/std": 0.5802513957023621, "sampling/importance_sampling_ratio/max": 1.0255980491638184, "sampling/importance_sampling_ratio/mean": 0.7022855877876282, "sampling/importance_sampling_ratio/min": 0.5092651844024658, "sampling/sampling_logp_difference/max": 0.48917698860168457, "sampling/sampling_logp_difference/mean": 0.015678727999329567, "step": 437, "step_time": 32.31232901697513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.21026286482810974, "epoch": 0.438, "frac_reward_zero_std": 1.0, "grad_norm": 0.005759460385888815, "kl": 0.007680194918066263, "learning_rate": 3.1248611991655885e-06, "loss": 0.0001, "num_tokens": 1232150.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.33417546749115, "sampling/importance_sampling_ratio/mean": 0.9033915996551514, "sampling/importance_sampling_ratio/min": 0.6698701977729797, "sampling/sampling_logp_difference/max": 0.3432505130767822, "sampling/sampling_logp_difference/mean": 0.018152322620153427, "step": 438, "step_time": 18.231341011996847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 58.0, "completions/mean_terminated_length": 58.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.14759956300258636, "epoch": 0.439, "frac_reward_zero_std": 0.0, "grad_norm": 1.2261816263198853, "kl": 0.019186044111847878, "learning_rate": 3.1170180411965854e-06, "loss": 0.3765, "num_tokens": 1235570.0, "reward": 0.23749999701976776, "reward_std": 0.5083552002906799, "rewards/reward_func/mean": 0.23749999701976776, "rewards/reward_func/std": 0.5083552002906799, "sampling/importance_sampling_ratio/max": 2.6004860401153564, "sampling/importance_sampling_ratio/mean": 1.233945369720459, "sampling/importance_sampling_ratio/min": 0.46285900473594666, "sampling/sampling_logp_difference/max": 0.39178264141082764, "sampling/sampling_logp_difference/mean": 0.01674705743789673, "step": 439, "step_time": 33.93703045899747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.16430194675922394, "epoch": 0.44, "frac_reward_zero_std": 0.0, "grad_norm": 0.9802895188331604, "kl": 0.006813048850744963, "learning_rate": 3.109168411000299e-06, "loss": 0.1898, "num_tokens": 1238187.0, "reward": 0.4399999976158142, "reward_std": 0.6263651847839355, "rewards/reward_func/mean": 0.4399999976158142, "rewards/reward_func/std": 0.6263651847839355, "sampling/importance_sampling_ratio/max": 1.204309344291687, "sampling/importance_sampling_ratio/mean": 0.871962308883667, "sampling/importance_sampling_ratio/min": 0.5646609663963318, "sampling/sampling_logp_difference/max": 0.3164086937904358, "sampling/sampling_logp_difference/mean": 0.013392717577517033, "step": 440, "step_time": 21.167131489957683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 57.5, "completions/mean_terminated_length": 57.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.20035582780838013, "epoch": 0.441, "frac_reward_zero_std": 0.0, "grad_norm": 0.7340916395187378, "kl": 0.011186124756932259, "learning_rate": 3.1013123909156347e-06, "loss": -0.0845, "num_tokens": 1240683.0, "reward": 0.7225000262260437, "reward_std": 0.5286697149276733, "rewards/reward_func/mean": 0.7225000262260437, "rewards/reward_func/std": 0.5286697149276733, "sampling/importance_sampling_ratio/max": 0.9713583588600159, "sampling/importance_sampling_ratio/mean": 0.7849164009094238, "sampling/importance_sampling_ratio/min": 0.5613242387771606, "sampling/sampling_logp_difference/max": 0.26843857765197754, "sampling/sampling_logp_difference/mean": 0.01341143436729908, "step": 441, "step_time": 16.943980013020337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.16980424523353577, "epoch": 0.442, "frac_reward_zero_std": 0.0, "grad_norm": 0.5944642424583435, "kl": 0.014027859084308147, "learning_rate": 3.093450063348525e-06, "loss": 0.134, "num_tokens": 1243497.0, "reward": 0.7275000214576721, "reward_std": 0.5383539795875549, "rewards/reward_func/mean": 0.7275000214576721, "rewards/reward_func/std": 0.5383539795875549, "sampling/importance_sampling_ratio/max": 0.6911371946334839, "sampling/importance_sampling_ratio/mean": 0.5791789889335632, "sampling/importance_sampling_ratio/min": 0.46557480096817017, "sampling/sampling_logp_difference/max": 0.6771659851074219, "sampling/sampling_logp_difference/mean": 0.024288296699523926, "step": 442, "step_time": 25.64359609904932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.16303375363349915, "epoch": 0.443, "frac_reward_zero_std": 0.0, "grad_norm": 0.7775272727012634, "kl": 0.012881884351372719, "learning_rate": 3.085581510771067e-06, "loss": -0.1205, "num_tokens": 1245746.0, "reward": 0.45499998331069946, "reward_std": 0.6293647885322571, "rewards/reward_func/mean": 0.45499998331069946, "rewards/reward_func/std": 0.6293647885322571, "sampling/importance_sampling_ratio/max": 0.9346107244491577, "sampling/importance_sampling_ratio/mean": 0.7319236397743225, "sampling/importance_sampling_ratio/min": 0.48719626665115356, "sampling/sampling_logp_difference/max": 0.6282789707183838, "sampling/sampling_logp_difference/mean": 0.018372397869825363, "step": 443, "step_time": 18.135229509032797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.2297874242067337, "epoch": 0.444, "frac_reward_zero_std": 0.0, "grad_norm": 0.817965567111969, "kl": 0.014152202755212784, "learning_rate": 3.0777068157206535e-06, "loss": 0.1093, "num_tokens": 1249061.0, "reward": 0.4925000071525574, "reward_std": 0.5860247611999512, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5860247015953064, "sampling/importance_sampling_ratio/max": 0.8441367149353027, "sampling/importance_sampling_ratio/mean": 0.47796887159347534, "sampling/importance_sampling_ratio/min": 0.17214736342430115, "sampling/sampling_logp_difference/max": 0.7815263271331787, "sampling/sampling_logp_difference/mean": 0.038137998431921005, "step": 444, "step_time": 28.655308391025756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.22885003685951233, "epoch": 0.445, "frac_reward_zero_std": 0.0, "grad_norm": 1.3511842489242554, "kl": 0.00772506557404995, "learning_rate": 3.0698260607991094e-06, "loss": 0.0325, "num_tokens": 1251422.0, "reward": 0.20000000298023224, "reward_std": 0.533604085445404, "rewards/reward_func/mean": 0.20000000298023224, "rewards/reward_func/std": 0.5336041450500488, "sampling/importance_sampling_ratio/max": 0.9810093641281128, "sampling/importance_sampling_ratio/mean": 0.8369464874267578, "sampling/importance_sampling_ratio/min": 0.7006155252456665, "sampling/sampling_logp_difference/max": 0.6134848594665527, "sampling/sampling_logp_difference/mean": 0.026290664449334145, "step": 445, "step_time": 21.8514681099914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.18265238404273987, "epoch": 0.446, "frac_reward_zero_std": 0.0, "grad_norm": 1.344211220741272, "kl": 0.021054904907941818, "learning_rate": 3.061939328671824e-06, "loss": 0.076, "num_tokens": 1254423.0, "reward": 0.4650000035762787, "reward_std": 0.5899434685707092, "rewards/reward_func/mean": 0.4650000035762787, "rewards/reward_func/std": 0.5899434685707092, "sampling/importance_sampling_ratio/max": 1.8497220277786255, "sampling/importance_sampling_ratio/mean": 0.9540822505950928, "sampling/importance_sampling_ratio/min": 0.4057038128376007, "sampling/sampling_logp_difference/max": 0.5755867958068848, "sampling/sampling_logp_difference/mean": 0.023195475339889526, "step": 446, "step_time": 29.458026565029286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.15342113375663757, "epoch": 0.447, "frac_reward_zero_std": 0.0, "grad_norm": 1.1540383100509644, "kl": 0.010543609969317913, "learning_rate": 3.054046702066886e-06, "loss": -0.1407, "num_tokens": 1257279.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 1.2020467519760132, "sampling/importance_sampling_ratio/mean": 1.102899193763733, "sampling/importance_sampling_ratio/min": 0.9906656742095947, "sampling/sampling_logp_difference/max": 0.2827274799346924, "sampling/sampling_logp_difference/mean": 0.013127765618264675, "step": 447, "step_time": 22.506202692980878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 63.25, "completions/mean_terminated_length": 63.25, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.19722220301628113, "epoch": 0.448, "frac_reward_zero_std": 0.0, "grad_norm": 1.220840573310852, "kl": 0.003073799656704068, "learning_rate": 3.0461482637742133e-06, "loss": 0.0518, "num_tokens": 1259906.0, "reward": 0.19750000536441803, "reward_std": 0.5248730182647705, "rewards/reward_func/mean": 0.19750000536441803, "rewards/reward_func/std": 0.5248730182647705, "sampling/importance_sampling_ratio/max": 0.9551300406455994, "sampling/importance_sampling_ratio/mean": 0.8550416231155396, "sampling/importance_sampling_ratio/min": 0.7603394389152527, "sampling/sampling_logp_difference/max": 0.4102543592453003, "sampling/sampling_logp_difference/mean": 0.014891261234879494, "step": 448, "step_time": 36.73868080397369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.17630815505981445, "epoch": 0.449, "frac_reward_zero_std": 0.0, "grad_norm": 1.856231927871704, "kl": 0.011117054149508476, "learning_rate": 3.0382440966446876e-06, "loss": 0.037, "num_tokens": 1262736.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5773502588272095, "sampling/importance_sampling_ratio/max": 2.0507450103759766, "sampling/importance_sampling_ratio/mean": 1.4145398139953613, "sampling/importance_sampling_ratio/min": 0.7187467813491821, "sampling/sampling_logp_difference/max": 0.4698638916015625, "sampling/sampling_logp_difference/mean": 0.01780855655670166, "step": 449, "step_time": 22.830494567984715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.15421532094478607, "epoch": 0.45, "frac_reward_zero_std": 0.0, "grad_norm": 1.1135753393173218, "kl": 0.009813894517719746, "learning_rate": 3.0303342835892804e-06, "loss": -0.32, "num_tokens": 1265945.0, "reward": 0.9975000023841858, "reward_std": 0.004999995231628418, "rewards/reward_func/mean": 0.9975000023841858, "rewards/reward_func/std": 0.004999995231628418, "sampling/importance_sampling_ratio/max": 2.2968966960906982, "sampling/importance_sampling_ratio/mean": 1.2665021419525146, "sampling/importance_sampling_ratio/min": 0.6567671298980713, "sampling/sampling_logp_difference/max": 0.36028027534484863, "sampling/sampling_logp_difference/mean": 0.015600944869220257, "step": 450, "step_time": 19.50354215601692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.21015942096710205, "epoch": 0.451, "frac_reward_zero_std": 0.0, "grad_norm": 2.1419570446014404, "kl": 0.016089485958218575, "learning_rate": 3.0224189075781886e-06, "loss": -0.0507, "num_tokens": 1269379.0, "reward": 0.4925000071525574, "reward_std": 0.5803088545799255, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5803087949752808, "sampling/importance_sampling_ratio/max": 1.4987167119979858, "sampling/importance_sampling_ratio/mean": 1.0586427450180054, "sampling/importance_sampling_ratio/min": 0.43939098715782166, "sampling/sampling_logp_difference/max": 0.61039137840271, "sampling/sampling_logp_difference/mean": 0.028296321630477905, "step": 451, "step_time": 39.380030908971094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.1875135898590088, "epoch": 0.452, "frac_reward_zero_std": 0.0, "grad_norm": 1.0662803649902344, "kl": 0.011223956942558289, "learning_rate": 3.014498051639959e-06, "loss": -0.0108, "num_tokens": 1272069.0, "reward": 0.18000000715255737, "reward_std": 0.5505754351615906, "rewards/reward_func/mean": 0.18000000715255737, "rewards/reward_func/std": 0.5505754947662354, "sampling/importance_sampling_ratio/max": 0.8112066388130188, "sampling/importance_sampling_ratio/mean": 0.7264506220817566, "sampling/importance_sampling_ratio/min": 0.6200752258300781, "sampling/sampling_logp_difference/max": 0.35492801666259766, "sampling/sampling_logp_difference/mean": 0.018855206668376923, "step": 452, "step_time": 32.283047639997676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.16499558091163635, "epoch": 0.453, "frac_reward_zero_std": 0.0, "grad_norm": 0.7900223135948181, "kl": 0.03650446608662605, "learning_rate": 3.006571798860626e-06, "loss": -0.1684, "num_tokens": 1274944.0, "reward": 0.7300000190734863, "reward_std": 0.49362605810165405, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.49362605810165405, "sampling/importance_sampling_ratio/max": 1.2312389612197876, "sampling/importance_sampling_ratio/mean": 0.77052903175354, "sampling/importance_sampling_ratio/min": 0.43709203600883484, "sampling/sampling_logp_difference/max": 0.5031673908233643, "sampling/sampling_logp_difference/mean": 0.023659298196434975, "step": 453, "step_time": 22.37824486102909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.15133777260780334, "epoch": 0.454, "frac_reward_zero_std": 0.0, "grad_norm": 0.7295528650283813, "kl": 0.012507104314863682, "learning_rate": 2.9986402323828274e-06, "loss": 0.0378, "num_tokens": 1277684.0, "reward": 0.7225000262260437, "reward_std": 0.5221988558769226, "rewards/reward_func/mean": 0.7225000262260437, "rewards/reward_func/std": 0.5221988558769226, "sampling/importance_sampling_ratio/max": 0.9769784212112427, "sampling/importance_sampling_ratio/mean": 0.6797975897789001, "sampling/importance_sampling_ratio/min": 0.47467222809791565, "sampling/sampling_logp_difference/max": 0.4880702495574951, "sampling/sampling_logp_difference/mean": 0.01620212197303772, "step": 454, "step_time": 28.003292667039204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.16707800328731537, "epoch": 0.455, "frac_reward_zero_std": 0.0, "grad_norm": 1.156707525253296, "kl": 0.012990863993763924, "learning_rate": 2.9907034354049443e-06, "loss": -0.0087, "num_tokens": 1280699.0, "reward": 0.7425000071525574, "reward_std": 0.5017552375793457, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.5017552375793457, "sampling/importance_sampling_ratio/max": 0.7824573516845703, "sampling/importance_sampling_ratio/mean": 0.6438816785812378, "sampling/importance_sampling_ratio/min": 0.5770710706710815, "sampling/sampling_logp_difference/max": 0.6045162677764893, "sampling/sampling_logp_difference/mean": 0.017497852444648743, "step": 455, "step_time": 17.67139838502044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 58.5, "completions/mean_terminated_length": 58.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.22031190991401672, "epoch": 0.456, "frac_reward_zero_std": 0.0, "grad_norm": 0.8678343296051025, "kl": 0.019618511199951172, "learning_rate": 2.9827614911802205e-06, "loss": -0.0717, "num_tokens": 1284036.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 0.7323600649833679, "sampling/importance_sampling_ratio/mean": 0.5718560218811035, "sampling/importance_sampling_ratio/min": 0.4663974344730377, "sampling/sampling_logp_difference/max": 0.40730226039886475, "sampling/sampling_logp_difference/mean": 0.022960741072893143, "step": 456, "step_time": 28.382598477997817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1649620682001114, "epoch": 0.457, "frac_reward_zero_std": 0.0, "grad_norm": 1.9907453060150146, "kl": 0.004840234760195017, "learning_rate": 2.9748144830158925e-06, "loss": 0.048, "num_tokens": 1286990.0, "reward": 0.7124999761581421, "reward_std": 0.54217928647995, "rewards/reward_func/mean": 0.7124999761581421, "rewards/reward_func/std": 0.54217928647995, "sampling/importance_sampling_ratio/max": 1.0546389818191528, "sampling/importance_sampling_ratio/mean": 0.8202236294746399, "sampling/importance_sampling_ratio/min": 0.5169450044631958, "sampling/sampling_logp_difference/max": 0.35714370012283325, "sampling/sampling_logp_difference/mean": 0.018770365044474602, "step": 457, "step_time": 21.76817899598973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 59.25, "completions/mean_terminated_length": 59.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.19829636812210083, "epoch": 0.458, "frac_reward_zero_std": 0.0, "grad_norm": 1.185340404510498, "kl": 0.00844861101359129, "learning_rate": 2.966862494272316e-06, "loss": -0.1115, "num_tokens": 1289975.0, "reward": 0.2175000011920929, "reward_std": 0.522262692451477, "rewards/reward_func/mean": 0.2175000011920929, "rewards/reward_func/std": 0.5222627520561218, "sampling/importance_sampling_ratio/max": 1.465867042541504, "sampling/importance_sampling_ratio/mean": 1.0452473163604736, "sampling/importance_sampling_ratio/min": 0.33251479268074036, "sampling/sampling_logp_difference/max": 0.5924015045166016, "sampling/sampling_logp_difference/mean": 0.022284023463726044, "step": 458, "step_time": 35.12598103302298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.20876465737819672, "epoch": 0.459, "frac_reward_zero_std": 0.0, "grad_norm": 1.9874993562698364, "kl": 0.008869977667927742, "learning_rate": 2.9589056083620902e-06, "loss": -0.1618, "num_tokens": 1292795.0, "reward": 0.22750000655651093, "reward_std": 0.5162283182144165, "rewards/reward_func/mean": 0.22750000655651093, "rewards/reward_func/std": 0.5162283182144165, "sampling/importance_sampling_ratio/max": 2.632225513458252, "sampling/importance_sampling_ratio/mean": 1.3114848136901855, "sampling/importance_sampling_ratio/min": 0.302666038274765, "sampling/sampling_logp_difference/max": 0.5814425945281982, "sampling/sampling_logp_difference/mean": 0.02165631391108036, "step": 459, "step_time": 29.1406745010172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.15365423262119293, "epoch": 0.46, "frac_reward_zero_std": 0.0, "grad_norm": 1.26414155960083, "kl": 0.011394418776035309, "learning_rate": 2.9509439087491837e-06, "loss": 0.2662, "num_tokens": 1295549.0, "reward": 0.48750001192092896, "reward_std": 0.5862522125244141, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5862522125244141, "sampling/importance_sampling_ratio/max": 1.703123688697815, "sampling/importance_sampling_ratio/mean": 0.9854624271392822, "sampling/importance_sampling_ratio/min": 0.4295061230659485, "sampling/sampling_logp_difference/max": 0.33905029296875, "sampling/sampling_logp_difference/mean": 0.016382405534386635, "step": 460, "step_time": 26.815834667999297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.19121307134628296, "epoch": 0.461, "frac_reward_zero_std": 0.0, "grad_norm": 0.6136273145675659, "kl": 0.010095298290252686, "learning_rate": 2.9429774789480576e-06, "loss": 0.0958, "num_tokens": 1297767.0, "reward": 0.7150000333786011, "reward_std": 0.5169461965560913, "rewards/reward_func/mean": 0.7150000333786011, "rewards/reward_func/std": 0.5169461965560913, "sampling/importance_sampling_ratio/max": 1.1227145195007324, "sampling/importance_sampling_ratio/mean": 0.8902336359024048, "sampling/importance_sampling_ratio/min": 0.5804774761199951, "sampling/sampling_logp_difference/max": 0.6244083642959595, "sampling/sampling_logp_difference/mean": 0.016019826754927635, "step": 461, "step_time": 17.382083212956786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.18492881953716278, "epoch": 0.462, "frac_reward_zero_std": 0.0, "grad_norm": 1.3573193550109863, "kl": 0.008144048042595387, "learning_rate": 2.93500640252279e-06, "loss": 0.0098, "num_tokens": 1300579.0, "reward": 0.20749999582767487, "reward_std": 0.5019544959068298, "rewards/reward_func/mean": 0.20749999582767487, "rewards/reward_func/std": 0.5019544959068298, "sampling/importance_sampling_ratio/max": 2.1250946521759033, "sampling/importance_sampling_ratio/mean": 1.2734090089797974, "sampling/importance_sampling_ratio/min": 0.624800443649292, "sampling/sampling_logp_difference/max": 0.4196096658706665, "sampling/sampling_logp_difference/mean": 0.01873563416302204, "step": 462, "step_time": 31.724418731988408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.2044009417295456, "epoch": 0.463, "frac_reward_zero_std": 0.0, "grad_norm": 1.0971373319625854, "kl": 0.007162611465901136, "learning_rate": 2.927030763086201e-06, "loss": 0.0145, "num_tokens": 1303153.0, "reward": 0.48500001430511475, "reward_std": 0.5951750874519348, "rewards/reward_func/mean": 0.48500001430511475, "rewards/reward_func/std": 0.5951750874519348, "sampling/importance_sampling_ratio/max": 1.0362753868103027, "sampling/importance_sampling_ratio/mean": 0.8342453241348267, "sampling/importance_sampling_ratio/min": 0.6081456542015076, "sampling/sampling_logp_difference/max": 0.4436979293823242, "sampling/sampling_logp_difference/mean": 0.025296861305832863, "step": 463, "step_time": 25.053177120978944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.16809864342212677, "epoch": 0.464, "frac_reward_zero_std": 0.0, "grad_norm": 0.7943792939186096, "kl": 0.011327192187309265, "learning_rate": 2.9190506442989753e-06, "loss": -0.1586, "num_tokens": 1305950.0, "reward": 0.2475000023841858, "reward_std": 0.5016888380050659, "rewards/reward_func/mean": 0.2475000023841858, "rewards/reward_func/std": 0.5016888380050659, "sampling/importance_sampling_ratio/max": 1.1387665271759033, "sampling/importance_sampling_ratio/mean": 0.7272910475730896, "sampling/importance_sampling_ratio/min": 0.20925834774971008, "sampling/sampling_logp_difference/max": 0.6859614849090576, "sampling/sampling_logp_difference/mean": 0.017176760360598564, "step": 464, "step_time": 25.061609486001544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.1855546087026596, "epoch": 0.465, "frac_reward_zero_std": 0.0, "grad_norm": 2.3222036361694336, "kl": 0.013585694134235382, "learning_rate": 2.9110661298687824e-06, "loss": 0.1546, "num_tokens": 1308608.0, "reward": 0.4325000047683716, "reward_std": 0.6563218235969543, "rewards/reward_func/mean": 0.4325000047683716, "rewards/reward_func/std": 0.6563218235969543, "sampling/importance_sampling_ratio/max": 2.3550212383270264, "sampling/importance_sampling_ratio/mean": 1.5367931127548218, "sampling/importance_sampling_ratio/min": 0.5216749906539917, "sampling/sampling_logp_difference/max": 0.3375256061553955, "sampling/sampling_logp_difference/mean": 0.020954973995685577, "step": 465, "step_time": 28.397672267979942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.18773867189884186, "epoch": 0.466, "frac_reward_zero_std": 0.0, "grad_norm": 0.9811770915985107, "kl": 0.006499780807644129, "learning_rate": 2.9030773035493997e-06, "loss": -0.1522, "num_tokens": 1310922.0, "reward": 0.7274999618530273, "reward_std": 0.5450000166893005, "rewards/reward_func/mean": 0.7274999618530273, "rewards/reward_func/std": 0.5450000166893005, "sampling/importance_sampling_ratio/max": 1.4225895404815674, "sampling/importance_sampling_ratio/mean": 1.0681791305541992, "sampling/importance_sampling_ratio/min": 0.6892783045768738, "sampling/sampling_logp_difference/max": 0.3949413299560547, "sampling/sampling_logp_difference/mean": 0.017551708966493607, "step": 466, "step_time": 16.613432837999426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.15255199372768402, "epoch": 0.467, "frac_reward_zero_std": 0.0, "grad_norm": 1.2089929580688477, "kl": 0.011558917351067066, "learning_rate": 2.8950842491398358e-06, "loss": -0.1123, "num_tokens": 1313900.0, "reward": 0.47999998927116394, "reward_std": 0.5953710675239563, "rewards/reward_func/mean": 0.47999998927116394, "rewards/reward_func/std": 0.5953710079193115, "sampling/importance_sampling_ratio/max": 1.2242178916931152, "sampling/importance_sampling_ratio/mean": 1.0623831748962402, "sampling/importance_sampling_ratio/min": 0.8670327663421631, "sampling/sampling_logp_difference/max": 0.34816741943359375, "sampling/sampling_logp_difference/mean": 0.017762713134288788, "step": 467, "step_time": 30.275761183002032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.1784668266773224, "epoch": 0.468, "frac_reward_zero_std": 0.0, "grad_norm": 1.0684055089950562, "kl": 0.015394800342619419, "learning_rate": 2.8870870504834497e-06, "loss": -0.0954, "num_tokens": 1316399.0, "reward": 0.47999998927116394, "reward_std": 0.6004442572593689, "rewards/reward_func/mean": 0.47999998927116394, "rewards/reward_func/std": 0.6004442572593689, "sampling/importance_sampling_ratio/max": 1.1755908727645874, "sampling/importance_sampling_ratio/mean": 0.8824061155319214, "sampling/importance_sampling_ratio/min": 0.6028077006340027, "sampling/sampling_logp_difference/max": 0.3377223014831543, "sampling/sampling_logp_difference/mean": 0.017298569902777672, "step": 468, "step_time": 20.43618850701023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.17152951657772064, "epoch": 0.469, "frac_reward_zero_std": 0.0, "grad_norm": 0.8886656165122986, "kl": 0.00914768222719431, "learning_rate": 2.87908579146707e-06, "loss": -0.1819, "num_tokens": 1319130.0, "reward": -0.07750000059604645, "reward_std": 0.03095695748925209, "rewards/reward_func/mean": -0.07750000059604645, "rewards/reward_func/std": 0.03095695748925209, "sampling/importance_sampling_ratio/max": 0.9091010093688965, "sampling/importance_sampling_ratio/mean": 0.7135780453681946, "sampling/importance_sampling_ratio/min": 0.23703016340732574, "sampling/sampling_logp_difference/max": 0.7292177677154541, "sampling/sampling_logp_difference/mean": 0.02143814228475094, "step": 469, "step_time": 32.32495648501208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.19230961799621582, "epoch": 0.47, "frac_reward_zero_std": 0.0, "grad_norm": 1.2652502059936523, "kl": 0.013783320784568787, "learning_rate": 2.8710805560201184e-06, "loss": -0.1797, "num_tokens": 1322323.0, "reward": 0.4950000047683716, "reward_std": 0.5831237435340881, "rewards/reward_func/mean": 0.4950000047683716, "rewards/reward_func/std": 0.5831238031387329, "sampling/importance_sampling_ratio/max": 1.113662838935852, "sampling/importance_sampling_ratio/mean": 0.8443402051925659, "sampling/importance_sampling_ratio/min": 0.37887251377105713, "sampling/sampling_logp_difference/max": 0.5682388544082642, "sampling/sampling_logp_difference/mean": 0.022412581369280815, "step": 470, "step_time": 27.587860508006997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.16694240272045135, "epoch": 0.471, "frac_reward_zero_std": 0.0, "grad_norm": 1.0029407739639282, "kl": 0.009018545970320702, "learning_rate": 2.8630714281137263e-06, "loss": -0.156, "num_tokens": 1325798.0, "reward": 0.4925000071525574, "reward_std": 0.5860247015953064, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5860247015953064, "sampling/importance_sampling_ratio/max": 1.3442041873931885, "sampling/importance_sampling_ratio/mean": 0.9944065809249878, "sampling/importance_sampling_ratio/min": 0.686233401298523, "sampling/sampling_logp_difference/max": 0.4784134030342102, "sampling/sampling_logp_difference/mean": 0.015540587715804577, "step": 471, "step_time": 28.120615650026593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.20289495587348938, "epoch": 0.472, "frac_reward_zero_std": 0.0, "grad_norm": 2.520662784576416, "kl": 0.027400033548474312, "learning_rate": 2.8550584917598558e-06, "loss": 0.2446, "num_tokens": 1329058.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 1.524487018585205, "sampling/importance_sampling_ratio/mean": 1.0076969861984253, "sampling/importance_sampling_ratio/min": 0.7219834327697754, "sampling/sampling_logp_difference/max": 0.5227086544036865, "sampling/sampling_logp_difference/mean": 0.021278226748108864, "step": 472, "step_time": 24.774295494018588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.17204667627811432, "epoch": 0.473, "frac_reward_zero_std": 0.0, "grad_norm": 0.8414384722709656, "kl": 0.007265313528478146, "learning_rate": 2.8470418310104175e-06, "loss": 0.0537, "num_tokens": 1331919.0, "reward": 0.7425000071525574, "reward_std": 0.5149999856948853, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.5149999856948853, "sampling/importance_sampling_ratio/max": 1.3663551807403564, "sampling/importance_sampling_ratio/mean": 0.9165531992912292, "sampling/importance_sampling_ratio/min": 0.46905970573425293, "sampling/sampling_logp_difference/max": 0.29321417212486267, "sampling/sampling_logp_difference/mean": 0.01629566214978695, "step": 473, "step_time": 24.169442703016102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.15804213285446167, "epoch": 0.474, "frac_reward_zero_std": 0.0, "grad_norm": 2.2006685733795166, "kl": 0.007847634144127369, "learning_rate": 2.839021529956388e-06, "loss": -0.3355, "num_tokens": 1335061.0, "reward": 0.48000001907348633, "reward_std": 0.6006662249565125, "rewards/reward_func/mean": 0.48000001907348633, "rewards/reward_func/std": 0.600666344165802, "sampling/importance_sampling_ratio/max": 2.285823106765747, "sampling/importance_sampling_ratio/mean": 1.3572304248809814, "sampling/importance_sampling_ratio/min": 0.7469886541366577, "sampling/sampling_logp_difference/max": 0.42196738719940186, "sampling/sampling_logp_difference/mean": 0.015461845323443413, "step": 474, "step_time": 28.090530368033797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.1962510198354721, "epoch": 0.475, "frac_reward_zero_std": 0.0, "grad_norm": 1.3902899026870728, "kl": 0.009486009366810322, "learning_rate": 2.8309976727269335e-06, "loss": 0.3383, "num_tokens": 1337871.0, "reward": 0.48250001668930054, "reward_std": 0.5979060530662537, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5979060530662537, "sampling/importance_sampling_ratio/max": 1.3710312843322754, "sampling/importance_sampling_ratio/mean": 1.031134843826294, "sampling/importance_sampling_ratio/min": 0.5034615397453308, "sampling/sampling_logp_difference/max": 0.4643425941467285, "sampling/sampling_logp_difference/mean": 0.021930821239948273, "step": 475, "step_time": 23.49796404998051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.15679052472114563, "epoch": 0.476, "frac_reward_zero_std": 0.0, "grad_norm": 0.9682862162590027, "kl": 0.036426376551389694, "learning_rate": 2.8229703434885165e-06, "loss": 0.1179, "num_tokens": 1340719.0, "reward": 0.20250000059604645, "reward_std": 0.5318129062652588, "rewards/reward_func/mean": 0.20250000059604645, "rewards/reward_func/std": 0.5318129062652588, "sampling/importance_sampling_ratio/max": 2.6286509037017822, "sampling/importance_sampling_ratio/mean": 1.392582654953003, "sampling/importance_sampling_ratio/min": 0.7407203912734985, "sampling/sampling_logp_difference/max": 0.6738049983978271, "sampling/sampling_logp_difference/mean": 0.022668922320008278, "step": 476, "step_time": 28.929097748012282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.21174106001853943, "epoch": 0.477, "frac_reward_zero_std": 0.0, "grad_norm": 0.9093917012214661, "kl": 0.013331391848623753, "learning_rate": 2.814939626444023e-06, "loss": 0.1749, "num_tokens": 1343237.0, "reward": 0.9950000047683716, "reward_std": 0.005773497279733419, "rewards/reward_func/mean": 0.9950000047683716, "rewards/reward_func/std": 0.005773497279733419, "sampling/importance_sampling_ratio/max": 0.8154817223548889, "sampling/importance_sampling_ratio/mean": 0.527961015701294, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.3775796890258789, "sampling/sampling_logp_difference/mean": 0.02218007668852806, "step": 477, "step_time": 17.42173261701828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.16854561865329742, "epoch": 0.478, "frac_reward_zero_std": 0.0, "grad_norm": 0.96002197265625, "kl": 0.008641577325761318, "learning_rate": 2.8069056058318754e-06, "loss": 0.0717, "num_tokens": 1346257.0, "reward": 0.45499998331069946, "reward_std": 0.630634605884552, "rewards/reward_func/mean": 0.45499998331069946, "rewards/reward_func/std": 0.630634605884552, "sampling/importance_sampling_ratio/max": 1.2033108472824097, "sampling/importance_sampling_ratio/mean": 0.8131914138793945, "sampling/importance_sampling_ratio/min": 0.6072388887405396, "sampling/sampling_logp_difference/max": 0.5217990875244141, "sampling/sampling_logp_difference/mean": 0.017629005014896393, "step": 478, "step_time": 29.908944410039112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.18854458630084991, "epoch": 0.479, "frac_reward_zero_std": 0.0, "grad_norm": 1.1713976860046387, "kl": 0.015499746426939964, "learning_rate": 2.7988683659251475e-06, "loss": -0.0311, "num_tokens": 1349192.0, "reward": 0.4925000071525574, "reward_std": 0.574478030204773, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.574478030204773, "sampling/importance_sampling_ratio/max": 1.1547468900680542, "sampling/importance_sampling_ratio/mean": 0.7230081558227539, "sampling/importance_sampling_ratio/min": 0.36295026540756226, "sampling/sampling_logp_difference/max": 0.5829365253448486, "sampling/sampling_logp_difference/mean": 0.02886391617357731, "step": 479, "step_time": 29.55071289499756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.19413813948631287, "epoch": 0.48, "frac_reward_zero_std": 0.0, "grad_norm": 1.0189701318740845, "kl": 0.014416433870792389, "learning_rate": 2.7908279910306834e-06, "loss": -0.1441, "num_tokens": 1352552.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 1.3797539472579956, "sampling/importance_sampling_ratio/mean": 1.0011581182479858, "sampling/importance_sampling_ratio/min": 0.6776071786880493, "sampling/sampling_logp_difference/max": 0.30836057662963867, "sampling/sampling_logp_difference/mean": 0.017654133960604668, "step": 480, "step_time": 19.862246336007956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.17528067529201508, "epoch": 0.481, "frac_reward_zero_std": 0.0, "grad_norm": 0.9865776896476746, "kl": 0.009348684921860695, "learning_rate": 2.7827845654882112e-06, "loss": 0.0686, "num_tokens": 1355625.0, "reward": 0.24000000953674316, "reward_std": 0.5066885948181152, "rewards/reward_func/mean": 0.24000000953674316, "rewards/reward_func/std": 0.5066885948181152, "sampling/importance_sampling_ratio/max": 1.2727447748184204, "sampling/importance_sampling_ratio/mean": 0.9886298179626465, "sampling/importance_sampling_ratio/min": 0.7678318619728088, "sampling/sampling_logp_difference/max": 0.2513253688812256, "sampling/sampling_logp_difference/mean": 0.015953661873936653, "step": 481, "step_time": 31.546946729009505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.20458126068115234, "epoch": 0.482, "frac_reward_zero_std": 0.0, "grad_norm": 0.9443128108978271, "kl": 0.004826865158975124, "learning_rate": 2.7747381736694573e-06, "loss": 0.0192, "num_tokens": 1357926.0, "reward": 0.4775000214576721, "reward_std": 0.5977945327758789, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.5977945923805237, "sampling/importance_sampling_ratio/max": 1.140128493309021, "sampling/importance_sampling_ratio/mean": 0.9035488963127136, "sampling/importance_sampling_ratio/min": 0.6631909012794495, "sampling/sampling_logp_difference/max": 0.2973288297653198, "sampling/sampling_logp_difference/mean": 0.017087938264012337, "step": 482, "step_time": 22.129020470019896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.17821061611175537, "epoch": 0.483, "frac_reward_zero_std": 0.0, "grad_norm": 1.3846080303192139, "kl": 0.009810151532292366, "learning_rate": 2.766688899977266e-06, "loss": -0.2358, "num_tokens": 1360883.0, "reward": 0.48250001668930054, "reward_std": 0.5920233726501465, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5920233726501465, "sampling/importance_sampling_ratio/max": 2.118821382522583, "sampling/importance_sampling_ratio/mean": 1.10526704788208, "sampling/importance_sampling_ratio/min": 0.5762240886688232, "sampling/sampling_logp_difference/max": 0.32975125312805176, "sampling/sampling_logp_difference/mean": 0.0186659824103117, "step": 483, "step_time": 29.97070738201728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.20785591006278992, "epoch": 0.484, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109732627868652, "kl": 0.005915760528296232, "learning_rate": 2.7586368288447094e-06, "loss": -0.1366, "num_tokens": 1363488.0, "reward": 0.45749998092651367, "reward_std": 0.6275016665458679, "rewards/reward_func/mean": 0.45749998092651367, "rewards/reward_func/std": 0.6275016069412231, "sampling/importance_sampling_ratio/max": 1.455223798751831, "sampling/importance_sampling_ratio/mean": 1.0375548601150513, "sampling/importance_sampling_ratio/min": 0.7802204489707947, "sampling/sampling_logp_difference/max": 0.31172311305999756, "sampling/sampling_logp_difference/mean": 0.013572500087320805, "step": 484, "step_time": 16.008105660032015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.20897801220417023, "epoch": 0.485, "frac_reward_zero_std": 0.0, "grad_norm": 1.9368159770965576, "kl": 0.007568884640932083, "learning_rate": 2.750582044734203e-06, "loss": 0.354, "num_tokens": 1365946.0, "reward": 0.4624999761581421, "reward_std": 0.5977945327758789, "rewards/reward_func/mean": 0.4624999761581421, "rewards/reward_func/std": 0.5977945327758789, "sampling/importance_sampling_ratio/max": 1.999666452407837, "sampling/importance_sampling_ratio/mean": 1.2565072774887085, "sampling/importance_sampling_ratio/min": 0.8293865919113159, "sampling/sampling_logp_difference/max": 0.35970449447631836, "sampling/sampling_logp_difference/mean": 0.019272392615675926, "step": 485, "step_time": 26.663869270996656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.18504998087882996, "epoch": 0.486, "frac_reward_zero_std": 0.0, "grad_norm": 1.1646453142166138, "kl": 0.010964952409267426, "learning_rate": 2.7425246321366205e-06, "loss": 0.1056, "num_tokens": 1369018.0, "reward": 0.22750000655651093, "reward_std": 0.5150647163391113, "rewards/reward_func/mean": 0.22750000655651093, "rewards/reward_func/std": 0.5150647163391113, "sampling/importance_sampling_ratio/max": 1.2715320587158203, "sampling/importance_sampling_ratio/mean": 1.0238198041915894, "sampling/importance_sampling_ratio/min": 0.802073061466217, "sampling/sampling_logp_difference/max": 0.32739925384521484, "sampling/sampling_logp_difference/mean": 0.019910767674446106, "step": 486, "step_time": 25.798881227034144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.15390801429748535, "epoch": 0.487, "frac_reward_zero_std": 0.0, "grad_norm": 1.5774110555648804, "kl": 0.007523780222982168, "learning_rate": 2.7344646755704078e-06, "loss": -0.2303, "num_tokens": 1371671.0, "reward": -0.07750000059604645, "reward_std": 0.04349329695105553, "rewards/reward_func/mean": -0.07750000059604645, "rewards/reward_func/std": 0.04349329695105553, "sampling/importance_sampling_ratio/max": 1.8134368658065796, "sampling/importance_sampling_ratio/mean": 1.2338995933532715, "sampling/importance_sampling_ratio/min": 0.818612277507782, "sampling/sampling_logp_difference/max": 0.4050261974334717, "sampling/sampling_logp_difference/mean": 0.013379405252635479, "step": 487, "step_time": 36.71064544300316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.16176699101924896, "epoch": 0.488, "frac_reward_zero_std": 0.0, "grad_norm": 0.8392361998558044, "kl": 0.011086840182542801, "learning_rate": 2.726402259580695e-06, "loss": 0.0925, "num_tokens": 1374639.0, "reward": 0.4699999988079071, "reward_std": 0.6133514046669006, "rewards/reward_func/mean": 0.4699999988079071, "rewards/reward_func/std": 0.6133514046669006, "sampling/importance_sampling_ratio/max": 0.911862313747406, "sampling/importance_sampling_ratio/mean": 0.7351292371749878, "sampling/importance_sampling_ratio/min": 0.4567738473415375, "sampling/sampling_logp_difference/max": 0.8988639116287231, "sampling/sampling_logp_difference/mean": 0.018368083983659744, "step": 488, "step_time": 21.997132979042362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.17942406237125397, "epoch": 0.489, "frac_reward_zero_std": 0.0, "grad_norm": 0.9050419926643372, "kl": 0.012715058401226997, "learning_rate": 2.71833746873841e-06, "loss": -0.1198, "num_tokens": 1377126.0, "reward": 0.7274999618530273, "reward_std": 0.5450000166893005, "rewards/reward_func/mean": 0.7274999618530273, "rewards/reward_func/std": 0.5450000166893005, "sampling/importance_sampling_ratio/max": 1.3942056894302368, "sampling/importance_sampling_ratio/mean": 1.1211340427398682, "sampling/importance_sampling_ratio/min": 0.8798677921295166, "sampling/sampling_logp_difference/max": 0.2652394771575928, "sampling/sampling_logp_difference/mean": 0.012573324143886566, "step": 489, "step_time": 22.15838368597906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 60.5, "completions/mean_terminated_length": 60.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.18960313498973846, "epoch": 0.49, "frac_reward_zero_std": 0.0, "grad_norm": 0.9664250612258911, "kl": 0.013437488116323948, "learning_rate": 2.7102703876393942e-06, "loss": 0.2968, "num_tokens": 1379831.0, "reward": 0.21250000596046448, "reward_std": 0.5254442691802979, "rewards/reward_func/mean": 0.21250000596046448, "rewards/reward_func/std": 0.5254442691802979, "sampling/importance_sampling_ratio/max": 1.784225583076477, "sampling/importance_sampling_ratio/mean": 1.1598105430603027, "sampling/importance_sampling_ratio/min": 0.3810417950153351, "sampling/sampling_logp_difference/max": 0.5660219192504883, "sampling/sampling_logp_difference/mean": 0.01819697581231594, "step": 490, "step_time": 30.81265167199308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.16736240684986115, "epoch": 0.491, "frac_reward_zero_std": 0.0, "grad_norm": 0.7280340790748596, "kl": 0.010734724812209606, "learning_rate": 2.702201100903511e-06, "loss": -0.1519, "num_tokens": 1382445.0, "reward": 0.7075000405311584, "reward_std": 0.5250635147094727, "rewards/reward_func/mean": 0.7075000405311584, "rewards/reward_func/std": 0.5250635147094727, "sampling/importance_sampling_ratio/max": 1.4367406368255615, "sampling/importance_sampling_ratio/mean": 0.9370206594467163, "sampling/importance_sampling_ratio/min": 0.5076593160629272, "sampling/sampling_logp_difference/max": 0.5285038948059082, "sampling/sampling_logp_difference/mean": 0.015835752710700035, "step": 491, "step_time": 25.99061773502035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1723225861787796, "epoch": 0.492, "frac_reward_zero_std": 0.0, "grad_norm": 0.6281760931015015, "kl": 0.009769605472683907, "learning_rate": 2.694129693173759e-06, "loss": -0.142, "num_tokens": 1385166.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 1.2545709609985352, "sampling/importance_sampling_ratio/mean": 0.9779781103134155, "sampling/importance_sampling_ratio/min": 0.6830295920372009, "sampling/sampling_logp_difference/max": 0.34907954931259155, "sampling/sampling_logp_difference/mean": 0.014659264124929905, "step": 492, "step_time": 15.197458020003978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1794813871383667, "epoch": 0.493, "frac_reward_zero_std": 0.0, "grad_norm": 0.7929661870002747, "kl": 0.006562918424606323, "learning_rate": 2.6860562491153854e-06, "loss": -0.0788, "num_tokens": 1387969.0, "reward": 0.6850000023841858, "reward_std": 0.5715767741203308, "rewards/reward_func/mean": 0.6850000023841858, "rewards/reward_func/std": 0.5715767741203308, "sampling/importance_sampling_ratio/max": 1.488052248954773, "sampling/importance_sampling_ratio/mean": 0.9847512245178223, "sampling/importance_sampling_ratio/min": 0.5077809691429138, "sampling/sampling_logp_difference/max": 0.2768237590789795, "sampling/sampling_logp_difference/mean": 0.018370725214481354, "step": 493, "step_time": 26.09608670300804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.17461852729320526, "epoch": 0.494, "frac_reward_zero_std": 0.0, "grad_norm": 0.8837519884109497, "kl": 0.014624289236962795, "learning_rate": 2.6779808534149986e-06, "loss": -0.0879, "num_tokens": 1390697.0, "reward": -0.06749999523162842, "reward_std": 0.08421202749013901, "rewards/reward_func/mean": -0.06749999523162842, "rewards/reward_func/std": 0.0842120349407196, "sampling/importance_sampling_ratio/max": 1.4831488132476807, "sampling/importance_sampling_ratio/mean": 0.8945019841194153, "sampling/importance_sampling_ratio/min": 0.5493906736373901, "sampling/sampling_logp_difference/max": 0.41962993144989014, "sampling/sampling_logp_difference/mean": 0.02042003720998764, "step": 494, "step_time": 39.09278319502482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.18586626648902893, "epoch": 0.495, "frac_reward_zero_std": 0.0, "grad_norm": 1.9716275930404663, "kl": 0.012399252504110336, "learning_rate": 2.6699035907796796e-06, "loss": 0.3259, "num_tokens": 1393889.0, "reward": 0.7425000071525574, "reward_std": 0.5083552002906799, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.5083552002906799, "sampling/importance_sampling_ratio/max": 1.6532326936721802, "sampling/importance_sampling_ratio/mean": 1.0114717483520508, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6867589950561523, "sampling/sampling_logp_difference/mean": 0.022546935826539993, "step": 495, "step_time": 25.205879007000476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 58.5, "completions/mean_terminated_length": 58.5, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.2140168398618698, "epoch": 0.496, "frac_reward_zero_std": 0.0, "grad_norm": 1.4342621564865112, "kl": 0.004829188343137503, "learning_rate": 2.6618245459360896e-06, "loss": 0.027, "num_tokens": 1396354.0, "reward": 0.7050000429153442, "reward_std": 0.5701754689216614, "rewards/reward_func/mean": 0.7050000429153442, "rewards/reward_func/std": 0.5701754093170166, "sampling/importance_sampling_ratio/max": 1.883225440979004, "sampling/importance_sampling_ratio/mean": 1.333125352859497, "sampling/importance_sampling_ratio/min": 1.1138962507247925, "sampling/sampling_logp_difference/max": 0.4690258502960205, "sampling/sampling_logp_difference/mean": 0.018021350726485252, "step": 496, "step_time": 19.976822575030383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.18866859376430511, "epoch": 0.497, "frac_reward_zero_std": 0.0, "grad_norm": 1.710184097290039, "kl": 0.00827334076166153, "learning_rate": 2.6537438036295876e-06, "loss": -0.1618, "num_tokens": 1399269.0, "reward": 0.4950000047683716, "reward_std": 0.5773791074752808, "rewards/reward_func/mean": 0.4950000047683716, "rewards/reward_func/std": 0.5773791670799255, "sampling/importance_sampling_ratio/max": 2.2289981842041016, "sampling/importance_sampling_ratio/mean": 1.5547765493392944, "sampling/importance_sampling_ratio/min": 0.8371957540512085, "sampling/sampling_logp_difference/max": 0.6419382095336914, "sampling/sampling_logp_difference/mean": 0.01867852918803692, "step": 497, "step_time": 22.492317979980726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.17990146577358246, "epoch": 0.498, "frac_reward_zero_std": 0.0, "grad_norm": 0.5475334525108337, "kl": 0.0055328975431621075, "learning_rate": 2.6456614486233344e-06, "loss": 0.0303, "num_tokens": 1401447.0, "reward": 0.7350000143051147, "reward_std": 0.5299999713897705, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.5300000309944153, "sampling/importance_sampling_ratio/max": 0.9481450319290161, "sampling/importance_sampling_ratio/mean": 0.7446837425231934, "sampling/importance_sampling_ratio/min": 0.5886037349700928, "sampling/sampling_logp_difference/max": 0.2662320137023926, "sampling/sampling_logp_difference/mean": 0.013630669564008713, "step": 498, "step_time": 16.643326268997043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.16305726766586304, "epoch": 0.499, "frac_reward_zero_std": 0.0, "grad_norm": 2.166003704071045, "kl": 0.011041740886867046, "learning_rate": 2.6375775656974124e-06, "loss": -0.0664, "num_tokens": 1404115.0, "reward": 0.7174999713897705, "reward_std": 0.5257613658905029, "rewards/reward_func/mean": 0.7174999713897705, "rewards/reward_func/std": 0.5257613658905029, "sampling/importance_sampling_ratio/max": 1.4016609191894531, "sampling/importance_sampling_ratio/mean": 1.1854861974716187, "sampling/importance_sampling_ratio/min": 0.8115931749343872, "sampling/sampling_logp_difference/max": 0.38892555236816406, "sampling/sampling_logp_difference/mean": 0.020091494545340538, "step": 499, "step_time": 17.759405763004906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 61.5, "completions/mean_terminated_length": 61.5, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.17052681744098663, "epoch": 0.5, "frac_reward_zero_std": 0.0, "grad_norm": 0.8255075216293335, "kl": 0.00986223854124546, "learning_rate": 2.6294922396479263e-06, "loss": -0.0037, "num_tokens": 1406840.0, "reward": 0.48250001668930054, "reward_std": 0.5918543934822083, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.591854453086853, "sampling/importance_sampling_ratio/max": 1.1053283214569092, "sampling/importance_sampling_ratio/mean": 0.8158130049705505, "sampling/importance_sampling_ratio/min": 0.5192359089851379, "sampling/sampling_logp_difference/max": 0.699356198310852, "sampling/sampling_logp_difference/mean": 0.017079275101423264, "step": 500, "step_time": 22.861548782966565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 59.0, "completions/mean_terminated_length": 59.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.25186288356781006, "epoch": 0.501, "frac_reward_zero_std": 0.0, "grad_norm": 1.2654176950454712, "kl": 0.02069578692317009, "learning_rate": 2.6214055552861213e-06, "loss": -0.0541, "num_tokens": 1409222.0, "reward": 0.9950000047683716, "reward_std": 0.009999990463256836, "rewards/reward_func/mean": 0.9950000047683716, "rewards/reward_func/std": 0.009999990463256836, "sampling/importance_sampling_ratio/max": 1.5382214784622192, "sampling/importance_sampling_ratio/mean": 0.9265186190605164, "sampling/importance_sampling_ratio/min": 0.5166036486625671, "sampling/sampling_logp_difference/max": 0.6103760004043579, "sampling/sampling_logp_difference/mean": 0.022312935441732407, "step": 501, "step_time": 15.314348345971666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.16949212551116943, "epoch": 0.502, "frac_reward_zero_std": 0.0, "grad_norm": 0.9278621673583984, "kl": 0.015267706476151943, "learning_rate": 2.613317597437489e-06, "loss": -0.228, "num_tokens": 1412243.0, "reward": 0.19249999523162842, "reward_std": 0.5361825227737427, "rewards/reward_func/mean": 0.19249999523162842, "rewards/reward_func/std": 0.5361825227737427, "sampling/importance_sampling_ratio/max": 1.0221174955368042, "sampling/importance_sampling_ratio/mean": 0.6522483825683594, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.3522300720214844, "sampling/sampling_logp_difference/mean": 0.018127264454960823, "step": 502, "step_time": 39.94463703001384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.23928596079349518, "epoch": 0.503, "frac_reward_zero_std": 0.0, "grad_norm": 1.4382623434066772, "kl": 0.009775945916771889, "learning_rate": 2.6052284509408805e-06, "loss": -0.3427, "num_tokens": 1415194.0, "reward": 0.7450000047683716, "reward_std": 0.5099999904632568, "rewards/reward_func/mean": 0.7450000047683716, "rewards/reward_func/std": 0.5099999904632568, "sampling/importance_sampling_ratio/max": 2.410933494567871, "sampling/importance_sampling_ratio/mean": 1.2314956188201904, "sampling/importance_sampling_ratio/min": 0.631985068321228, "sampling/sampling_logp_difference/max": 0.4792187213897705, "sampling/sampling_logp_difference/mean": 0.02336762100458145, "step": 503, "step_time": 21.74280083697522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.17725226283073425, "epoch": 0.504, "frac_reward_zero_std": 0.0, "grad_norm": 0.9843229055404663, "kl": 0.005810337606817484, "learning_rate": 2.5971382006476153e-06, "loss": -0.2121, "num_tokens": 1417960.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 1.7483123540878296, "sampling/importance_sampling_ratio/mean": 1.0398032665252686, "sampling/importance_sampling_ratio/min": 0.6585251688957214, "sampling/sampling_logp_difference/max": 0.41960692405700684, "sampling/sampling_logp_difference/mean": 0.014749791473150253, "step": 504, "step_time": 15.361991964979097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.14139562845230103, "epoch": 0.505, "frac_reward_zero_std": 0.0, "grad_norm": 0.7805476784706116, "kl": 0.01681564934551716, "learning_rate": 2.5890469314205898e-06, "loss": -0.1002, "num_tokens": 1420559.0, "reward": 0.20749999582767487, "reward_std": 0.5099918842315674, "rewards/reward_func/mean": 0.20749999582767487, "rewards/reward_func/std": 0.5099918842315674, "sampling/importance_sampling_ratio/max": 1.0651004314422607, "sampling/importance_sampling_ratio/mean": 0.8007462024688721, "sampling/importance_sampling_ratio/min": 0.4837033152580261, "sampling/sampling_logp_difference/max": 0.6889355182647705, "sampling/sampling_logp_difference/mean": 0.014064198359847069, "step": 505, "step_time": 29.849281275994144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.23172004520893097, "epoch": 0.506, "frac_reward_zero_std": 0.0, "grad_norm": 1.893568754196167, "kl": 0.014917182736098766, "learning_rate": 2.5809547281333904e-06, "loss": -0.0372, "num_tokens": 1422827.0, "reward": 0.737500011920929, "reward_std": 0.5249999761581421, "rewards/reward_func/mean": 0.737500011920929, "rewards/reward_func/std": 0.5250000357627869, "sampling/importance_sampling_ratio/max": 1.1942881345748901, "sampling/importance_sampling_ratio/mean": 1.1483240127563477, "sampling/importance_sampling_ratio/min": 1.1207529306411743, "sampling/sampling_logp_difference/max": 0.29467952251434326, "sampling/sampling_logp_difference/mean": 0.024883747100830078, "step": 506, "step_time": 13.03722249402199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1905321329832077, "epoch": 0.507, "frac_reward_zero_std": 0.0, "grad_norm": 0.946755051612854, "kl": 0.010907978750765324, "learning_rate": 2.5728616756693995e-06, "loss": 0.1727, "num_tokens": 1424997.0, "reward": -0.08249999582767487, "reward_std": 0.040311288088560104, "rewards/reward_func/mean": -0.08249999582767487, "rewards/reward_func/std": 0.040311288088560104, "sampling/importance_sampling_ratio/max": 1.3704488277435303, "sampling/importance_sampling_ratio/mean": 0.8930014371871948, "sampling/importance_sampling_ratio/min": 0.4328288435935974, "sampling/sampling_logp_difference/max": 0.47841382026672363, "sampling/sampling_logp_difference/mean": 0.022593386471271515, "step": 507, "step_time": 31.28051599598257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.1494290977716446, "epoch": 0.508, "frac_reward_zero_std": 0.0, "grad_norm": 1.4596552848815918, "kl": 0.005620867013931274, "learning_rate": 2.564767858920909e-06, "loss": 0.1029, "num_tokens": 1428254.0, "reward": 0.7400000095367432, "reward_std": 0.5000666379928589, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5000666975975037, "sampling/importance_sampling_ratio/max": 1.5702496767044067, "sampling/importance_sampling_ratio/mean": 1.0407274961471558, "sampling/importance_sampling_ratio/min": 0.7505809664726257, "sampling/sampling_logp_difference/max": 0.33408379554748535, "sampling/sampling_logp_difference/mean": 0.01773540861904621, "step": 508, "step_time": 26.952027681982145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.20422866940498352, "epoch": 0.509, "frac_reward_zero_std": 0.0, "grad_norm": 1.645565152168274, "kl": 0.01566474325954914, "learning_rate": 2.556673362788225e-06, "loss": 0.217, "num_tokens": 1431259.0, "reward": 0.4925000071525574, "reward_std": 0.5860247611999512, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5860247015953064, "sampling/importance_sampling_ratio/max": 1.065393090248108, "sampling/importance_sampling_ratio/mean": 0.7448695302009583, "sampling/importance_sampling_ratio/min": 0.2881966233253479, "sampling/sampling_logp_difference/max": 0.6105712652206421, "sampling/sampling_logp_difference/mean": 0.02120855823159218, "step": 509, "step_time": 28.023565278039314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.20430995523929596, "epoch": 0.51, "frac_reward_zero_std": 0.0, "grad_norm": 1.4816001653671265, "kl": 0.010862949304282665, "learning_rate": 2.5485782721787837e-06, "loss": 0.1558, "num_tokens": 1433769.0, "reward": 0.4449999928474426, "reward_std": 0.6011378169059753, "rewards/reward_func/mean": 0.4449999928474426, "rewards/reward_func/std": 0.6011378169059753, "sampling/importance_sampling_ratio/max": 1.843724012374878, "sampling/importance_sampling_ratio/mean": 1.3014360666275024, "sampling/importance_sampling_ratio/min": 0.7630718350410461, "sampling/sampling_logp_difference/max": 0.3087763786315918, "sampling/sampling_logp_difference/mean": 0.020773708820343018, "step": 510, "step_time": 26.12072815798456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.2285492867231369, "epoch": 0.511, "frac_reward_zero_std": 0.0, "grad_norm": 1.3879424333572388, "kl": 0.00728601822629571, "learning_rate": 2.5404826720062544e-06, "loss": 0.2652, "num_tokens": 1436203.0, "reward": 0.9825000166893005, "reward_std": 0.022173555567860603, "rewards/reward_func/mean": 0.9825000166893005, "rewards/reward_func/std": 0.02217356488108635, "sampling/importance_sampling_ratio/max": 1.4307284355163574, "sampling/importance_sampling_ratio/mean": 1.0828708410263062, "sampling/importance_sampling_ratio/min": 0.6216052770614624, "sampling/sampling_logp_difference/max": 0.353057861328125, "sampling/sampling_logp_difference/mean": 0.01966438814997673, "step": 511, "step_time": 13.415545655007008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.15970304608345032, "epoch": 0.512, "frac_reward_zero_std": 0.0, "grad_norm": 1.7856215238571167, "kl": 0.006165032275021076, "learning_rate": 2.532386647189651e-06, "loss": -0.6857, "num_tokens": 1439312.0, "reward": 0.4749999940395355, "reward_std": 0.6015812754631042, "rewards/reward_func/mean": 0.4749999940395355, "rewards/reward_func/std": 0.6015812158584595, "sampling/importance_sampling_ratio/max": 2.323873519897461, "sampling/importance_sampling_ratio/mean": 1.3109221458435059, "sampling/importance_sampling_ratio/min": 0.44600069522857666, "sampling/sampling_logp_difference/max": 0.5892675518989563, "sampling/sampling_logp_difference/mean": 0.02062816545367241, "step": 512, "step_time": 31.77552522404585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.18610021471977234, "epoch": 0.513, "frac_reward_zero_std": 0.0, "grad_norm": 1.0915244817733765, "kl": 0.013113862834870815, "learning_rate": 2.5242902826524435e-06, "loss": 0.0573, "num_tokens": 1442007.0, "reward": 0.23250000178813934, "reward_std": 0.5118186473846436, "rewards/reward_func/mean": 0.23250000178813934, "rewards/reward_func/std": 0.5118186473846436, "sampling/importance_sampling_ratio/max": 1.5604934692382812, "sampling/importance_sampling_ratio/mean": 1.0759522914886475, "sampling/importance_sampling_ratio/min": 0.6970991492271423, "sampling/sampling_logp_difference/max": 0.3094635009765625, "sampling/sampling_logp_difference/mean": 0.020704736933112144, "step": 513, "step_time": 23.874655373976566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.192142054438591, "epoch": 0.514, "frac_reward_zero_std": 1.0, "grad_norm": 0.008482052013278008, "kl": 0.017910774797201157, "learning_rate": 2.5161936633216656e-06, "loss": 0.0002, "num_tokens": 1444446.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.721121072769165, "sampling/importance_sampling_ratio/mean": 1.080458641052246, "sampling/importance_sampling_ratio/min": 0.5908563733100891, "sampling/sampling_logp_difference/max": 0.3777496814727783, "sampling/sampling_logp_difference/mean": 0.01762978360056877, "step": 514, "step_time": 10.647670577978715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.17675676941871643, "epoch": 0.515, "frac_reward_zero_std": 0.0, "grad_norm": 1.2516108751296997, "kl": 0.011988481506705284, "learning_rate": 2.5080968741270224e-06, "loss": -0.1829, "num_tokens": 1447439.0, "reward": -0.04749999940395355, "reward_std": 0.054999999701976776, "rewards/reward_func/mean": -0.04749999940395355, "rewards/reward_func/std": 0.055000003427267075, "sampling/importance_sampling_ratio/max": 1.7480775117874146, "sampling/importance_sampling_ratio/mean": 1.0923700332641602, "sampling/importance_sampling_ratio/min": 0.8533011674880981, "sampling/sampling_logp_difference/max": 0.39795398712158203, "sampling/sampling_logp_difference/mean": 0.018235115334391594, "step": 515, "step_time": 37.938585992960725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.19351741671562195, "epoch": 0.516, "frac_reward_zero_std": 0.0, "grad_norm": 0.6676006317138672, "kl": 0.008421016857028008, "learning_rate": 2.5e-06, "loss": -0.1758, "num_tokens": 1450371.0, "reward": 0.699999988079071, "reward_std": 0.5339163541793823, "rewards/reward_func/mean": 0.699999988079071, "rewards/reward_func/std": 0.5339163541793823, "sampling/importance_sampling_ratio/max": 1.19438898563385, "sampling/importance_sampling_ratio/mean": 0.7760181427001953, "sampling/importance_sampling_ratio/min": 0.457307904958725, "sampling/sampling_logp_difference/max": 0.33774706721305847, "sampling/sampling_logp_difference/mean": 0.018520092591643333, "step": 516, "step_time": 21.774747831979766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.14100562036037445, "epoch": 0.517, "frac_reward_zero_std": 0.0, "grad_norm": 0.8444004654884338, "kl": 0.00738612050190568, "learning_rate": 2.491903125872979e-06, "loss": -0.1184, "num_tokens": 1453268.0, "reward": 0.4950000047683716, "reward_std": 0.5831809639930725, "rewards/reward_func/mean": 0.4950000047683716, "rewards/reward_func/std": 0.5831809043884277, "sampling/importance_sampling_ratio/max": 1.2722883224487305, "sampling/importance_sampling_ratio/mean": 0.9399291276931763, "sampling/importance_sampling_ratio/min": 0.7480865716934204, "sampling/sampling_logp_difference/max": 0.6376893520355225, "sampling/sampling_logp_difference/mean": 0.014399477280676365, "step": 517, "step_time": 26.409534986014478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.18155449628829956, "epoch": 0.518, "frac_reward_zero_std": 0.0, "grad_norm": 1.1618047952651978, "kl": 0.01793098635971546, "learning_rate": 2.4838063366783353e-06, "loss": 0.1822, "num_tokens": 1455986.0, "reward": 0.20250000059604645, "reward_std": 0.5251904129981995, "rewards/reward_func/mean": 0.20250000059604645, "rewards/reward_func/std": 0.5251904726028442, "sampling/importance_sampling_ratio/max": 1.599894404411316, "sampling/importance_sampling_ratio/mean": 1.082505464553833, "sampling/importance_sampling_ratio/min": 0.6201211214065552, "sampling/sampling_logp_difference/max": 0.3489055633544922, "sampling/sampling_logp_difference/mean": 0.021818285807967186, "step": 518, "step_time": 26.970722856000066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 58.25, "completions/mean_terminated_length": 58.25, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.1831318587064743, "epoch": 0.519, "frac_reward_zero_std": 0.0, "grad_norm": 1.2378774881362915, "kl": 0.012651619501411915, "learning_rate": 2.4757097173475574e-06, "loss": 0.0667, "num_tokens": 1458894.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 1.0824073553085327, "sampling/importance_sampling_ratio/mean": 0.9266972541809082, "sampling/importance_sampling_ratio/min": 0.7813526391983032, "sampling/sampling_logp_difference/max": 0.5059165954589844, "sampling/sampling_logp_difference/mean": 0.02004372328519821, "step": 519, "step_time": 21.491843908966985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.16838109493255615, "epoch": 0.52, "frac_reward_zero_std": 0.0, "grad_norm": 0.8434763550758362, "kl": 0.007733389735221863, "learning_rate": 2.4676133528103497e-06, "loss": 0.0194, "num_tokens": 1461770.0, "reward": -0.03500000014901161, "reward_std": 0.0635085254907608, "rewards/reward_func/mean": -0.03500000014901161, "rewards/reward_func/std": 0.0635085329413414, "sampling/importance_sampling_ratio/max": 1.1065438985824585, "sampling/importance_sampling_ratio/mean": 0.9264258146286011, "sampling/importance_sampling_ratio/min": 0.6974965929985046, "sampling/sampling_logp_difference/max": 0.36107897758483887, "sampling/sampling_logp_difference/mean": 0.015059493482112885, "step": 520, "step_time": 39.35547555302037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.19911330938339233, "epoch": 0.521, "frac_reward_zero_std": 0.0, "grad_norm": 1.8318724632263184, "kl": 0.011490163393318653, "learning_rate": 2.4595173279937464e-06, "loss": -0.0109, "num_tokens": 1464185.0, "reward": 0.7400000095367432, "reward_std": 0.4936935603618622, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.4936935603618622, "sampling/importance_sampling_ratio/max": 1.0805097818374634, "sampling/importance_sampling_ratio/mean": 0.9681848287582397, "sampling/importance_sampling_ratio/min": 0.7221450805664062, "sampling/sampling_logp_difference/max": 0.34889841079711914, "sampling/sampling_logp_difference/mean": 0.01457962580025196, "step": 521, "step_time": 16.09977254498517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.18741485476493835, "epoch": 0.522, "frac_reward_zero_std": 0.0, "grad_norm": 1.0452879667282104, "kl": 0.012586656957864761, "learning_rate": 2.4514217278212167e-06, "loss": -0.0749, "num_tokens": 1466950.0, "reward": 0.987500011920929, "reward_std": 0.02499999664723873, "rewards/reward_func/mean": 0.987500011920929, "rewards/reward_func/std": 0.025000005960464478, "sampling/importance_sampling_ratio/max": 1.0689207315444946, "sampling/importance_sampling_ratio/mean": 0.7424354553222656, "sampling/importance_sampling_ratio/min": 0.5102779269218445, "sampling/sampling_logp_difference/max": 0.4722095727920532, "sampling/sampling_logp_difference/mean": 0.01803731732070446, "step": 522, "step_time": 14.279075495956931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.17661763727664948, "epoch": 0.523, "frac_reward_zero_std": 0.0, "grad_norm": 1.9156960248947144, "kl": 0.008152654394507408, "learning_rate": 2.4433266372117755e-06, "loss": -0.3714, "num_tokens": 1469876.0, "reward": 0.22750000655651093, "reward_std": 0.5155822038650513, "rewards/reward_func/mean": 0.22750000655651093, "rewards/reward_func/std": 0.5155822038650513, "sampling/importance_sampling_ratio/max": 1.866234540939331, "sampling/importance_sampling_ratio/mean": 1.3091788291931152, "sampling/importance_sampling_ratio/min": 0.9727583527565002, "sampling/sampling_logp_difference/max": 0.36745166778564453, "sampling/sampling_logp_difference/mean": 0.017901722341775894, "step": 523, "step_time": 30.717596149013843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.17863890528678894, "epoch": 0.524, "frac_reward_zero_std": 0.0, "grad_norm": 0.9030314683914185, "kl": 0.007508215494453907, "learning_rate": 2.435232141079092e-06, "loss": 0.1725, "num_tokens": 1472421.0, "reward": 0.20000000298023224, "reward_std": 0.5274466872215271, "rewards/reward_func/mean": 0.20000000298023224, "rewards/reward_func/std": 0.5274466872215271, "sampling/importance_sampling_ratio/max": 1.1555169820785522, "sampling/importance_sampling_ratio/mean": 0.8161022067070007, "sampling/importance_sampling_ratio/min": 0.5451900959014893, "sampling/sampling_logp_difference/max": 0.28760766983032227, "sampling/sampling_logp_difference/mean": 0.015879105776548386, "step": 524, "step_time": 25.066866231965832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "entropy": 0.14379380643367767, "epoch": 0.525, "frac_reward_zero_std": 0.0, "grad_norm": 0.7370908856391907, "kl": 0.011990920640528202, "learning_rate": 2.4271383243306017e-06, "loss": -0.0283, "num_tokens": 1475424.0, "reward": 0.7400000095367432, "reward_std": 0.5133549571037292, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5133549571037292, "sampling/importance_sampling_ratio/max": 1.0179579257965088, "sampling/importance_sampling_ratio/mean": 0.8601334095001221, "sampling/importance_sampling_ratio/min": 0.5928218364715576, "sampling/sampling_logp_difference/max": 0.3529176712036133, "sampling/sampling_logp_difference/mean": 0.014607901684939861, "step": 525, "step_time": 26.68306946498342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.14729547500610352, "epoch": 0.526, "frac_reward_zero_std": 0.0, "grad_norm": 0.7215335965156555, "kl": 0.009026481769979, "learning_rate": 2.419045271866611e-06, "loss": 0.0307, "num_tokens": 1478346.0, "reward": 0.1875, "reward_std": 0.5451223254203796, "rewards/reward_func/mean": 0.1875, "rewards/reward_func/std": 0.5451223254203796, "sampling/importance_sampling_ratio/max": 1.1420716047286987, "sampling/importance_sampling_ratio/mean": 0.8303381204605103, "sampling/importance_sampling_ratio/min": 0.4673976004123688, "sampling/sampling_logp_difference/max": 0.5289645195007324, "sampling/sampling_logp_difference/mean": 0.01608535833656788, "step": 526, "step_time": 27.129467924009077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.166289284825325, "epoch": 0.527, "frac_reward_zero_std": 0.0, "grad_norm": 1.398809552192688, "kl": 0.008139373734593391, "learning_rate": 2.410953068579411e-06, "loss": 0.1299, "num_tokens": 1481050.0, "reward": 0.7350000143051147, "reward_std": 0.4972256124019623, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.49722564220428467, "sampling/importance_sampling_ratio/max": 1.3298542499542236, "sampling/importance_sampling_ratio/mean": 0.7289666533470154, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.47699785232543945, "sampling/sampling_logp_difference/mean": 0.02605799213051796, "step": 527, "step_time": 18.169425265979953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 57.5, "completions/mean_terminated_length": 57.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.20324444770812988, "epoch": 0.528, "frac_reward_zero_std": 0.0, "grad_norm": 0.754597008228302, "kl": 0.008088250644505024, "learning_rate": 2.402861799352386e-06, "loss": 0.016, "num_tokens": 1483489.0, "reward": 0.7150000333786011, "reward_std": 0.5633530616760254, "rewards/reward_func/mean": 0.7150000333786011, "rewards/reward_func/std": 0.5633530616760254, "sampling/importance_sampling_ratio/max": 1.2615830898284912, "sampling/importance_sampling_ratio/mean": 0.8606312870979309, "sampling/importance_sampling_ratio/min": 0.4028084874153137, "sampling/sampling_logp_difference/max": 0.5268696546554565, "sampling/sampling_logp_difference/mean": 0.021507736295461655, "step": 528, "step_time": 19.545294457988348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.1742866486310959, "epoch": 0.529, "frac_reward_zero_std": 0.0, "grad_norm": 1.2617595195770264, "kl": 0.007816199213266373, "learning_rate": 2.3947715490591207e-06, "loss": -0.2317, "num_tokens": 1485938.0, "reward": 0.7174999713897705, "reward_std": 0.5386635065078735, "rewards/reward_func/mean": 0.7174999713897705, "rewards/reward_func/std": 0.5386635065078735, "sampling/importance_sampling_ratio/max": 1.918709635734558, "sampling/importance_sampling_ratio/mean": 1.129919171333313, "sampling/importance_sampling_ratio/min": 0.6727879047393799, "sampling/sampling_logp_difference/max": 0.38248884677886963, "sampling/sampling_logp_difference/mean": 0.018840909004211426, "step": 529, "step_time": 18.472583163005766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.13940078020095825, "epoch": 0.53, "frac_reward_zero_std": 0.0, "grad_norm": 0.7972716093063354, "kl": 0.008482039906084538, "learning_rate": 2.3866824025625123e-06, "loss": 0.1123, "num_tokens": 1488795.0, "reward": 0.4650000035762787, "reward_std": 0.596294105052948, "rewards/reward_func/mean": 0.4650000035762787, "rewards/reward_func/std": 0.596294105052948, "sampling/importance_sampling_ratio/max": 0.8242881894111633, "sampling/importance_sampling_ratio/mean": 0.7320878505706787, "sampling/importance_sampling_ratio/min": 0.5937121510505676, "sampling/sampling_logp_difference/max": 0.29733002185821533, "sampling/sampling_logp_difference/mean": 0.016436217352747917, "step": 530, "step_time": 30.412361131981015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.17818306386470795, "epoch": 0.531, "frac_reward_zero_std": 0.0, "grad_norm": 1.2425706386566162, "kl": 0.014305947348475456, "learning_rate": 2.3785944447138804e-06, "loss": -0.1203, "num_tokens": 1491469.0, "reward": 0.45250001549720764, "reward_std": 0.6053305268287659, "rewards/reward_func/mean": 0.45250001549720764, "rewards/reward_func/std": 0.6053305268287659, "sampling/importance_sampling_ratio/max": 1.2656933069229126, "sampling/importance_sampling_ratio/mean": 0.8115237951278687, "sampling/importance_sampling_ratio/min": 0.3949430286884308, "sampling/sampling_logp_difference/max": 0.5058352947235107, "sampling/sampling_logp_difference/mean": 0.022585075348615646, "step": 531, "step_time": 24.967694299004506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.20081201195716858, "epoch": 0.532, "frac_reward_zero_std": 0.0, "grad_norm": 1.0724437236785889, "kl": 0.010874232277274132, "learning_rate": 2.370507760352074e-06, "loss": -0.1851, "num_tokens": 1494080.0, "reward": 0.45249998569488525, "reward_std": 0.6037313938140869, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.6037314534187317, "sampling/importance_sampling_ratio/max": 1.407867670059204, "sampling/importance_sampling_ratio/mean": 1.033663034439087, "sampling/importance_sampling_ratio/min": 0.824311375617981, "sampling/sampling_logp_difference/max": 0.3961402177810669, "sampling/sampling_logp_difference/mean": 0.01788468472659588, "step": 532, "step_time": 21.77053941297345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.19946612417697906, "epoch": 0.533, "frac_reward_zero_std": 0.0, "grad_norm": 1.3230572938919067, "kl": 0.03468668833374977, "learning_rate": 2.362422434302588e-06, "loss": 0.504, "num_tokens": 1496938.0, "reward": 0.49000000953674316, "reward_std": 0.5888972282409668, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5888972282409668, "sampling/importance_sampling_ratio/max": 2.244739294052124, "sampling/importance_sampling_ratio/mean": 1.1178169250488281, "sampling/importance_sampling_ratio/min": 0.5032606720924377, "sampling/sampling_logp_difference/max": 1.2163681983947754, "sampling/sampling_logp_difference/mean": 0.028579946607351303, "step": 533, "step_time": 27.13377457804745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 46.25, "completions/mean_terminated_length": 46.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.1647951304912567, "epoch": 0.534, "frac_reward_zero_std": 0.0, "grad_norm": 0.9566431045532227, "kl": 0.013867082074284554, "learning_rate": 2.3543385513766656e-06, "loss": 0.1023, "num_tokens": 1499691.0, "reward": 0.4675000011920929, "reward_std": 0.6171641945838928, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.6171642541885376, "sampling/importance_sampling_ratio/max": 0.9357319474220276, "sampling/importance_sampling_ratio/mean": 0.6655901670455933, "sampling/importance_sampling_ratio/min": 0.44350138306617737, "sampling/sampling_logp_difference/max": 0.7063963413238525, "sampling/sampling_logp_difference/mean": 0.02746039256453514, "step": 534, "step_time": 20.59041227598209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 57.5, "completions/mean_terminated_length": 57.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.17848798632621765, "epoch": 0.535, "frac_reward_zero_std": 0.0, "grad_norm": 0.8498501181602478, "kl": 0.01196146197617054, "learning_rate": 2.3462561963704132e-06, "loss": -0.1697, "num_tokens": 1502370.0, "reward": 0.45249998569488525, "reward_std": 0.6325280070304871, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.6325280070304871, "sampling/importance_sampling_ratio/max": 1.0234965085983276, "sampling/importance_sampling_ratio/mean": 0.733917772769928, "sampling/importance_sampling_ratio/min": 0.30929893255233765, "sampling/sampling_logp_difference/max": 1.0350680351257324, "sampling/sampling_logp_difference/mean": 0.026453692466020584, "step": 535, "step_time": 24.445700572046917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.17638397216796875, "epoch": 0.536, "frac_reward_zero_std": 0.0, "grad_norm": 0.9261518120765686, "kl": 0.01115446723997593, "learning_rate": 2.3381754540639108e-06, "loss": -0.1435, "num_tokens": 1505296.0, "reward": 0.45749998092651367, "reward_std": 0.6275016665458679, "rewards/reward_func/mean": 0.45749998092651367, "rewards/reward_func/std": 0.6275016069412231, "sampling/importance_sampling_ratio/max": 1.2285207509994507, "sampling/importance_sampling_ratio/mean": 0.8515561819076538, "sampling/importance_sampling_ratio/min": 0.33226272463798523, "sampling/sampling_logp_difference/max": 0.47838377952575684, "sampling/sampling_logp_difference/mean": 0.014927973970770836, "step": 536, "step_time": 21.155329635948874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.15329031646251678, "epoch": 0.537, "frac_reward_zero_std": 0.0, "grad_norm": 0.6353661417961121, "kl": 0.015477515757083893, "learning_rate": 2.330096409220321e-06, "loss": -0.1097, "num_tokens": 1507929.0, "reward": 0.7450000047683716, "reward_std": 0.5033554434776306, "rewards/reward_func/mean": 0.7450000047683716, "rewards/reward_func/std": 0.5033553838729858, "sampling/importance_sampling_ratio/max": 1.1324278116226196, "sampling/importance_sampling_ratio/mean": 0.7372781038284302, "sampling/importance_sampling_ratio/min": 0.5220059752464294, "sampling/sampling_logp_difference/max": 0.3785877227783203, "sampling/sampling_logp_difference/mean": 0.015728922560811043, "step": 537, "step_time": 14.985001627996098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.20855900645256042, "epoch": 0.538, "frac_reward_zero_std": 0.0, "grad_norm": 1.7643048763275146, "kl": 0.010992582887411118, "learning_rate": 2.3220191465850014e-06, "loss": -0.1111, "num_tokens": 1510193.0, "reward": 0.19499999284744263, "reward_std": 0.5376802086830139, "rewards/reward_func/mean": 0.19499999284744263, "rewards/reward_func/std": 0.5376802086830139, "sampling/importance_sampling_ratio/max": 1.476129412651062, "sampling/importance_sampling_ratio/mean": 1.1287429332733154, "sampling/importance_sampling_ratio/min": 0.8235626220703125, "sampling/sampling_logp_difference/max": 0.5869985818862915, "sampling/sampling_logp_difference/mean": 0.02106183022260666, "step": 538, "step_time": 25.34966034599347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.19438575208187103, "epoch": 0.539, "frac_reward_zero_std": 0.0, "grad_norm": 1.584336519241333, "kl": 0.00730791175737977, "learning_rate": 2.3139437508846155e-06, "loss": 0.3355, "num_tokens": 1512948.0, "reward": 0.7325000166893005, "reward_std": 0.5217518210411072, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.521751880645752, "sampling/importance_sampling_ratio/max": 1.8651015758514404, "sampling/importance_sampling_ratio/mean": 1.2515689134597778, "sampling/importance_sampling_ratio/min": 0.5448046326637268, "sampling/sampling_logp_difference/max": 0.30795323848724365, "sampling/sampling_logp_difference/mean": 0.02124161459505558, "step": 539, "step_time": 24.067540337971877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.18355119228363037, "epoch": 0.54, "frac_reward_zero_std": 0.0, "grad_norm": 0.5327951908111572, "kl": 0.01167966052889824, "learning_rate": 2.3058703068262424e-06, "loss": 0.1243, "num_tokens": 1515354.0, "reward": 0.7050000429153442, "reward_std": 0.5700584650039673, "rewards/reward_func/mean": 0.7050000429153442, "rewards/reward_func/std": 0.5700584650039673, "sampling/importance_sampling_ratio/max": 0.764620304107666, "sampling/importance_sampling_ratio/mean": 0.4999999403953552, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.4059181213378906, "sampling/sampling_logp_difference/mean": 0.021746091544628143, "step": 540, "step_time": 16.71362544398289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.15002837777137756, "epoch": 0.541, "frac_reward_zero_std": 0.0, "grad_norm": 1.2311583757400513, "kl": 0.0030083279125392437, "learning_rate": 2.29779889909649e-06, "loss": -0.0473, "num_tokens": 1517568.0, "reward": 0.7300000190734863, "reward_std": 0.5333541631698608, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5333541631698608, "sampling/importance_sampling_ratio/max": 1.2323473691940308, "sampling/importance_sampling_ratio/mean": 0.9639792442321777, "sampling/importance_sampling_ratio/min": 0.686269223690033, "sampling/sampling_logp_difference/max": 0.19243264198303223, "sampling/sampling_logp_difference/mean": 0.011879880912601948, "step": 541, "step_time": 15.196048629004508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.21146175265312195, "epoch": 0.542, "frac_reward_zero_std": 0.0, "grad_norm": 0.7916557788848877, "kl": 0.013928363099694252, "learning_rate": 2.289729612360606e-06, "loss": 0.0797, "num_tokens": 1520513.0, "reward": 0.48750001192092896, "reward_std": 0.5804237127304077, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5804236531257629, "sampling/importance_sampling_ratio/max": 1.1241979598999023, "sampling/importance_sampling_ratio/mean": 0.7101346254348755, "sampling/importance_sampling_ratio/min": 0.4773297905921936, "sampling/sampling_logp_difference/max": 0.3672764301300049, "sampling/sampling_logp_difference/mean": 0.0200125090777874, "step": 542, "step_time": 36.038357565994374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.18652306497097015, "epoch": 0.543, "frac_reward_zero_std": 0.0, "grad_norm": 2.43630051612854, "kl": 0.02263796329498291, "learning_rate": 2.2816625312615903e-06, "loss": 0.2206, "num_tokens": 1523526.0, "reward": 0.4724999964237213, "reward_std": 0.6097745299339294, "rewards/reward_func/mean": 0.4724999964237213, "rewards/reward_func/std": 0.6097745895385742, "sampling/importance_sampling_ratio/max": 2.1651194095611572, "sampling/importance_sampling_ratio/mean": 1.468912124633789, "sampling/importance_sampling_ratio/min": 0.6567851901054382, "sampling/sampling_logp_difference/max": 1.0532493591308594, "sampling/sampling_logp_difference/mean": 0.03162749856710434, "step": 543, "step_time": 31.075706564995926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.15327595174312592, "epoch": 0.544, "frac_reward_zero_std": 0.0, "grad_norm": 1.2160587310791016, "kl": 0.010151979513466358, "learning_rate": 2.273597740419306e-06, "loss": -0.4477, "num_tokens": 1526807.0, "reward": 0.4750000238418579, "reward_std": 0.5949509739875793, "rewards/reward_func/mean": 0.4750000238418579, "rewards/reward_func/std": 0.5949509739875793, "sampling/importance_sampling_ratio/max": 1.9238375425338745, "sampling/importance_sampling_ratio/mean": 0.9990794658660889, "sampling/importance_sampling_ratio/min": 0.3582225739955902, "sampling/sampling_logp_difference/max": 0.5171002149581909, "sampling/sampling_logp_difference/mean": 0.018163105472922325, "step": 544, "step_time": 27.20578465901781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.16996438801288605, "epoch": 0.545, "frac_reward_zero_std": 0.0, "grad_norm": 1.3375942707061768, "kl": 0.016139943152666092, "learning_rate": 2.265535324429593e-06, "loss": 0.3377, "num_tokens": 1529760.0, "reward": 0.48750001192092896, "reward_std": 0.5860816240310669, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5860816240310669, "sampling/importance_sampling_ratio/max": 2.1955041885375977, "sampling/importance_sampling_ratio/mean": 1.2705998420715332, "sampling/importance_sampling_ratio/min": 0.80571448802948, "sampling/sampling_logp_difference/max": 0.4598921537399292, "sampling/sampling_logp_difference/mean": 0.01941205933690071, "step": 545, "step_time": 25.924980301992036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.16664376854896545, "epoch": 0.546, "frac_reward_zero_std": 0.0, "grad_norm": 0.8485649824142456, "kl": 0.007426074240356684, "learning_rate": 2.25747536786338e-06, "loss": -0.03, "num_tokens": 1532639.0, "reward": 0.7400000095367432, "reward_std": 0.5001999735832214, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5001999735832214, "sampling/importance_sampling_ratio/max": 1.1380683183670044, "sampling/importance_sampling_ratio/mean": 0.8656550049781799, "sampling/importance_sampling_ratio/min": 0.5211918950080872, "sampling/sampling_logp_difference/max": 0.38797032833099365, "sampling/sampling_logp_difference/mean": 0.01528889685869217, "step": 546, "step_time": 17.495372254983522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.15959912538528442, "epoch": 0.547, "frac_reward_zero_std": 0.0, "grad_norm": 0.9632635116577148, "kl": 0.01292438618838787, "learning_rate": 2.2494179552657977e-06, "loss": 0.0933, "num_tokens": 1535454.0, "reward": 0.4650000035762787, "reward_std": 0.6204031109809875, "rewards/reward_func/mean": 0.4650000035762787, "rewards/reward_func/std": 0.6204030513763428, "sampling/importance_sampling_ratio/max": 1.0888525247573853, "sampling/importance_sampling_ratio/mean": 0.7432224750518799, "sampling/importance_sampling_ratio/min": 0.49483758211135864, "sampling/sampling_logp_difference/max": 0.6475106477737427, "sampling/sampling_logp_difference/mean": 0.021321093663573265, "step": 547, "step_time": 27.391975069011096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.1825985163450241, "epoch": 0.548, "frac_reward_zero_std": 0.0, "grad_norm": 3.443380117416382, "kl": 0.011872466653585434, "learning_rate": 2.241363171155291e-06, "loss": -0.0021, "num_tokens": 1538455.0, "reward": 0.4775000214576721, "reward_std": 0.6034553050994873, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.6034553050994873, "sampling/importance_sampling_ratio/max": 1.975149154663086, "sampling/importance_sampling_ratio/mean": 1.3384923934936523, "sampling/importance_sampling_ratio/min": 0.7999183535575867, "sampling/sampling_logp_difference/max": 0.659188985824585, "sampling/sampling_logp_difference/mean": 0.018748536705970764, "step": 548, "step_time": 24.796489498985466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 59.0, "completions/mean_terminated_length": 59.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.15279485285282135, "epoch": 0.549, "frac_reward_zero_std": 0.0, "grad_norm": 0.560249924659729, "kl": 0.004618494305759668, "learning_rate": 2.2333111000227343e-06, "loss": -0.0835, "num_tokens": 1540827.0, "reward": 0.9850000143051147, "reward_std": 0.02999999187886715, "rewards/reward_func/mean": 0.9850000143051147, "rewards/reward_func/std": 0.030000001192092896, "sampling/importance_sampling_ratio/max": 0.9628943800926208, "sampling/importance_sampling_ratio/mean": 0.8231489658355713, "sampling/importance_sampling_ratio/min": 0.6100350618362427, "sampling/sampling_logp_difference/max": 0.5080480575561523, "sampling/sampling_logp_difference/mean": 0.01045480277389288, "step": 549, "step_time": 16.673855780041777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 59.5, "completions/mean_terminated_length": 59.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.2084929198026657, "epoch": 0.55, "frac_reward_zero_std": 0.0, "grad_norm": 1.401795506477356, "kl": 0.011550961062312126, "learning_rate": 2.225261826330543e-06, "loss": -0.1234, "num_tokens": 1543544.0, "reward": 0.4775000214576721, "reward_std": 0.6040074825286865, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.6040074825286865, "sampling/importance_sampling_ratio/max": 1.4999781847000122, "sampling/importance_sampling_ratio/mean": 1.0326167345046997, "sampling/importance_sampling_ratio/min": 0.5983251333236694, "sampling/sampling_logp_difference/max": 0.49149179458618164, "sampling/sampling_logp_difference/mean": 0.021012894809246063, "step": 550, "step_time": 28.006484573008493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.17507153749465942, "epoch": 0.551, "frac_reward_zero_std": 0.0, "grad_norm": 1.0373094081878662, "kl": 0.013095785863697529, "learning_rate": 2.2172154345117896e-06, "loss": -0.0395, "num_tokens": 1546795.0, "reward": 0.4975000023841858, "reward_std": 0.5802513957023621, "rewards/reward_func/mean": 0.4975000023841858, "rewards/reward_func/std": 0.5802513957023621, "sampling/importance_sampling_ratio/max": 0.9167131185531616, "sampling/importance_sampling_ratio/mean": 0.8577436208724976, "sampling/importance_sampling_ratio/min": 0.7917744517326355, "sampling/sampling_logp_difference/max": 0.2589302062988281, "sampling/sampling_logp_difference/mean": 0.013819430954754353, "step": 551, "step_time": 28.05247093900107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 47.0, "completions/mean_terminated_length": 47.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.1935310810804367, "epoch": 0.552, "frac_reward_zero_std": 0.0, "grad_norm": 1.0377888679504395, "kl": 0.0087743503972888, "learning_rate": 2.209172008969317e-06, "loss": -0.1455, "num_tokens": 1549234.0, "reward": 0.4399999976158142, "reward_std": 0.6468384861946106, "rewards/reward_func/mean": 0.4399999976158142, "rewards/reward_func/std": 0.6468384861946106, "sampling/importance_sampling_ratio/max": 1.0421725511550903, "sampling/importance_sampling_ratio/mean": 0.8222566843032837, "sampling/importance_sampling_ratio/min": 0.6519286632537842, "sampling/sampling_logp_difference/max": 0.4905295968055725, "sampling/sampling_logp_difference/mean": 0.02359793521463871, "step": 552, "step_time": 22.694751097005792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.1965019553899765, "epoch": 0.553, "frac_reward_zero_std": 0.0, "grad_norm": 1.491653323173523, "kl": 0.005615359637886286, "learning_rate": 2.2011316340748533e-06, "loss": 0.0919, "num_tokens": 1551599.0, "reward": 0.737500011920929, "reward_std": 0.5116883516311646, "rewards/reward_func/mean": 0.737500011920929, "rewards/reward_func/std": 0.5116883516311646, "sampling/importance_sampling_ratio/max": 1.1223933696746826, "sampling/importance_sampling_ratio/mean": 0.9234539270401001, "sampling/importance_sampling_ratio/min": 0.5564112663269043, "sampling/sampling_logp_difference/max": 0.3128211498260498, "sampling/sampling_logp_difference/mean": 0.021106241270899773, "step": 553, "step_time": 14.317525877966546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1963396668434143, "epoch": 0.554, "frac_reward_zero_std": 0.0, "grad_norm": 0.6469763517379761, "kl": 0.03931301087141037, "learning_rate": 2.1930943941681255e-06, "loss": -0.3369, "num_tokens": 1554641.0, "reward": 0.7325000166893005, "reward_std": 0.5216879844665527, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5216879844665527, "sampling/importance_sampling_ratio/max": 1.881239414215088, "sampling/importance_sampling_ratio/mean": 0.8046857118606567, "sampling/importance_sampling_ratio/min": 0.15505538880825043, "sampling/sampling_logp_difference/max": 0.8890175819396973, "sampling/sampling_logp_difference/mean": 0.036655493080616, "step": 554, "step_time": 29.9529142450192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.17906050384044647, "epoch": 0.555, "frac_reward_zero_std": 0.0, "grad_norm": 1.2063989639282227, "kl": 0.016760854050517082, "learning_rate": 2.185060373555978e-06, "loss": -0.2184, "num_tokens": 1557332.0, "reward": 0.737500011920929, "reward_std": 0.5183547735214233, "rewards/reward_func/mean": 0.737500011920929, "rewards/reward_func/std": 0.5183547735214233, "sampling/importance_sampling_ratio/max": 1.7113896608352661, "sampling/importance_sampling_ratio/mean": 0.997212290763855, "sampling/importance_sampling_ratio/min": 0.3892698884010315, "sampling/sampling_logp_difference/max": 0.48302412033081055, "sampling/sampling_logp_difference/mean": 0.020756032317876816, "step": 555, "step_time": 13.989471086009871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.15140654146671295, "epoch": 0.556, "frac_reward_zero_std": 0.0, "grad_norm": 0.9366339445114136, "kl": 0.006299484521150589, "learning_rate": 2.1770296565114847e-06, "loss": -0.2567, "num_tokens": 1559914.0, "reward": 0.4350000023841858, "reward_std": 0.6245265007019043, "rewards/reward_func/mean": 0.4350000023841858, "rewards/reward_func/std": 0.6245265007019043, "sampling/importance_sampling_ratio/max": 1.2573277950286865, "sampling/importance_sampling_ratio/mean": 1.002599835395813, "sampling/importance_sampling_ratio/min": 0.6998505592346191, "sampling/sampling_logp_difference/max": 0.310474157333374, "sampling/sampling_logp_difference/mean": 0.011613940820097923, "step": 556, "step_time": 27.522587520012166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1880137324333191, "epoch": 0.557, "frac_reward_zero_std": 0.0, "grad_norm": 1.1455353498458862, "kl": 0.010775743052363396, "learning_rate": 2.169002327273068e-06, "loss": -0.1295, "num_tokens": 1562544.0, "reward": 0.4650000035762787, "reward_std": 0.5956228971481323, "rewards/reward_func/mean": 0.4650000035762787, "rewards/reward_func/std": 0.5956228971481323, "sampling/importance_sampling_ratio/max": 1.2928119897842407, "sampling/importance_sampling_ratio/mean": 1.0278964042663574, "sampling/importance_sampling_ratio/min": 0.899336576461792, "sampling/sampling_logp_difference/max": 0.4132804870605469, "sampling/sampling_logp_difference/mean": 0.01673755794763565, "step": 557, "step_time": 30.28098002698971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.18321138620376587, "epoch": 0.558, "frac_reward_zero_std": 0.0, "grad_norm": 0.68003910779953, "kl": 0.018360601738095284, "learning_rate": 2.1609784700436122e-06, "loss": -0.008, "num_tokens": 1565574.0, "reward": 0.24000000953674316, "reward_std": 0.5068201422691345, "rewards/reward_func/mean": 0.24000000953674316, "rewards/reward_func/std": 0.5068202018737793, "sampling/importance_sampling_ratio/max": 1.2733200788497925, "sampling/importance_sampling_ratio/mean": 0.8020591735839844, "sampling/importance_sampling_ratio/min": 0.5536126494407654, "sampling/sampling_logp_difference/max": 0.7074928283691406, "sampling/sampling_logp_difference/mean": 0.017244476824998856, "step": 558, "step_time": 22.527393996017054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.13634589314460754, "epoch": 0.559, "frac_reward_zero_std": 0.0, "grad_norm": 1.1534173488616943, "kl": 0.013781298883259296, "learning_rate": 2.1529581689895838e-06, "loss": 0.0767, "num_tokens": 1568534.0, "reward": 0.9975000023841858, "reward_std": 0.004999995231628418, "rewards/reward_func/mean": 0.9975000023841858, "rewards/reward_func/std": 0.004999995231628418, "sampling/importance_sampling_ratio/max": 1.4943662881851196, "sampling/importance_sampling_ratio/mean": 0.9975372552871704, "sampling/importance_sampling_ratio/min": 0.6996281147003174, "sampling/sampling_logp_difference/max": 0.5049158334732056, "sampling/sampling_logp_difference/mean": 0.017069125548005104, "step": 559, "step_time": 23.435088104975875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1607673466205597, "epoch": 0.56, "frac_reward_zero_std": 0.0, "grad_norm": 0.6935126185417175, "kl": 0.0070341844111680984, "learning_rate": 2.1449415082401455e-06, "loss": -0.0265, "num_tokens": 1570963.0, "reward": 0.7275000214576721, "reward_std": 0.4987567961215973, "rewards/reward_func/mean": 0.7275000214576721, "rewards/reward_func/std": 0.4987567663192749, "sampling/importance_sampling_ratio/max": 0.9071668386459351, "sampling/importance_sampling_ratio/mean": 0.7926523685455322, "sampling/importance_sampling_ratio/min": 0.7285634875297546, "sampling/sampling_logp_difference/max": 0.31881195306777954, "sampling/sampling_logp_difference/mean": 0.012405512854456902, "step": 560, "step_time": 17.11262461298611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.17074787616729736, "epoch": 0.561, "frac_reward_zero_std": 0.0, "grad_norm": 0.9756953120231628, "kl": 0.0076380237005651, "learning_rate": 2.136928571886275e-06, "loss": 0.2236, "num_tokens": 1573250.0, "reward": 0.44999998807907104, "reward_std": 0.6403124332427979, "rewards/reward_func/mean": 0.44999998807907104, "rewards/reward_func/std": 0.6403124332427979, "sampling/importance_sampling_ratio/max": 1.7570196390151978, "sampling/importance_sampling_ratio/mean": 0.9478094577789307, "sampling/importance_sampling_ratio/min": 0.5307155847549438, "sampling/sampling_logp_difference/max": 0.4615117311477661, "sampling/sampling_logp_difference/mean": 0.015140177682042122, "step": 561, "step_time": 18.707036845968105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.18240706622600555, "epoch": 0.562, "frac_reward_zero_std": 0.0, "grad_norm": 0.9161487221717834, "kl": 0.008828874677419662, "learning_rate": 2.128919443979882e-06, "loss": 0.0056, "num_tokens": 1575809.0, "reward": 0.4975000023841858, "reward_std": 0.5802513957023621, "rewards/reward_func/mean": 0.4975000023841858, "rewards/reward_func/std": 0.5802513957023621, "sampling/importance_sampling_ratio/max": 1.0961627960205078, "sampling/importance_sampling_ratio/mean": 0.9545211791992188, "sampling/importance_sampling_ratio/min": 0.764712929725647, "sampling/sampling_logp_difference/max": 0.2911878824234009, "sampling/sampling_logp_difference/mean": 0.01353977620601654, "step": 562, "step_time": 19.942122620006558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.16890135407447815, "epoch": 0.563, "frac_reward_zero_std": 0.0, "grad_norm": 1.6746490001678467, "kl": 0.012181010097265244, "learning_rate": 2.12091420853293e-06, "loss": 0.204, "num_tokens": 1578820.0, "reward": 0.7350000143051147, "reward_std": 0.49027201533317566, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.49027204513549805, "sampling/importance_sampling_ratio/max": 1.197340726852417, "sampling/importance_sampling_ratio/mean": 1.0842669010162354, "sampling/importance_sampling_ratio/min": 0.9950591921806335, "sampling/sampling_logp_difference/max": 0.4949207305908203, "sampling/sampling_logp_difference/mean": 0.015737464651465416, "step": 563, "step_time": 28.231109806976747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.1925123929977417, "epoch": 0.564, "frac_reward_zero_std": 0.0, "grad_norm": 1.0380994081497192, "kl": 0.010998104698956013, "learning_rate": 2.1129129495165508e-06, "loss": -0.112, "num_tokens": 1581695.0, "reward": 0.6875, "reward_std": 0.47296053171157837, "rewards/reward_func/mean": 0.6875, "rewards/reward_func/std": 0.47296053171157837, "sampling/importance_sampling_ratio/max": 1.6410164833068848, "sampling/importance_sampling_ratio/mean": 1.1356191635131836, "sampling/importance_sampling_ratio/min": 0.6680893898010254, "sampling/sampling_logp_difference/max": 0.5261433124542236, "sampling/sampling_logp_difference/mean": 0.020300768315792084, "step": 564, "step_time": 32.89950092602521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.21554763615131378, "epoch": 0.565, "frac_reward_zero_std": 0.0, "grad_norm": 1.1489139795303345, "kl": 0.010165700688958168, "learning_rate": 2.104915750860164e-06, "loss": -0.1273, "num_tokens": 1584339.0, "reward": 0.23000000417232513, "reward_std": 0.5134848356246948, "rewards/reward_func/mean": 0.23000000417232513, "rewards/reward_func/std": 0.5134848356246948, "sampling/importance_sampling_ratio/max": 1.5302848815917969, "sampling/importance_sampling_ratio/mean": 1.1176159381866455, "sampling/importance_sampling_ratio/min": 0.6175261735916138, "sampling/sampling_logp_difference/max": 0.6813218593597412, "sampling/sampling_logp_difference/mean": 0.022268250584602356, "step": 565, "step_time": 25.675593061023392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.20563244819641113, "epoch": 0.566, "frac_reward_zero_std": 0.0, "grad_norm": 0.9566208720207214, "kl": 0.0067217121832072735, "learning_rate": 2.0969226964506007e-06, "loss": -0.3049, "num_tokens": 1586594.0, "reward": 0.6625000238418579, "reward_std": 0.5435301661491394, "rewards/reward_func/mean": 0.6625000238418579, "rewards/reward_func/std": 0.5435301065444946, "sampling/importance_sampling_ratio/max": 2.1684482097625732, "sampling/importance_sampling_ratio/mean": 1.4941298961639404, "sampling/importance_sampling_ratio/min": 0.6982694268226624, "sampling/sampling_logp_difference/max": 0.47567856311798096, "sampling/sampling_logp_difference/mean": 0.019946280866861343, "step": 566, "step_time": 23.137613758968655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.16373972594738007, "epoch": 0.567, "frac_reward_zero_std": 0.0, "grad_norm": 1.4902416467666626, "kl": 0.010137942619621754, "learning_rate": 2.0889338701312184e-06, "loss": -0.4467, "num_tokens": 1589405.0, "reward": 0.4675000011920929, "reward_std": 0.6171642541885376, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.6171642541885376, "sampling/importance_sampling_ratio/max": 2.313650608062744, "sampling/importance_sampling_ratio/mean": 1.0985288619995117, "sampling/importance_sampling_ratio/min": 0.3526371419429779, "sampling/sampling_logp_difference/max": 0.714303195476532, "sampling/sampling_logp_difference/mean": 0.01857610233128071, "step": 567, "step_time": 23.3855633129715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.19416585564613342, "epoch": 0.568, "frac_reward_zero_std": 0.0, "grad_norm": 1.3988885879516602, "kl": 0.019713953137397766, "learning_rate": 2.080949355701025e-06, "loss": -0.1722, "num_tokens": 1591768.0, "reward": 0.45249998569488525, "reward_std": 0.6323171257972717, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.6323171854019165, "sampling/importance_sampling_ratio/max": 1.5450454950332642, "sampling/importance_sampling_ratio/mean": 1.0158276557922363, "sampling/importance_sampling_ratio/min": 0.4932379722595215, "sampling/sampling_logp_difference/max": 0.6672790050506592, "sampling/sampling_logp_difference/mean": 0.024283327162265778, "step": 568, "step_time": 21.466254916973412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.17207637429237366, "epoch": 0.569, "frac_reward_zero_std": 0.0, "grad_norm": 1.1548738479614258, "kl": 0.010397680103778839, "learning_rate": 2.072969236913799e-06, "loss": -0.1417, "num_tokens": 1594820.0, "reward": 0.45749998092651367, "reward_std": 0.6302579045295715, "rewards/reward_func/mean": 0.45749998092651367, "rewards/reward_func/std": 0.6302579045295715, "sampling/importance_sampling_ratio/max": 1.1821357011795044, "sampling/importance_sampling_ratio/mean": 0.9595094323158264, "sampling/importance_sampling_ratio/min": 0.40532124042510986, "sampling/sampling_logp_difference/max": 0.3690462112426758, "sampling/sampling_logp_difference/mean": 0.015716174617409706, "step": 569, "step_time": 22.32372966100229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.1591309756040573, "epoch": 0.57, "frac_reward_zero_std": 0.0, "grad_norm": 1.9510987997055054, "kl": 0.023440148681402206, "learning_rate": 2.0649935974772104e-06, "loss": 0.0434, "num_tokens": 1598168.0, "reward": 0.9724999666213989, "reward_std": 0.05500000715255737, "rewards/reward_func/mean": 0.9724999666213989, "rewards/reward_func/std": 0.05500000715255737, "sampling/importance_sampling_ratio/max": 1.2546721696853638, "sampling/importance_sampling_ratio/mean": 0.9336391687393188, "sampling/importance_sampling_ratio/min": 0.7026205658912659, "sampling/sampling_logp_difference/max": 0.3571431636810303, "sampling/sampling_logp_difference/mean": 0.01680237054824829, "step": 570, "step_time": 24.72128713602433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.16578571498394012, "epoch": 0.571, "frac_reward_zero_std": 0.0, "grad_norm": 1.0916157960891724, "kl": 0.017099041491746902, "learning_rate": 2.0570225210519433e-06, "loss": -0.2066, "num_tokens": 1600515.0, "reward": 0.4650000035762787, "reward_std": 0.6197042465209961, "rewards/reward_func/mean": 0.4650000035762787, "rewards/reward_func/std": 0.6197042465209961, "sampling/importance_sampling_ratio/max": 1.4480493068695068, "sampling/importance_sampling_ratio/mean": 0.8650913238525391, "sampling/importance_sampling_ratio/min": 0.49321138858795166, "sampling/sampling_logp_difference/max": 0.6714671850204468, "sampling/sampling_logp_difference/mean": 0.023404696956276894, "step": 571, "step_time": 20.396697135001887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 58.25, "completions/mean_terminated_length": 58.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.19837191700935364, "epoch": 0.572, "frac_reward_zero_std": 0.0, "grad_norm": 1.0660628080368042, "kl": 0.01980995200574398, "learning_rate": 2.0490560912508167e-06, "loss": -0.2105, "num_tokens": 1603811.0, "reward": 0.4699999988079071, "reward_std": 0.6139489412307739, "rewards/reward_func/mean": 0.4699999988079071, "rewards/reward_func/std": 0.6139489412307739, "sampling/importance_sampling_ratio/max": 1.367896318435669, "sampling/importance_sampling_ratio/mean": 0.7417945861816406, "sampling/importance_sampling_ratio/min": 0.3725450932979584, "sampling/sampling_logp_difference/max": 0.4173922538757324, "sampling/sampling_logp_difference/mean": 0.024606123566627502, "step": 572, "step_time": 33.77874282002449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1615472137928009, "epoch": 0.573, "frac_reward_zero_std": 0.0, "grad_norm": 1.644134521484375, "kl": 0.018680034205317497, "learning_rate": 2.04109439163791e-06, "loss": -0.5068, "num_tokens": 1606367.0, "reward": 0.18250000476837158, "reward_std": 0.5472583174705505, "rewards/reward_func/mean": 0.18250000476837158, "rewards/reward_func/std": 0.5472583174705505, "sampling/importance_sampling_ratio/max": 2.0408365726470947, "sampling/importance_sampling_ratio/mean": 1.2683742046356201, "sampling/importance_sampling_ratio/min": 0.5370249152183533, "sampling/sampling_logp_difference/max": 0.42431628704071045, "sampling/sampling_logp_difference/mean": 0.02046387642621994, "step": 573, "step_time": 24.692111626034603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.1776924580335617, "epoch": 0.574, "frac_reward_zero_std": 0.0, "grad_norm": 0.9342427849769592, "kl": 0.1494654268026352, "learning_rate": 2.0331375057276844e-06, "loss": -0.0557, "num_tokens": 1609038.0, "reward": 0.42250001430511475, "reward_std": 0.632369875907898, "rewards/reward_func/mean": 0.42250001430511475, "rewards/reward_func/std": 0.6323699355125427, "sampling/importance_sampling_ratio/max": 0.9660295248031616, "sampling/importance_sampling_ratio/mean": 0.6741440296173096, "sampling/importance_sampling_ratio/min": 0.43455618619918823, "sampling/sampling_logp_difference/max": 0.5720276832580566, "sampling/sampling_logp_difference/mean": 0.021102583035826683, "step": 574, "step_time": 37.63404021598399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 44.75, "completions/mean_terminated_length": 44.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.1711556613445282, "epoch": 0.575, "frac_reward_zero_std": 0.0, "grad_norm": 1.2140580415725708, "kl": 0.011556285433471203, "learning_rate": 2.025185516984108e-06, "loss": -0.1598, "num_tokens": 1611764.0, "reward": 0.7300000190734863, "reward_std": 0.5136795043945312, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5136795043945312, "sampling/importance_sampling_ratio/max": 1.7562966346740723, "sampling/importance_sampling_ratio/mean": 1.0274772644042969, "sampling/importance_sampling_ratio/min": 0.525954008102417, "sampling/sampling_logp_difference/max": 0.6746053695678711, "sampling/sampling_logp_difference/mean": 0.023900100961327553, "step": 575, "step_time": 23.891220649995375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.19333860278129578, "epoch": 0.576, "frac_reward_zero_std": 0.0, "grad_norm": 2.476130962371826, "kl": 0.014102641493082047, "learning_rate": 2.0172385088197804e-06, "loss": 0.3935, "num_tokens": 1614582.0, "reward": 0.7425000071525574, "reward_std": 0.5083552002906799, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.5083552002906799, "sampling/importance_sampling_ratio/max": 1.6876217126846313, "sampling/importance_sampling_ratio/mean": 1.1186442375183105, "sampling/importance_sampling_ratio/min": 0.8143072128295898, "sampling/sampling_logp_difference/max": 0.7093896865844727, "sampling/sampling_logp_difference/mean": 0.019945265725255013, "step": 576, "step_time": 25.426872402022127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.18721306324005127, "epoch": 0.577, "frac_reward_zero_std": 0.0, "grad_norm": 1.5255836248397827, "kl": 0.008960286155343056, "learning_rate": 2.0092965645950565e-06, "loss": -0.1193, "num_tokens": 1617056.0, "reward": 0.737500011920929, "reward_std": 0.5183547735214233, "rewards/reward_func/mean": 0.737500011920929, "rewards/reward_func/std": 0.5183547735214233, "sampling/importance_sampling_ratio/max": 1.4819788932800293, "sampling/importance_sampling_ratio/mean": 1.1694765090942383, "sampling/importance_sampling_ratio/min": 0.787470817565918, "sampling/sampling_logp_difference/max": 0.6177096366882324, "sampling/sampling_logp_difference/mean": 0.02337830886244774, "step": 577, "step_time": 18.628113362996373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.17364999651908875, "epoch": 0.578, "frac_reward_zero_std": 0.0, "grad_norm": 1.625807762145996, "kl": 0.00645641190931201, "learning_rate": 2.001359767617173e-06, "loss": -0.0531, "num_tokens": 1619852.0, "reward": 0.17000000178813934, "reward_std": 0.5469308495521545, "rewards/reward_func/mean": 0.17000000178813934, "rewards/reward_func/std": 0.5469308495521545, "sampling/importance_sampling_ratio/max": 1.6043492555618286, "sampling/importance_sampling_ratio/mean": 1.288439154624939, "sampling/importance_sampling_ratio/min": 0.7983236312866211, "sampling/sampling_logp_difference/max": 0.47069698572158813, "sampling/sampling_logp_difference/mean": 0.015227319672703743, "step": 578, "step_time": 31.546281149960123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.16249112784862518, "epoch": 0.579, "frac_reward_zero_std": 0.0, "grad_norm": 0.855338454246521, "kl": 0.014340748079121113, "learning_rate": 1.993428201139375e-06, "loss": 0.1614, "num_tokens": 1623202.0, "reward": 0.49000000953674316, "reward_std": 0.5891236066818237, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5891236066818237, "sampling/importance_sampling_ratio/max": 1.1258803606033325, "sampling/importance_sampling_ratio/mean": 0.5905387997627258, "sampling/importance_sampling_ratio/min": 0.3119637966156006, "sampling/sampling_logp_difference/max": 0.5721312761306763, "sampling/sampling_logp_difference/mean": 0.02034752443432808, "step": 579, "step_time": 27.761928386986256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.16493532061576843, "epoch": 0.58, "frac_reward_zero_std": 0.0, "grad_norm": 0.66352778673172, "kl": 0.015348327346146107, "learning_rate": 1.9855019483600413e-06, "loss": -0.0229, "num_tokens": 1626081.0, "reward": 0.9850000143051147, "reward_std": 0.02999999187886715, "rewards/reward_func/mean": 0.9850000143051147, "rewards/reward_func/std": 0.030000001192092896, "sampling/importance_sampling_ratio/max": 1.0244231224060059, "sampling/importance_sampling_ratio/mean": 0.7888048887252808, "sampling/importance_sampling_ratio/min": 0.47145190834999084, "sampling/sampling_logp_difference/max": 0.45228952169418335, "sampling/sampling_logp_difference/mean": 0.019240688532590866, "step": 580, "step_time": 22.822745835001115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.17383331060409546, "epoch": 0.581, "frac_reward_zero_std": 0.0, "grad_norm": 1.0017318725585938, "kl": 0.023910004645586014, "learning_rate": 1.9775810924218126e-06, "loss": -0.126, "num_tokens": 1628684.0, "reward": -0.044999998062849045, "reward_std": 0.05196152627468109, "rewards/reward_func/mean": -0.044999998062849045, "rewards/reward_func/std": 0.05196152254939079, "sampling/importance_sampling_ratio/max": 2.389702796936035, "sampling/importance_sampling_ratio/mean": 1.265378713607788, "sampling/importance_sampling_ratio/min": 0.5106785297393799, "sampling/sampling_logp_difference/max": 0.36163783073425293, "sampling/sampling_logp_difference/mean": 0.019938837736845016, "step": 581, "step_time": 29.458055525959935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1399996280670166, "epoch": 0.582, "frac_reward_zero_std": 0.0, "grad_norm": 0.6925249695777893, "kl": 0.011698340997099876, "learning_rate": 1.96966571641072e-06, "loss": -0.0738, "num_tokens": 1631345.0, "reward": 0.7300000190734863, "reward_std": 0.5400000214576721, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5399999618530273, "sampling/importance_sampling_ratio/max": 0.9766188859939575, "sampling/importance_sampling_ratio/mean": 0.7278833985328674, "sampling/importance_sampling_ratio/min": 0.6173131465911865, "sampling/sampling_logp_difference/max": 0.44561052322387695, "sampling/sampling_logp_difference/mean": 0.014748035930097103, "step": 582, "step_time": 19.465083101997152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.19408434629440308, "epoch": 0.583, "frac_reward_zero_std": 0.0, "grad_norm": 1.1201423406600952, "kl": 0.01857304945588112, "learning_rate": 1.9617559033553128e-06, "loss": 0.2215, "num_tokens": 1634311.0, "reward": 0.23750001192092896, "reward_std": 0.5018216967582703, "rewards/reward_func/mean": 0.23750001192092896, "rewards/reward_func/std": 0.5018216967582703, "sampling/importance_sampling_ratio/max": 1.6090428829193115, "sampling/importance_sampling_ratio/mean": 0.7992262840270996, "sampling/importance_sampling_ratio/min": 0.3101765513420105, "sampling/sampling_logp_difference/max": 0.5018956661224365, "sampling/sampling_logp_difference/mean": 0.028043806552886963, "step": 583, "step_time": 34.52744635200361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.19991213083267212, "epoch": 0.584, "frac_reward_zero_std": 0.0, "grad_norm": 0.630154550075531, "kl": 0.021365532651543617, "learning_rate": 1.953851736225787e-06, "loss": 0.175, "num_tokens": 1637059.0, "reward": 0.7300000190734863, "reward_std": 0.5400000214576721, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5399999618530273, "sampling/importance_sampling_ratio/max": 0.71980220079422, "sampling/importance_sampling_ratio/mean": 0.43649959564208984, "sampling/importance_sampling_ratio/min": 0.1439371407032013, "sampling/sampling_logp_difference/max": 0.9585878849029541, "sampling/sampling_logp_difference/mean": 0.025984574109315872, "step": 584, "step_time": 18.098591865040362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 58.75, "completions/mean_terminated_length": 58.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.19742368161678314, "epoch": 0.585, "frac_reward_zero_std": 0.0, "grad_norm": 0.6319071054458618, "kl": 0.01630057953298092, "learning_rate": 1.945953297933115e-06, "loss": 0.1864, "num_tokens": 1639288.0, "reward": 0.42250001430511475, "reward_std": 0.6327913999557495, "rewards/reward_func/mean": 0.42250001430511475, "rewards/reward_func/std": 0.6327914595603943, "sampling/importance_sampling_ratio/max": 0.7741903066635132, "sampling/importance_sampling_ratio/mean": 0.5842657089233398, "sampling/importance_sampling_ratio/min": 0.2320796549320221, "sampling/sampling_logp_difference/max": 0.39844298362731934, "sampling/sampling_logp_difference/mean": 0.021566934883594513, "step": 585, "step_time": 24.720460382988676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 60.5, "completions/mean_terminated_length": 60.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.18527953326702118, "epoch": 0.586, "frac_reward_zero_std": 0.0, "grad_norm": 0.9200698733329773, "kl": 0.016404125839471817, "learning_rate": 1.9380606713281773e-06, "loss": 0.3698, "num_tokens": 1642225.0, "reward": 0.4724999964237213, "reward_std": 0.5865932703018188, "rewards/reward_func/mean": 0.4724999964237213, "rewards/reward_func/std": 0.5865932703018188, "sampling/importance_sampling_ratio/max": 1.3946725130081177, "sampling/importance_sampling_ratio/mean": 0.9849956035614014, "sampling/importance_sampling_ratio/min": 0.4897424578666687, "sampling/sampling_logp_difference/max": 0.6236510276794434, "sampling/sampling_logp_difference/mean": 0.021024227142333984, "step": 586, "step_time": 33.771775234956294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.19586940109729767, "epoch": 0.587, "frac_reward_zero_std": 0.0, "grad_norm": 1.336060881614685, "kl": 0.0107067059725523, "learning_rate": 1.9301739392008923e-06, "loss": 0.1619, "num_tokens": 1645063.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 2.8116209506988525, "sampling/importance_sampling_ratio/mean": 1.5140750408172607, "sampling/importance_sampling_ratio/min": 0.5542681813240051, "sampling/sampling_logp_difference/max": 0.6473186016082764, "sampling/sampling_logp_difference/mean": 0.02368604578077793, "step": 587, "step_time": 13.073956284031738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.14482243359088898, "epoch": 0.588, "frac_reward_zero_std": 0.0, "grad_norm": 0.7719702124595642, "kl": 0.018998006358742714, "learning_rate": 1.9222931842793473e-06, "loss": -0.087, "num_tokens": 1648369.0, "reward": 0.48750001192092896, "reward_std": 0.5860247611999512, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5860247015953064, "sampling/importance_sampling_ratio/max": 1.0106639862060547, "sampling/importance_sampling_ratio/mean": 0.5729196667671204, "sampling/importance_sampling_ratio/min": 0.312263160943985, "sampling/sampling_logp_difference/max": 0.676027774810791, "sampling/sampling_logp_difference/mean": 0.025145273655653, "step": 588, "step_time": 28.816294016025495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.2115717977285385, "epoch": 0.589, "frac_reward_zero_std": 0.0, "grad_norm": 1.1957529783248901, "kl": 0.0218299999833107, "learning_rate": 1.914418489228934e-06, "loss": 0.1514, "num_tokens": 1651100.0, "reward": 0.42500001192092896, "reward_std": 0.630634605884552, "rewards/reward_func/mean": 0.42500001192092896, "rewards/reward_func/std": 0.630634605884552, "sampling/importance_sampling_ratio/max": 1.4291170835494995, "sampling/importance_sampling_ratio/mean": 0.789110004901886, "sampling/importance_sampling_ratio/min": 0.0659986138343811, "sampling/sampling_logp_difference/max": 0.9352502822875977, "sampling/sampling_logp_difference/mean": 0.02863622084259987, "step": 589, "step_time": 25.840631875034887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.1859912872314453, "epoch": 0.59, "frac_reward_zero_std": 0.0, "grad_norm": 1.1814531087875366, "kl": 0.013163971714675426, "learning_rate": 1.9065499366514759e-06, "loss": 0.2143, "num_tokens": 1653462.0, "reward": 0.46000000834465027, "reward_std": 0.6235917806625366, "rewards/reward_func/mean": 0.46000000834465027, "rewards/reward_func/std": 0.6235917210578918, "sampling/importance_sampling_ratio/max": 1.8124785423278809, "sampling/importance_sampling_ratio/mean": 0.9648075103759766, "sampling/importance_sampling_ratio/min": 0.3617645800113678, "sampling/sampling_logp_difference/max": 0.45198607444763184, "sampling/sampling_logp_difference/mean": 0.02289563976228237, "step": 590, "step_time": 22.446324992983136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 59.75, "completions/mean_terminated_length": 59.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.1949092149734497, "epoch": 0.591, "frac_reward_zero_std": 0.0, "grad_norm": 1.0955817699432373, "kl": 0.020752057433128357, "learning_rate": 1.8986876090843668e-06, "loss": -0.1271, "num_tokens": 1656211.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 1.8768986463546753, "sampling/importance_sampling_ratio/mean": 1.0667930841445923, "sampling/importance_sampling_ratio/min": 0.6090972423553467, "sampling/sampling_logp_difference/max": 0.5111887454986572, "sampling/sampling_logp_difference/mean": 0.020936936140060425, "step": 591, "step_time": 22.87828828796046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.15343612432479858, "epoch": 0.592, "frac_reward_zero_std": 0.0, "grad_norm": 0.6913102269172668, "kl": 0.01498109009116888, "learning_rate": 1.8908315889997009e-06, "loss": 0.0334, "num_tokens": 1659040.0, "reward": 0.4749999940395355, "reward_std": 0.6062726974487305, "rewards/reward_func/mean": 0.4749999940395355, "rewards/reward_func/std": 0.6062727570533752, "sampling/importance_sampling_ratio/max": 0.7413890361785889, "sampling/importance_sampling_ratio/mean": 0.6881894469261169, "sampling/importance_sampling_ratio/min": 0.618586003780365, "sampling/sampling_logp_difference/max": 0.392078161239624, "sampling/sampling_logp_difference/mean": 0.017242174595594406, "step": 592, "step_time": 19.608906787005253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.19732992351055145, "epoch": 0.593, "frac_reward_zero_std": 0.0, "grad_norm": 1.0058780908584595, "kl": 0.0164633858948946, "learning_rate": 1.8829819588034143e-06, "loss": -0.109, "num_tokens": 1662040.0, "reward": 0.7000000476837158, "reward_std": 0.5736433267593384, "rewards/reward_func/mean": 0.7000000476837158, "rewards/reward_func/std": 0.5736433267593384, "sampling/importance_sampling_ratio/max": 1.3050215244293213, "sampling/importance_sampling_ratio/mean": 0.8046103715896606, "sampling/importance_sampling_ratio/min": 0.5838481783866882, "sampling/sampling_logp_difference/max": 0.5241312980651855, "sampling/sampling_logp_difference/mean": 0.019689172506332397, "step": 593, "step_time": 27.828738048963714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 61.0, "completions/mean_terminated_length": 61.0, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.18106843531131744, "epoch": 0.594, "frac_reward_zero_std": 0.0, "grad_norm": 0.9014976024627686, "kl": 0.00932732317596674, "learning_rate": 1.8751388008344117e-06, "loss": 0.1213, "num_tokens": 1664850.0, "reward": 0.19499999284744263, "reward_std": 0.5369978547096252, "rewards/reward_func/mean": 0.19499999284744263, "rewards/reward_func/std": 0.5369978547096252, "sampling/importance_sampling_ratio/max": 1.3304373025894165, "sampling/importance_sampling_ratio/mean": 0.9427988529205322, "sampling/importance_sampling_ratio/min": 0.5334572196006775, "sampling/sampling_logp_difference/max": 0.32513856887817383, "sampling/sampling_logp_difference/mean": 0.018368016928434372, "step": 594, "step_time": 24.162014703033492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.17777368426322937, "epoch": 0.595, "frac_reward_zero_std": 0.0, "grad_norm": 1.1153016090393066, "kl": 0.01979278028011322, "learning_rate": 1.8673021973637095e-06, "loss": -0.1784, "num_tokens": 1667657.0, "reward": 0.47749999165534973, "reward_std": 0.6044487953186035, "rewards/reward_func/mean": 0.47749999165534973, "rewards/reward_func/std": 0.6044487953186035, "sampling/importance_sampling_ratio/max": 1.5729310512542725, "sampling/importance_sampling_ratio/mean": 0.9825156331062317, "sampling/importance_sampling_ratio/min": 0.18744976818561554, "sampling/sampling_logp_difference/max": 0.5443459749221802, "sampling/sampling_logp_difference/mean": 0.022275365889072418, "step": 595, "step_time": 29.35185990802711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.14938239753246307, "epoch": 0.596, "frac_reward_zero_std": 0.0, "grad_norm": 1.1242605447769165, "kl": 0.015342378057539463, "learning_rate": 1.8594722305935691e-06, "loss": -0.0693, "num_tokens": 1670525.0, "reward": 0.17499999701976776, "reward_std": 0.5518755912780762, "rewards/reward_func/mean": 0.17499999701976776, "rewards/reward_func/std": 0.5518755912780762, "sampling/importance_sampling_ratio/max": 1.4106215238571167, "sampling/importance_sampling_ratio/mean": 1.279104471206665, "sampling/importance_sampling_ratio/min": 1.156643033027649, "sampling/sampling_logp_difference/max": 0.2774249315261841, "sampling/sampling_logp_difference/mean": 0.014182811602950096, "step": 596, "step_time": 32.65863726002863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.17952626943588257, "epoch": 0.597, "frac_reward_zero_std": 0.0, "grad_norm": 0.7371413707733154, "kl": 0.010804688557982445, "learning_rate": 1.8516489826566375e-06, "loss": -0.2337, "num_tokens": 1673262.0, "reward": 0.45499998331069946, "reward_std": 0.6236719489097595, "rewards/reward_func/mean": 0.45499998331069946, "rewards/reward_func/std": 0.6236718893051147, "sampling/importance_sampling_ratio/max": 1.470967411994934, "sampling/importance_sampling_ratio/mean": 0.8384777903556824, "sampling/importance_sampling_ratio/min": 0.5241918563842773, "sampling/sampling_logp_difference/max": 0.5816419720649719, "sampling/sampling_logp_difference/mean": 0.02132377028465271, "step": 597, "step_time": 23.880131153971888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 59.25, "completions/mean_terminated_length": 59.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.18995465338230133, "epoch": 0.598, "frac_reward_zero_std": 0.0, "grad_norm": 1.5209081172943115, "kl": 0.007019128184765577, "learning_rate": 1.8438325356150827e-06, "loss": 0.109, "num_tokens": 1675909.0, "reward": 0.6875, "reward_std": 0.5530144572257996, "rewards/reward_func/mean": 0.6875, "rewards/reward_func/std": 0.5530144572257996, "sampling/importance_sampling_ratio/max": 1.6807653903961182, "sampling/importance_sampling_ratio/mean": 1.2256295680999756, "sampling/importance_sampling_ratio/min": 0.8899608254432678, "sampling/sampling_logp_difference/max": 0.22742819786071777, "sampling/sampling_logp_difference/mean": 0.015189962461590767, "step": 598, "step_time": 29.437737072992604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.16451837122440338, "epoch": 0.599, "frac_reward_zero_std": 0.0, "grad_norm": 1.023432731628418, "kl": 0.14609260857105255, "learning_rate": 1.8360229714597372e-06, "loss": 0.4399, "num_tokens": 1678628.0, "reward": 0.7400000095367432, "reward_std": 0.5199999809265137, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5200000405311584, "sampling/importance_sampling_ratio/max": 1.5721912384033203, "sampling/importance_sampling_ratio/mean": 0.7656333446502686, "sampling/importance_sampling_ratio/min": 0.2525116503238678, "sampling/sampling_logp_difference/max": 1.0618338584899902, "sampling/sampling_logp_difference/mean": 0.02548466995358467, "step": 599, "step_time": 17.41107401397312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.15857627987861633, "epoch": 0.6, "frac_reward_zero_std": 0.0, "grad_norm": 1.2168829441070557, "kl": 0.05447656288743019, "learning_rate": 1.828220372109232e-06, "loss": -0.0058, "num_tokens": 1681469.0, "reward": 0.4724999964237213, "reward_std": 0.5865932703018188, "rewards/reward_func/mean": 0.4724999964237213, "rewards/reward_func/std": 0.5865932703018188, "sampling/importance_sampling_ratio/max": 1.4920563697814941, "sampling/importance_sampling_ratio/mean": 1.0213264226913452, "sampling/importance_sampling_ratio/min": 0.4725002348423004, "sampling/sampling_logp_difference/max": 0.2785921096801758, "sampling/sampling_logp_difference/mean": 0.014490817673504353, "step": 600, "step_time": 30.039025848032907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.2049519568681717, "epoch": 0.601, "frac_reward_zero_std": 0.0, "grad_norm": 1.173938274383545, "kl": 0.013983277603983879, "learning_rate": 1.8204248194091429e-06, "loss": -0.2237, "num_tokens": 1684103.0, "reward": 0.23250000178813934, "reward_std": 0.5054618120193481, "rewards/reward_func/mean": 0.23250000178813934, "rewards/reward_func/std": 0.5054618716239929, "sampling/importance_sampling_ratio/max": 1.2881920337677002, "sampling/importance_sampling_ratio/mean": 0.965518593788147, "sampling/importance_sampling_ratio/min": 0.46143829822540283, "sampling/sampling_logp_difference/max": 0.3543124198913574, "sampling/sampling_logp_difference/mean": 0.018465539440512657, "step": 601, "step_time": 26.462812028999906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1853489726781845, "epoch": 0.602, "frac_reward_zero_std": 0.0, "grad_norm": 1.0446908473968506, "kl": 0.009114722721278667, "learning_rate": 1.8126363951311288e-06, "loss": -0.197, "num_tokens": 1686986.0, "reward": 0.47749999165534973, "reward_std": 0.6044487953186035, "rewards/reward_func/mean": 0.47749999165534973, "rewards/reward_func/std": 0.6044487953186035, "sampling/importance_sampling_ratio/max": 1.6912254095077515, "sampling/importance_sampling_ratio/mean": 0.9279754161834717, "sampling/importance_sampling_ratio/min": 0.4393068850040436, "sampling/sampling_logp_difference/max": 0.3386092185974121, "sampling/sampling_logp_difference/mean": 0.017108067870140076, "step": 602, "step_time": 24.53866340604145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1854662150144577, "epoch": 0.603, "frac_reward_zero_std": 0.0, "grad_norm": 1.105086088180542, "kl": 0.007829285226762295, "learning_rate": 1.8048551809720752e-06, "loss": 0.1776, "num_tokens": 1689477.0, "reward": 0.47749999165534973, "reward_std": 0.6044487953186035, "rewards/reward_func/mean": 0.47749999165534973, "rewards/reward_func/std": 0.6044487953186035, "sampling/importance_sampling_ratio/max": 1.1140719652175903, "sampling/importance_sampling_ratio/mean": 0.899004340171814, "sampling/importance_sampling_ratio/min": 0.5815832018852234, "sampling/sampling_logp_difference/max": 0.30672919750213623, "sampling/sampling_logp_difference/mean": 0.019260572269558907, "step": 603, "step_time": 21.417619167943485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.18471243977546692, "epoch": 0.604, "frac_reward_zero_std": 0.0, "grad_norm": 0.9076993465423584, "kl": 0.0099685899913311, "learning_rate": 1.797081258553236e-06, "loss": 0.1323, "num_tokens": 1691882.0, "reward": 0.4625000059604645, "reward_std": 0.6157583594322205, "rewards/reward_func/mean": 0.4625000059604645, "rewards/reward_func/std": 0.6157583594322205, "sampling/importance_sampling_ratio/max": 0.9357434511184692, "sampling/importance_sampling_ratio/mean": 0.8136001825332642, "sampling/importance_sampling_ratio/min": 0.4781160354614258, "sampling/sampling_logp_difference/max": 0.4795507788658142, "sampling/sampling_logp_difference/mean": 0.021835872903466225, "step": 604, "step_time": 22.944665168004576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.16018766164779663, "epoch": 0.605, "frac_reward_zero_std": 0.0, "grad_norm": 1.521013855934143, "kl": 0.010981081053614616, "learning_rate": 1.7893147094193786e-06, "loss": -0.0166, "num_tokens": 1694951.0, "reward": 0.4975000023841858, "reward_std": 0.5802513957023621, "rewards/reward_func/mean": 0.4975000023841858, "rewards/reward_func/std": 0.5802513957023621, "sampling/importance_sampling_ratio/max": 1.9275833368301392, "sampling/importance_sampling_ratio/mean": 1.327082633972168, "sampling/importance_sampling_ratio/min": 0.6004852056503296, "sampling/sampling_logp_difference/max": 0.419214129447937, "sampling/sampling_logp_difference/mean": 0.013401848264038563, "step": 605, "step_time": 25.95119515201077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.15139813721179962, "epoch": 0.606, "frac_reward_zero_std": 0.0, "grad_norm": 0.6222726106643677, "kl": 0.00893695279955864, "learning_rate": 1.7815556150379298e-06, "loss": -0.1969, "num_tokens": 1697889.0, "reward": 0.7475000023841858, "reward_std": 0.4983556270599365, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.4983556270599365, "sampling/importance_sampling_ratio/max": 1.323899507522583, "sampling/importance_sampling_ratio/mean": 0.8617182970046997, "sampling/importance_sampling_ratio/min": 0.4974691569805145, "sampling/sampling_logp_difference/max": 0.8163425922393799, "sampling/sampling_logp_difference/mean": 0.020332610234618187, "step": 606, "step_time": 18.556259293982293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.1512926071882248, "epoch": 0.607, "frac_reward_zero_std": 0.0, "grad_norm": 0.9089526534080505, "kl": 0.007725096307694912, "learning_rate": 1.7738040567981168e-06, "loss": -0.0871, "num_tokens": 1700661.0, "reward": 0.4625000059604645, "reward_std": 0.6048347353935242, "rewards/reward_func/mean": 0.4625000059604645, "rewards/reward_func/std": 0.6048347353935242, "sampling/importance_sampling_ratio/max": 1.14837646484375, "sampling/importance_sampling_ratio/mean": 0.9830376505851746, "sampling/importance_sampling_ratio/min": 0.8003469109535217, "sampling/sampling_logp_difference/max": 0.3560880422592163, "sampling/sampling_logp_difference/mean": 0.013200397603213787, "step": 607, "step_time": 28.59869863698259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 74.0, "completions/max_terminated_length": 74.0, "completions/mean_length": 61.5, "completions/mean_terminated_length": 61.5, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.14027364552021027, "epoch": 0.608, "frac_reward_zero_std": 0.0, "grad_norm": 1.3680777549743652, "kl": 0.00956844724714756, "learning_rate": 1.766060116010118e-06, "loss": -0.2955, "num_tokens": 1703058.0, "reward": 0.4675000011920929, "reward_std": 0.6148915886878967, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.6148915886878967, "sampling/importance_sampling_ratio/max": 2.8477611541748047, "sampling/importance_sampling_ratio/mean": 1.18727707862854, "sampling/importance_sampling_ratio/min": 0.4462479054927826, "sampling/sampling_logp_difference/max": 0.5033063888549805, "sampling/sampling_logp_difference/mean": 0.016581332311034203, "step": 608, "step_time": 18.61690143001033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.16553883254528046, "epoch": 0.609, "frac_reward_zero_std": 0.0, "grad_norm": 1.0794106721878052, "kl": 0.007191778626292944, "learning_rate": 1.7583238739042086e-06, "loss": 0.1176, "num_tokens": 1705576.0, "reward": 0.45750001072883606, "reward_std": 0.6105940341949463, "rewards/reward_func/mean": 0.45750001072883606, "rewards/reward_func/std": 0.6105939745903015, "sampling/importance_sampling_ratio/max": 1.1590137481689453, "sampling/importance_sampling_ratio/mean": 0.8807634115219116, "sampling/importance_sampling_ratio/min": 0.5563974976539612, "sampling/sampling_logp_difference/max": 0.30944275856018066, "sampling/sampling_logp_difference/mean": 0.014899509027600288, "step": 609, "step_time": 22.78535911202198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.1535315066576004, "epoch": 0.61, "frac_reward_zero_std": 0.0, "grad_norm": 1.4391672611236572, "kl": 0.03223855420947075, "learning_rate": 1.7505954116299062e-06, "loss": 0.1651, "num_tokens": 1708818.0, "reward": 0.49000000953674316, "reward_std": 0.5891236066818237, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5891236066818237, "sampling/importance_sampling_ratio/max": 1.5282551050186157, "sampling/importance_sampling_ratio/mean": 1.0372858047485352, "sampling/importance_sampling_ratio/min": 0.2889481484889984, "sampling/sampling_logp_difference/max": 0.572127103805542, "sampling/sampling_logp_difference/mean": 0.02021615393459797, "step": 610, "step_time": 30.98839571402641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 59.25, "completions/mean_terminated_length": 59.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.19234591722488403, "epoch": 0.611, "frac_reward_zero_std": 0.0, "grad_norm": 1.294937014579773, "kl": 0.016634028404951096, "learning_rate": 1.7428748102551237e-06, "loss": -0.303, "num_tokens": 1711276.0, "reward": 0.7425000071525574, "reward_std": 0.5017552375793457, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.5017552375793457, "sampling/importance_sampling_ratio/max": 2.7839548587799072, "sampling/importance_sampling_ratio/mean": 1.3824663162231445, "sampling/importance_sampling_ratio/min": 0.4395228326320648, "sampling/sampling_logp_difference/max": 0.5747545957565308, "sampling/sampling_logp_difference/mean": 0.02406739816069603, "step": 611, "step_time": 14.359794297022745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.16999617218971252, "epoch": 0.612, "frac_reward_zero_std": 0.0, "grad_norm": 2.833671808242798, "kl": 0.0250686202198267, "learning_rate": 1.7351621507653157e-06, "loss": 0.2797, "num_tokens": 1714516.0, "reward": 0.7325000166893005, "reward_std": 0.5350000262260437, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5349999666213989, "sampling/importance_sampling_ratio/max": 2.0819358825683594, "sampling/importance_sampling_ratio/mean": 1.1299160718917847, "sampling/importance_sampling_ratio/min": 0.24995754659175873, "sampling/sampling_logp_difference/max": 1.0179526805877686, "sampling/sampling_logp_difference/mean": 0.03260910138487816, "step": 612, "step_time": 24.082860004971735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.1484750509262085, "epoch": 0.613, "frac_reward_zero_std": 0.0, "grad_norm": 1.1963163614273071, "kl": 0.0441855862736702, "learning_rate": 1.7274575140626318e-06, "loss": -0.1002, "num_tokens": 1718413.0, "reward": 0.24000000953674316, "reward_std": 0.4933558702468872, "rewards/reward_func/mean": 0.24000000953674316, "rewards/reward_func/std": 0.4933558702468872, "sampling/importance_sampling_ratio/max": 0.8912676572799683, "sampling/importance_sampling_ratio/mean": 0.6959623098373413, "sampling/importance_sampling_ratio/min": 0.39840662479400635, "sampling/sampling_logp_difference/max": 0.5747554302215576, "sampling/sampling_logp_difference/mean": 0.026096556335687637, "step": 613, "step_time": 43.97198292700341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.1514982283115387, "epoch": 0.614, "frac_reward_zero_std": 0.0, "grad_norm": 1.0199906826019287, "kl": 0.014400538057088852, "learning_rate": 1.7197609809650644e-06, "loss": -0.0998, "num_tokens": 1721719.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 2.079946517944336, "sampling/importance_sampling_ratio/mean": 1.0082184076309204, "sampling/importance_sampling_ratio/min": 0.5153579711914062, "sampling/sampling_logp_difference/max": 0.9355231523513794, "sampling/sampling_logp_difference/mean": 0.01996752619743347, "step": 614, "step_time": 22.822635727003217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.20047390460968018, "epoch": 0.615, "frac_reward_zero_std": 0.0, "grad_norm": 1.229144811630249, "kl": 0.012390729039907455, "learning_rate": 1.7120726322056042e-06, "loss": 0.4014, "num_tokens": 1724200.0, "reward": 0.49000000953674316, "reward_std": 0.5831523537635803, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5831523537635803, "sampling/importance_sampling_ratio/max": 2.113356828689575, "sampling/importance_sampling_ratio/mean": 1.0747568607330322, "sampling/importance_sampling_ratio/min": 0.5301244258880615, "sampling/sampling_logp_difference/max": 0.4836369752883911, "sampling/sampling_logp_difference/mean": 0.019983848556876183, "step": 615, "step_time": 24.27560904499842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.20185032486915588, "epoch": 0.616, "frac_reward_zero_std": 0.0, "grad_norm": 1.7269939184188843, "kl": 0.009226170368492603, "learning_rate": 1.7043925484313911e-06, "loss": 0.0598, "num_tokens": 1727018.0, "reward": 0.7425000071525574, "reward_std": 0.5083552002906799, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.5083552002906799, "sampling/importance_sampling_ratio/max": 2.852564811706543, "sampling/importance_sampling_ratio/mean": 1.4245896339416504, "sampling/importance_sampling_ratio/min": 0.6945921182632446, "sampling/sampling_logp_difference/max": 0.3470289707183838, "sampling/sampling_logp_difference/mean": 0.018863201141357422, "step": 616, "step_time": 21.071833376016002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 60.75, "completions/mean_terminated_length": 60.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.19654488563537598, "epoch": 0.617, "frac_reward_zero_std": 0.0, "grad_norm": 0.6523365378379822, "kl": 0.012686912901699543, "learning_rate": 1.6967208102028698e-06, "loss": -0.0675, "num_tokens": 1729325.0, "reward": 0.7049999833106995, "reward_std": 0.5443344712257385, "rewards/reward_func/mean": 0.7049999833106995, "rewards/reward_func/std": 0.5443344712257385, "sampling/importance_sampling_ratio/max": 1.3317081928253174, "sampling/importance_sampling_ratio/mean": 0.7678804397583008, "sampling/importance_sampling_ratio/min": 0.4516746699810028, "sampling/sampling_logp_difference/max": 0.46208614110946655, "sampling/sampling_logp_difference/mean": 0.01937464065849781, "step": 617, "step_time": 17.772947912977543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.23392610251903534, "epoch": 0.618, "frac_reward_zero_std": 0.0, "grad_norm": 2.0327062606811523, "kl": 0.010529535822570324, "learning_rate": 1.6890574979929447e-06, "loss": 0.3228, "num_tokens": 1731944.0, "reward": 0.47749999165534973, "reward_std": 0.6044487953186035, "rewards/reward_func/mean": 0.47749999165534973, "rewards/reward_func/std": 0.6044487953186035, "sampling/importance_sampling_ratio/max": 2.015913248062134, "sampling/importance_sampling_ratio/mean": 1.030210256576538, "sampling/importance_sampling_ratio/min": 0.560074508190155, "sampling/sampling_logp_difference/max": 1.053152322769165, "sampling/sampling_logp_difference/mean": 0.0350525826215744, "step": 618, "step_time": 29.9040975740063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.1547178030014038, "epoch": 0.619, "frac_reward_zero_std": 0.0, "grad_norm": 1.3999649286270142, "kl": 0.019614914432168007, "learning_rate": 1.6814026921861337e-06, "loss": -0.1002, "num_tokens": 1734585.0, "reward": 0.7275000214576721, "reward_std": 0.4926374852657318, "rewards/reward_func/mean": 0.7275000214576721, "rewards/reward_func/std": 0.4926374852657318, "sampling/importance_sampling_ratio/max": 1.3106412887573242, "sampling/importance_sampling_ratio/mean": 0.9512451887130737, "sampling/importance_sampling_ratio/min": 0.6110233664512634, "sampling/sampling_logp_difference/max": 0.5037369728088379, "sampling/sampling_logp_difference/mean": 0.01598307676613331, "step": 619, "step_time": 19.045389618026093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1751425713300705, "epoch": 0.62, "frac_reward_zero_std": 0.0, "grad_norm": 0.7295264005661011, "kl": 0.011225121095776558, "learning_rate": 1.6737564730777283e-06, "loss": -0.3366, "num_tokens": 1737227.0, "reward": 0.7400000095367432, "reward_std": 0.5001999735832214, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5001999735832214, "sampling/importance_sampling_ratio/max": 1.2189929485321045, "sampling/importance_sampling_ratio/mean": 0.8347501754760742, "sampling/importance_sampling_ratio/min": 0.24944087862968445, "sampling/sampling_logp_difference/max": 1.0151824951171875, "sampling/sampling_logp_difference/mean": 0.021678216755390167, "step": 620, "step_time": 16.819252650951967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 58.25, "completions/mean_terminated_length": 58.25, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.19474118947982788, "epoch": 0.621, "frac_reward_zero_std": 0.0, "grad_norm": 1.7841765880584717, "kl": 0.014002404175698757, "learning_rate": 1.6661189208729492e-06, "loss": -0.2837, "num_tokens": 1739711.0, "reward": 0.47749999165534973, "reward_std": 0.6044487953186035, "rewards/reward_func/mean": 0.47749999165534973, "rewards/reward_func/std": 0.6044487953186035, "sampling/importance_sampling_ratio/max": 2.476973533630371, "sampling/importance_sampling_ratio/mean": 1.5411036014556885, "sampling/importance_sampling_ratio/min": 0.5951814651489258, "sampling/sampling_logp_difference/max": 0.564987063407898, "sampling/sampling_logp_difference/mean": 0.018215788528323174, "step": 621, "step_time": 27.346593428985216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1748156100511551, "epoch": 0.622, "frac_reward_zero_std": 0.0, "grad_norm": 0.7917118072509766, "kl": 0.004283021669834852, "learning_rate": 1.6584901156861038e-06, "loss": 0.1524, "num_tokens": 1742135.0, "reward": 0.17500001192092896, "reward_std": 0.5531425476074219, "rewards/reward_func/mean": 0.17500001192092896, "rewards/reward_func/std": 0.5531425476074219, "sampling/importance_sampling_ratio/max": 1.7983412742614746, "sampling/importance_sampling_ratio/mean": 1.3537887334823608, "sampling/importance_sampling_ratio/min": 0.9527789950370789, "sampling/sampling_logp_difference/max": 0.35057902336120605, "sampling/sampling_logp_difference/mean": 0.012552245520055294, "step": 622, "step_time": 28.052379538014065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.16706061363220215, "epoch": 0.623, "frac_reward_zero_std": 0.0, "grad_norm": 1.6962339878082275, "kl": 0.013952253386378288, "learning_rate": 1.6508701375397488e-06, "loss": 0.2434, "num_tokens": 1744380.0, "reward": 0.7174999713897705, "reward_std": 0.5649999976158142, "rewards/reward_func/mean": 0.7174999713897705, "rewards/reward_func/std": 0.5649999976158142, "sampling/importance_sampling_ratio/max": 1.278971552848816, "sampling/importance_sampling_ratio/mean": 0.9865517616271973, "sampling/importance_sampling_ratio/min": 0.5540949106216431, "sampling/sampling_logp_difference/max": 0.3299221992492676, "sampling/sampling_logp_difference/mean": 0.017239151522517204, "step": 623, "step_time": 15.752895593002904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.16032390296459198, "epoch": 0.624, "frac_reward_zero_std": 0.0, "grad_norm": 0.8977182507514954, "kl": 0.028268292546272278, "learning_rate": 1.6432590663638504e-06, "loss": -0.2438, "num_tokens": 1747448.0, "reward": 0.7450000047683716, "reward_std": 0.5099999904632568, "rewards/reward_func/mean": 0.7450000047683716, "rewards/reward_func/std": 0.5099999904632568, "sampling/importance_sampling_ratio/max": 2.308405637741089, "sampling/importance_sampling_ratio/mean": 0.8999727964401245, "sampling/importance_sampling_ratio/min": 0.22045670449733734, "sampling/sampling_logp_difference/max": 0.5744023323059082, "sampling/sampling_logp_difference/mean": 0.024297351017594337, "step": 624, "step_time": 23.286001041997224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.17862281203269958, "epoch": 0.625, "frac_reward_zero_std": 1.0, "grad_norm": 0.008277460001409054, "kl": 0.008649054914712906, "learning_rate": 1.635656981994943e-06, "loss": 0.0001, "num_tokens": 1749721.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4836041927337646, "sampling/importance_sampling_ratio/mean": 1.074285864830017, "sampling/importance_sampling_ratio/min": 0.6899040341377258, "sampling/sampling_logp_difference/max": 0.31301069259643555, "sampling/sampling_logp_difference/mean": 0.01357162743806839, "step": 625, "step_time": 13.827146177005488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.2573089897632599, "epoch": 0.626, "frac_reward_zero_std": 0.0, "grad_norm": 1.2861988544464111, "kl": 0.030322056263685226, "learning_rate": 1.6280639641752944e-06, "loss": -0.184, "num_tokens": 1752455.0, "reward": 0.1899999976158142, "reward_std": 0.5424020886421204, "rewards/reward_func/mean": 0.1899999976158142, "rewards/reward_func/std": 0.5424020886421204, "sampling/importance_sampling_ratio/max": 1.3694077730178833, "sampling/importance_sampling_ratio/mean": 0.9944738745689392, "sampling/importance_sampling_ratio/min": 0.6162763833999634, "sampling/sampling_logp_difference/max": 0.4895906448364258, "sampling/sampling_logp_difference/mean": 0.028272125869989395, "step": 626, "step_time": 28.689937060000375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.20585273206233978, "epoch": 0.627, "frac_reward_zero_std": 0.0, "grad_norm": 0.7033559679985046, "kl": 0.02161390893161297, "learning_rate": 1.6204800925520685e-06, "loss": -0.018, "num_tokens": 1755304.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 0.7158298492431641, "sampling/importance_sampling_ratio/mean": 0.5595700740814209, "sampling/importance_sampling_ratio/min": 0.3621256351470947, "sampling/sampling_logp_difference/max": 0.8840136528015137, "sampling/sampling_logp_difference/mean": 0.025085879489779472, "step": 627, "step_time": 22.490009845991153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 46.75, "completions/mean_terminated_length": 46.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.1496848464012146, "epoch": 0.628, "frac_reward_zero_std": 0.0, "grad_norm": 0.9651160836219788, "kl": 0.007663455791771412, "learning_rate": 1.6129054466764904e-06, "loss": -0.1549, "num_tokens": 1757791.0, "reward": 0.44999998807907104, "reward_std": 0.6403124332427979, "rewards/reward_func/mean": 0.44999998807907104, "rewards/reward_func/std": 0.6403124332427979, "sampling/importance_sampling_ratio/max": 1.333585262298584, "sampling/importance_sampling_ratio/mean": 0.9744949340820312, "sampling/importance_sampling_ratio/min": 0.7222581505775452, "sampling/sampling_logp_difference/max": 0.38552796840667725, "sampling/sampling_logp_difference/mean": 0.014469567686319351, "step": 628, "step_time": 15.626681672991253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.1612822264432907, "epoch": 0.629, "frac_reward_zero_std": 0.0, "grad_norm": 0.8435804843902588, "kl": 0.007938998751342297, "learning_rate": 1.6053401060030098e-06, "loss": 0.0684, "num_tokens": 1760634.0, "reward": 0.23499999940395355, "reward_std": 0.5102613568305969, "rewards/reward_func/mean": 0.23499999940395355, "rewards/reward_func/std": 0.5102613568305969, "sampling/importance_sampling_ratio/max": 1.4544060230255127, "sampling/importance_sampling_ratio/mean": 0.9853469133377075, "sampling/importance_sampling_ratio/min": 0.7932815551757812, "sampling/sampling_logp_difference/max": 0.35282135009765625, "sampling/sampling_logp_difference/mean": 0.015229961834847927, "step": 629, "step_time": 30.167976055003237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.20417587459087372, "epoch": 0.63, "frac_reward_zero_std": 0.0, "grad_norm": 0.9814347624778748, "kl": 0.015003271400928497, "learning_rate": 1.5977841498884725e-06, "loss": 0.363, "num_tokens": 1763066.0, "reward": 0.19499999284744263, "reward_std": 0.5395368337631226, "rewards/reward_func/mean": 0.19499999284744263, "rewards/reward_func/std": 0.5395368337631226, "sampling/importance_sampling_ratio/max": 2.2209384441375732, "sampling/importance_sampling_ratio/mean": 1.1268428564071655, "sampling/importance_sampling_ratio/min": 0.4993911683559418, "sampling/sampling_logp_difference/max": 0.5290529727935791, "sampling/sampling_logp_difference/mean": 0.02136404998600483, "step": 630, "step_time": 27.980159974016715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 58.75, "completions/mean_terminated_length": 58.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.16995222866535187, "epoch": 0.631, "frac_reward_zero_std": 1.0, "grad_norm": 0.009162414819002151, "kl": 0.01722177304327488, "learning_rate": 1.5902376575912815e-06, "loss": 0.0002, "num_tokens": 1765789.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.1549965143203735, "sampling/importance_sampling_ratio/mean": 0.7937719821929932, "sampling/importance_sampling_ratio/min": 0.47422870993614197, "sampling/sampling_logp_difference/max": 0.37606120109558105, "sampling/sampling_logp_difference/mean": 0.01726781576871872, "step": 631, "step_time": 11.725124338001478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.16530251502990723, "epoch": 0.632, "frac_reward_zero_std": 0.0, "grad_norm": 0.9526640176773071, "kl": 0.011067393235862255, "learning_rate": 1.58270070827057e-06, "loss": 0.0392, "num_tokens": 1768655.0, "reward": 0.48750001192092896, "reward_std": 0.5803088545799255, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5803087949752808, "sampling/importance_sampling_ratio/max": 1.0793237686157227, "sampling/importance_sampling_ratio/mean": 0.8970098495483398, "sampling/importance_sampling_ratio/min": 0.6435756683349609, "sampling/sampling_logp_difference/max": 0.4121640920639038, "sampling/sampling_logp_difference/mean": 0.01485708262771368, "step": 632, "step_time": 29.608028165996075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.17630241811275482, "epoch": 0.633, "frac_reward_zero_std": 0.0, "grad_norm": 1.9082945585250854, "kl": 0.018133385106921196, "learning_rate": 1.5751733809853703e-06, "loss": 0.1827, "num_tokens": 1771877.0, "reward": 0.45499998331069946, "reward_std": 0.6293647885322571, "rewards/reward_func/mean": 0.45499998331069946, "rewards/reward_func/std": 0.6293647885322571, "sampling/importance_sampling_ratio/max": 2.0230774879455566, "sampling/importance_sampling_ratio/mean": 1.002569317817688, "sampling/importance_sampling_ratio/min": 0.27002227306365967, "sampling/sampling_logp_difference/max": 0.4743722677230835, "sampling/sampling_logp_difference/mean": 0.024580249562859535, "step": 633, "step_time": 27.879755026020575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.15197305381298065, "epoch": 0.634, "frac_reward_zero_std": 0.0, "grad_norm": 0.46289920806884766, "kl": 0.013469088822603226, "learning_rate": 1.5676557546937838e-06, "loss": -0.2573, "num_tokens": 1775391.0, "reward": 0.7124999761581421, "reward_std": 0.5484752058982849, "rewards/reward_func/mean": 0.7124999761581421, "rewards/reward_func/std": 0.5484752058982849, "sampling/importance_sampling_ratio/max": 1.194197654724121, "sampling/importance_sampling_ratio/mean": 0.5205033421516418, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6592068672180176, "sampling/sampling_logp_difference/mean": 0.027445437386631966, "step": 634, "step_time": 40.635141404985916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1714676171541214, "epoch": 0.635, "frac_reward_zero_std": 0.0, "grad_norm": 0.8837884664535522, "kl": 0.02252151630818844, "learning_rate": 1.5601479082521526e-06, "loss": -0.0471, "num_tokens": 1778145.0, "reward": 0.48750001192092896, "reward_std": 0.5921359658241272, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.592136025428772, "sampling/importance_sampling_ratio/max": 1.0499279499053955, "sampling/importance_sampling_ratio/mean": 0.8758193254470825, "sampling/importance_sampling_ratio/min": 0.6838318109512329, "sampling/sampling_logp_difference/max": 0.5107488632202148, "sampling/sampling_logp_difference/mean": 0.02029205486178398, "step": 635, "step_time": 21.37632375600515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.18645331263542175, "epoch": 0.636, "frac_reward_zero_std": 0.0, "grad_norm": 1.2903982400894165, "kl": 0.016369538381695747, "learning_rate": 1.5526499204142332e-06, "loss": -0.1523, "num_tokens": 1781367.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 2.013786792755127, "sampling/importance_sampling_ratio/mean": 1.0665322542190552, "sampling/importance_sampling_ratio/min": 0.5317853093147278, "sampling/sampling_logp_difference/max": 0.3103519678115845, "sampling/sampling_logp_difference/mean": 0.02173052355647087, "step": 636, "step_time": 23.108704175043385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.17679551243782043, "epoch": 0.637, "frac_reward_zero_std": 0.0, "grad_norm": 0.955349326133728, "kl": 0.009664141573011875, "learning_rate": 1.545161869830371e-06, "loss": 0.0719, "num_tokens": 1783991.0, "reward": 0.7150000333786011, "reward_std": 0.5699999928474426, "rewards/reward_func/mean": 0.7150000333786011, "rewards/reward_func/std": 0.5699999928474426, "sampling/importance_sampling_ratio/max": 1.4639298915863037, "sampling/importance_sampling_ratio/mean": 1.0480222702026367, "sampling/importance_sampling_ratio/min": 0.7020666003227234, "sampling/sampling_logp_difference/max": 0.3138582706451416, "sampling/sampling_logp_difference/mean": 0.011089324951171875, "step": 637, "step_time": 18.63492452300852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.159947469830513, "epoch": 0.638, "frac_reward_zero_std": 0.0, "grad_norm": 1.0940121412277222, "kl": 0.014933659695088863, "learning_rate": 1.5376838350466724e-06, "loss": -0.0867, "num_tokens": 1786884.0, "reward": 0.45750001072883606, "reward_std": 0.6171641945838928, "rewards/reward_func/mean": 0.45750001072883606, "rewards/reward_func/std": 0.6171642541885376, "sampling/importance_sampling_ratio/max": 1.1959325075149536, "sampling/importance_sampling_ratio/mean": 0.8900156617164612, "sampling/importance_sampling_ratio/min": 0.6036337614059448, "sampling/sampling_logp_difference/max": 0.47559189796447754, "sampling/sampling_logp_difference/mean": 0.017820995301008224, "step": 638, "step_time": 33.278494848986156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.2013646513223648, "epoch": 0.639, "frac_reward_zero_std": 0.0, "grad_norm": 1.0432056188583374, "kl": 0.026985956355929375, "learning_rate": 1.530215894504184e-06, "loss": -0.0132, "num_tokens": 1789678.0, "reward": 0.7425000071525574, "reward_std": 0.5017552375793457, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.5017552375793457, "sampling/importance_sampling_ratio/max": 1.2238497734069824, "sampling/importance_sampling_ratio/mean": 0.886970043182373, "sampling/importance_sampling_ratio/min": 0.5661554932594299, "sampling/sampling_logp_difference/max": 0.5970216393470764, "sampling/sampling_logp_difference/mean": 0.021844329312443733, "step": 639, "step_time": 12.41227082698606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.18106475472450256, "epoch": 0.64, "frac_reward_zero_std": 0.0, "grad_norm": 0.865650475025177, "kl": 0.444346159696579, "learning_rate": 1.5227581265380686e-06, "loss": -0.3312, "num_tokens": 1792304.0, "reward": 0.7475000023841858, "reward_std": 0.4983556270599365, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.4983556270599365, "sampling/importance_sampling_ratio/max": 1.6102694272994995, "sampling/importance_sampling_ratio/mean": 0.8960154056549072, "sampling/importance_sampling_ratio/min": 0.31319472193717957, "sampling/sampling_logp_difference/max": 1.0692663192749023, "sampling/sampling_logp_difference/mean": 0.018450118601322174, "step": 640, "step_time": 15.496065863990225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.1728666126728058, "epoch": 0.641, "frac_reward_zero_std": 1.0, "grad_norm": 0.021963167935609818, "kl": 0.020881017670035362, "learning_rate": 1.5153106093767827e-06, "loss": 0.0002, "num_tokens": 1794964.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4746590852737427, "sampling/importance_sampling_ratio/mean": 0.9942984580993652, "sampling/importance_sampling_ratio/min": 0.6818205118179321, "sampling/sampling_logp_difference/max": 0.34184932708740234, "sampling/sampling_logp_difference/mean": 0.016611801460385323, "step": 641, "step_time": 20.57313274795888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 43.0, "completions/mean_terminated_length": 43.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.19762620329856873, "epoch": 0.642, "frac_reward_zero_std": 0.0, "grad_norm": 1.5351296663284302, "kl": 0.018166663125157356, "learning_rate": 1.5078734211412574e-06, "loss": -0.0284, "num_tokens": 1798231.0, "reward": 0.4775000214576721, "reward_std": 0.6040074825286865, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.6040074825286865, "sampling/importance_sampling_ratio/max": 1.053011417388916, "sampling/importance_sampling_ratio/mean": 0.7307313680648804, "sampling/importance_sampling_ratio/min": 0.5174073576927185, "sampling/sampling_logp_difference/max": 0.5195529460906982, "sampling/sampling_logp_difference/mean": 0.023538639768958092, "step": 642, "step_time": 27.16691463603638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1876855194568634, "epoch": 0.643, "frac_reward_zero_std": 0.0, "grad_norm": 1.1284147500991821, "kl": 0.007414819672703743, "learning_rate": 1.5004466398440776e-06, "loss": 0.1299, "num_tokens": 1800520.0, "reward": 0.4325000047683716, "reward_std": 0.6554069519042969, "rewards/reward_func/mean": 0.4325000047683716, "rewards/reward_func/std": 0.6554070115089417, "sampling/importance_sampling_ratio/max": 1.329459309577942, "sampling/importance_sampling_ratio/mean": 0.8920758962631226, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5020637512207031, "sampling/sampling_logp_difference/mean": 0.016520686447620392, "step": 643, "step_time": 27.637942540983204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.20719251036643982, "epoch": 0.644, "frac_reward_zero_std": 0.0, "grad_norm": 1.5510611534118652, "kl": 0.0067186555825173855, "learning_rate": 1.4930303433886661e-06, "loss": -0.2051, "num_tokens": 1803282.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 1.1756839752197266, "sampling/importance_sampling_ratio/mean": 0.861975908279419, "sampling/importance_sampling_ratio/min": 0.479557067155838, "sampling/sampling_logp_difference/max": 0.35436010360717773, "sampling/sampling_logp_difference/mean": 0.019414277747273445, "step": 644, "step_time": 15.842635200999212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.13877488672733307, "epoch": 0.645, "frac_reward_zero_std": 0.0, "grad_norm": 1.8780481815338135, "kl": 0.018046772107481956, "learning_rate": 1.4856246095684623e-06, "loss": -0.1059, "num_tokens": 1806281.0, "reward": 0.987500011920929, "reward_std": 0.018929697573184967, "rewards/reward_func/mean": 0.987500011920929, "rewards/reward_func/std": 0.018929706886410713, "sampling/importance_sampling_ratio/max": 2.7975311279296875, "sampling/importance_sampling_ratio/mean": 1.587235927581787, "sampling/importance_sampling_ratio/min": 0.6787910461425781, "sampling/sampling_logp_difference/max": 0.5745187997817993, "sampling/sampling_logp_difference/mean": 0.020058711990714073, "step": 645, "step_time": 17.148567152034957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 59.25, "completions/mean_terminated_length": 59.25, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.19523835182189941, "epoch": 0.646, "frac_reward_zero_std": 0.0, "grad_norm": 1.3609191179275513, "kl": 0.007370285224169493, "learning_rate": 1.4782295160661103e-06, "loss": 0.2769, "num_tokens": 1808779.0, "reward": 0.9850000143051147, "reward_std": 0.02999999187886715, "rewards/reward_func/mean": 0.9850000143051147, "rewards/reward_func/std": 0.030000001192092896, "sampling/importance_sampling_ratio/max": 1.708103060722351, "sampling/importance_sampling_ratio/mean": 1.0419939756393433, "sampling/importance_sampling_ratio/min": 0.7096189260482788, "sampling/sampling_logp_difference/max": 0.41626930236816406, "sampling/sampling_logp_difference/mean": 0.019789589568972588, "step": 646, "step_time": 18.010711931972764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.17364248633384705, "epoch": 0.647, "frac_reward_zero_std": 0.0, "grad_norm": 0.7105161547660828, "kl": 0.006607310846447945, "learning_rate": 1.4708451404526409e-06, "loss": -0.1065, "num_tokens": 1811652.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 0.9654606580734253, "sampling/importance_sampling_ratio/mean": 0.7610697746276855, "sampling/importance_sampling_ratio/min": 0.5392307043075562, "sampling/sampling_logp_difference/max": 0.2949722111225128, "sampling/sampling_logp_difference/mean": 0.016251305118203163, "step": 647, "step_time": 14.916728026000783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 60.0, "completions/mean_terminated_length": 60.0, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.21131202578544617, "epoch": 0.648, "frac_reward_zero_std": 0.0, "grad_norm": 1.7781920433044434, "kl": 0.011264685541391373, "learning_rate": 1.4634715601866607e-06, "loss": 0.2741, "num_tokens": 1814182.0, "reward": 0.7124999761581421, "reward_std": 0.5750000476837158, "rewards/reward_func/mean": 0.7124999761581421, "rewards/reward_func/std": 0.574999988079071, "sampling/importance_sampling_ratio/max": 2.010082721710205, "sampling/importance_sampling_ratio/mean": 1.3933961391448975, "sampling/importance_sampling_ratio/min": 1.029809594154358, "sampling/sampling_logp_difference/max": 0.5129172801971436, "sampling/sampling_logp_difference/mean": 0.022472243756055832, "step": 648, "step_time": 16.201536645006854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.18080459535121918, "epoch": 0.649, "frac_reward_zero_std": 0.0, "grad_norm": 0.9737091064453125, "kl": 0.007052244618535042, "learning_rate": 1.4561088526135376e-06, "loss": -0.1707, "num_tokens": 1816913.0, "reward": -0.0925000011920929, "reward_std": 0.06751543283462524, "rewards/reward_func/mean": -0.0925000011920929, "rewards/reward_func/std": 0.06751543283462524, "sampling/importance_sampling_ratio/max": 1.0769755840301514, "sampling/importance_sampling_ratio/mean": 0.8287715911865234, "sampling/importance_sampling_ratio/min": 0.5557464957237244, "sampling/sampling_logp_difference/max": 0.31096673011779785, "sampling/sampling_logp_difference/mean": 0.014231499284505844, "step": 649, "step_time": 38.773539151996374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.1829139143228531, "epoch": 0.65, "frac_reward_zero_std": 0.0, "grad_norm": 1.1503514051437378, "kl": 0.012820237316191196, "learning_rate": 1.4487570949645888e-06, "loss": -0.1889, "num_tokens": 1819875.0, "reward": 0.7425000071525574, "reward_std": 0.5017552375793457, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.5017552375793457, "sampling/importance_sampling_ratio/max": 1.4220715761184692, "sampling/importance_sampling_ratio/mean": 1.1539939641952515, "sampling/importance_sampling_ratio/min": 0.9863265156745911, "sampling/sampling_logp_difference/max": 0.3510575294494629, "sampling/sampling_logp_difference/mean": 0.01926713064312935, "step": 650, "step_time": 19.381827126955613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 59.5, "completions/mean_terminated_length": 59.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.20623010396957397, "epoch": 0.651, "frac_reward_zero_std": 0.0, "grad_norm": 0.5961728096008301, "kl": 0.0288254227489233, "learning_rate": 1.4414163643562755e-06, "loss": 0.0005, "num_tokens": 1822561.0, "reward": 0.7275000214576721, "reward_std": 0.5317502617835999, "rewards/reward_func/mean": 0.7275000214576721, "rewards/reward_func/std": 0.5317502617835999, "sampling/importance_sampling_ratio/max": 0.7234391570091248, "sampling/importance_sampling_ratio/mean": 0.5926581621170044, "sampling/importance_sampling_ratio/min": 0.42438483238220215, "sampling/sampling_logp_difference/max": 1.2557284832000732, "sampling/sampling_logp_difference/mean": 0.023554274812340736, "step": 651, "step_time": 26.128514588985126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.16743320226669312, "epoch": 0.652, "frac_reward_zero_std": 0.0, "grad_norm": 0.8823656439781189, "kl": 0.011811461299657822, "learning_rate": 1.434086737789386e-06, "loss": -0.166, "num_tokens": 1825029.0, "reward": 0.9700000286102295, "reward_std": 0.03162277862429619, "rewards/reward_func/mean": 0.9700000286102295, "rewards/reward_func/std": 0.03162277862429619, "sampling/importance_sampling_ratio/max": 2.17000675201416, "sampling/importance_sampling_ratio/mean": 1.222658634185791, "sampling/importance_sampling_ratio/min": 0.6390091776847839, "sampling/sampling_logp_difference/max": 0.4357842206954956, "sampling/sampling_logp_difference/mean": 0.01880563236773014, "step": 652, "step_time": 17.110911049996503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 63.0, "completions/mean_terminated_length": 63.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1673194020986557, "epoch": 0.653, "frac_reward_zero_std": 0.0, "grad_norm": 0.604546070098877, "kl": 0.008947128430008888, "learning_rate": 1.4267682921482356e-06, "loss": 0.0147, "num_tokens": 1827397.0, "reward": 0.48500001430511475, "reward_std": 0.5951750874519348, "rewards/reward_func/mean": 0.48500001430511475, "rewards/reward_func/std": 0.5951750874519348, "sampling/importance_sampling_ratio/max": 1.546205759048462, "sampling/importance_sampling_ratio/mean": 0.8122150897979736, "sampling/importance_sampling_ratio/min": 0.3368344306945801, "sampling/sampling_logp_difference/max": 0.34247398376464844, "sampling/sampling_logp_difference/mean": 0.015716126188635826, "step": 653, "step_time": 13.921281819988508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.17234474420547485, "epoch": 0.654, "frac_reward_zero_std": 0.0, "grad_norm": 1.5408908128738403, "kl": 0.014583198353648186, "learning_rate": 1.419461104199856e-06, "loss": 0.2291, "num_tokens": 1830064.0, "reward": 0.23250000178813934, "reward_std": 0.5116883516311646, "rewards/reward_func/mean": 0.23250000178813934, "rewards/reward_func/std": 0.5116883516311646, "sampling/importance_sampling_ratio/max": 1.566091537475586, "sampling/importance_sampling_ratio/mean": 1.0534472465515137, "sampling/importance_sampling_ratio/min": 0.6251006722450256, "sampling/sampling_logp_difference/max": 0.6353094577789307, "sampling/sampling_logp_difference/mean": 0.021035198122262955, "step": 654, "step_time": 24.753491388983093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.1364516317844391, "epoch": 0.655, "frac_reward_zero_std": 0.0, "grad_norm": 0.6769875884056091, "kl": 0.008834201842546463, "learning_rate": 1.4121652505931922e-06, "loss": 0.0824, "num_tokens": 1833587.0, "reward": 0.48750001192092896, "reward_std": 0.591910719871521, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5919107794761658, "sampling/importance_sampling_ratio/max": 0.6912497878074646, "sampling/importance_sampling_ratio/mean": 0.509769856929779, "sampling/importance_sampling_ratio/min": 0.3794373869895935, "sampling/sampling_logp_difference/max": 0.6592261791229248, "sampling/sampling_logp_difference/mean": 0.02417614869773388, "step": 655, "step_time": 32.55206028401153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 71.0, "completions/mean_terminated_length": 71.0, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.1921452283859253, "epoch": 0.656, "frac_reward_zero_std": 0.0, "grad_norm": 1.0380533933639526, "kl": 0.009938723407685757, "learning_rate": 1.4048808078582943e-06, "loss": 0.11, "num_tokens": 1836346.0, "reward": 0.45499998331069946, "reward_std": 0.6297882795333862, "rewards/reward_func/mean": 0.45499998331069946, "rewards/reward_func/std": 0.629788339138031, "sampling/importance_sampling_ratio/max": 1.2282116413116455, "sampling/importance_sampling_ratio/mean": 1.0161831378936768, "sampling/importance_sampling_ratio/min": 0.7652660012245178, "sampling/sampling_logp_difference/max": 0.39948660135269165, "sampling/sampling_logp_difference/mean": 0.01752253621816635, "step": 656, "step_time": 27.885944550973363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.18256758153438568, "epoch": 0.657, "frac_reward_zero_std": 0.0, "grad_norm": 0.9640325903892517, "kl": 0.01692541129887104, "learning_rate": 1.3976078524055203e-06, "loss": 0.1515, "num_tokens": 1839182.0, "reward": 0.7350000143051147, "reward_std": 0.4908156394958496, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.490815669298172, "sampling/importance_sampling_ratio/max": 1.297997236251831, "sampling/importance_sampling_ratio/mean": 0.8474175930023193, "sampling/importance_sampling_ratio/min": 0.3517695367336273, "sampling/sampling_logp_difference/max": 0.488314151763916, "sampling/sampling_logp_difference/mean": 0.016320403665304184, "step": 657, "step_time": 16.276209645031486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.17791733145713806, "epoch": 0.658, "frac_reward_zero_std": 0.0, "grad_norm": 1.3244234323501587, "kl": 0.008289128541946411, "learning_rate": 1.3903464605247325e-06, "loss": -0.3197, "num_tokens": 1841825.0, "reward": 0.7150000333786011, "reward_std": 0.5243726372718811, "rewards/reward_func/mean": 0.7150000333786011, "rewards/reward_func/std": 0.5243726372718811, "sampling/importance_sampling_ratio/max": 2.3620522022247314, "sampling/importance_sampling_ratio/mean": 1.5487117767333984, "sampling/importance_sampling_ratio/min": 0.6798550486564636, "sampling/sampling_logp_difference/max": 0.6386370658874512, "sampling/sampling_logp_difference/mean": 0.01854069158434868, "step": 658, "step_time": 17.539338600006886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.17234854400157928, "epoch": 0.659, "frac_reward_zero_std": 0.0, "grad_norm": 2.2361860275268555, "kl": 0.01219615712761879, "learning_rate": 1.3830967083844944e-06, "loss": -0.241, "num_tokens": 1844535.0, "reward": -0.10999999940395355, "reward_std": 0.0707106813788414, "rewards/reward_func/mean": -0.10999999940395355, "rewards/reward_func/std": 0.0707106739282608, "sampling/importance_sampling_ratio/max": 2.0153262615203857, "sampling/importance_sampling_ratio/mean": 1.037646770477295, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6524457931518555, "sampling/sampling_logp_difference/mean": 0.01723821833729744, "step": 659, "step_time": 41.24755599995842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.18733687698841095, "epoch": 0.66, "frac_reward_zero_std": 0.0, "grad_norm": 2.358020544052124, "kl": 0.014615255407989025, "learning_rate": 1.375858672031276e-06, "loss": -0.2713, "num_tokens": 1847200.0, "reward": 0.17499999701976776, "reward_std": 0.5507873296737671, "rewards/reward_func/mean": 0.17499999701976776, "rewards/reward_func/std": 0.5507873296737671, "sampling/importance_sampling_ratio/max": 1.5951515436172485, "sampling/importance_sampling_ratio/mean": 1.1068426370620728, "sampling/importance_sampling_ratio/min": 0.637260913848877, "sampling/sampling_logp_difference/max": 0.5581474304199219, "sampling/sampling_logp_difference/mean": 0.026981312781572342, "step": 660, "step_time": 32.79299765301403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.23001980781555176, "epoch": 0.661, "frac_reward_zero_std": 0.0, "grad_norm": 1.8256839513778687, "kl": 0.061692360788583755, "learning_rate": 1.3686324273886531e-06, "loss": -0.3991, "num_tokens": 1850308.0, "reward": 0.2475000023841858, "reward_std": 0.5016888380050659, "rewards/reward_func/mean": 0.2475000023841858, "rewards/reward_func/std": 0.5016888380050659, "sampling/importance_sampling_ratio/max": 1.4826430082321167, "sampling/importance_sampling_ratio/mean": 0.8365529775619507, "sampling/importance_sampling_ratio/min": 0.3060137629508972, "sampling/sampling_logp_difference/max": 0.5171537399291992, "sampling/sampling_logp_difference/mean": 0.023233352228999138, "step": 661, "step_time": 31.00193968100939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.2016131430864334, "epoch": 0.662, "frac_reward_zero_std": 0.0, "grad_norm": 0.6349130272865295, "kl": 0.022461358457803726, "learning_rate": 1.3614180502565136e-06, "loss": -0.1201, "num_tokens": 1852802.0, "reward": 0.7274999618530273, "reward_std": 0.5450000166893005, "rewards/reward_func/mean": 0.7274999618530273, "rewards/reward_func/std": 0.5450000166893005, "sampling/importance_sampling_ratio/max": 1.4755222797393799, "sampling/importance_sampling_ratio/mean": 0.7413867712020874, "sampling/importance_sampling_ratio/min": 0.4321173429489136, "sampling/sampling_logp_difference/max": 0.6283726692199707, "sampling/sampling_logp_difference/mean": 0.02016855590045452, "step": 662, "step_time": 17.329544788983185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 58.25, "completions/mean_terminated_length": 58.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1864590346813202, "epoch": 0.663, "frac_reward_zero_std": 0.0, "grad_norm": 1.3951979875564575, "kl": 0.011292452923953533, "learning_rate": 1.3542156163102582e-06, "loss": 0.058, "num_tokens": 1855634.0, "reward": 0.44999998807907104, "reward_std": 0.6352952122688293, "rewards/reward_func/mean": 0.44999998807907104, "rewards/reward_func/std": 0.6352952122688293, "sampling/importance_sampling_ratio/max": 1.81913161277771, "sampling/importance_sampling_ratio/mean": 1.1675968170166016, "sampling/importance_sampling_ratio/min": 0.730985701084137, "sampling/sampling_logp_difference/max": 0.8182584643363953, "sampling/sampling_logp_difference/mean": 0.02268899232149124, "step": 663, "step_time": 24.764746856992133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.15506213903427124, "epoch": 0.664, "frac_reward_zero_std": 0.0, "grad_norm": 0.9998541474342346, "kl": 0.011365209706127644, "learning_rate": 1.3470252011000124e-06, "loss": 0.0792, "num_tokens": 1858410.0, "reward": 0.7350000143051147, "reward_std": 0.5299999713897705, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.5300000309944153, "sampling/importance_sampling_ratio/max": 1.0146489143371582, "sampling/importance_sampling_ratio/mean": 0.8550244569778442, "sampling/importance_sampling_ratio/min": 0.6337332725524902, "sampling/sampling_logp_difference/max": 0.4183235168457031, "sampling/sampling_logp_difference/mean": 0.018454013392329216, "step": 664, "step_time": 26.48663061199477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.15724357962608337, "epoch": 0.665, "frac_reward_zero_std": 0.0, "grad_norm": 0.6985964775085449, "kl": 0.00816549826413393, "learning_rate": 1.3398468800498293e-06, "loss": 0.0756, "num_tokens": 1860752.0, "reward": 0.45750001072883606, "reward_std": 0.6265447735786438, "rewards/reward_func/mean": 0.45750001072883606, "rewards/reward_func/std": 0.6265448331832886, "sampling/importance_sampling_ratio/max": 1.019221544265747, "sampling/importance_sampling_ratio/mean": 0.6432468891143799, "sampling/importance_sampling_ratio/min": 0.3392317593097687, "sampling/sampling_logp_difference/max": 0.5014891624450684, "sampling/sampling_logp_difference/mean": 0.018090814352035522, "step": 665, "step_time": 20.372971600969322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.1832060068845749, "epoch": 0.666, "frac_reward_zero_std": 0.0, "grad_norm": 1.3574000597000122, "kl": 0.020083967596292496, "learning_rate": 1.3326807284568984e-06, "loss": 0.1276, "num_tokens": 1863612.0, "reward": 0.7350000143051147, "reward_std": 0.5233545899391174, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.5233545899391174, "sampling/importance_sampling_ratio/max": 1.1306984424591064, "sampling/importance_sampling_ratio/mean": 0.8299560546875, "sampling/importance_sampling_ratio/min": 0.42608001828193665, "sampling/sampling_logp_difference/max": 0.4198506474494934, "sampling/sampling_logp_difference/mean": 0.019277848303318024, "step": 666, "step_time": 17.04821144498419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.15777072310447693, "epoch": 0.667, "frac_reward_zero_std": 0.0, "grad_norm": 1.265883207321167, "kl": 0.014509282074868679, "learning_rate": 1.3255268214907612e-06, "loss": 0.0454, "num_tokens": 1866263.0, "reward": 0.9900000095367432, "reward_std": 0.014142122119665146, "rewards/reward_func/mean": 0.9900000095367432, "rewards/reward_func/std": 0.014142122119665146, "sampling/importance_sampling_ratio/max": 1.3779301643371582, "sampling/importance_sampling_ratio/mean": 0.9226425886154175, "sampling/importance_sampling_ratio/min": 0.48822417855262756, "sampling/sampling_logp_difference/max": 0.5214121341705322, "sampling/sampling_logp_difference/mean": 0.018436823040246964, "step": 667, "step_time": 23.33435485197697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.18605072796344757, "epoch": 0.668, "frac_reward_zero_std": 0.0, "grad_norm": 0.6162489652633667, "kl": 0.014018435031175613, "learning_rate": 1.3183852341925147e-06, "loss": -0.0265, "num_tokens": 1869067.0, "reward": 0.47749999165534973, "reward_std": 0.6036761999130249, "rewards/reward_func/mean": 0.47749999165534973, "rewards/reward_func/std": 0.6036762595176697, "sampling/importance_sampling_ratio/max": 0.679753839969635, "sampling/importance_sampling_ratio/mean": 0.49573904275894165, "sampling/importance_sampling_ratio/min": 0.3718257248401642, "sampling/sampling_logp_difference/max": 0.5304445028305054, "sampling/sampling_logp_difference/mean": 0.025768741965293884, "step": 668, "step_time": 24.589709472027607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.22388552129268646, "epoch": 0.669, "frac_reward_zero_std": 0.0, "grad_norm": 1.3845494985580444, "kl": 0.014910580590367317, "learning_rate": 1.3112560414740315e-06, "loss": -0.1834, "num_tokens": 1871503.0, "reward": 0.4950000047683716, "reward_std": 0.5773791074752808, "rewards/reward_func/mean": 0.4950000047683716, "rewards/reward_func/std": 0.5773791670799255, "sampling/importance_sampling_ratio/max": 1.4454587697982788, "sampling/importance_sampling_ratio/mean": 0.9307541847229004, "sampling/importance_sampling_ratio/min": 0.6241759061813354, "sampling/sampling_logp_difference/max": 0.7658398151397705, "sampling/sampling_logp_difference/mean": 0.026516573503613472, "step": 669, "step_time": 16.99825400696136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.18292450904846191, "epoch": 0.67, "frac_reward_zero_std": 0.0, "grad_norm": 1.8048133850097656, "kl": 0.009369716979563236, "learning_rate": 1.3041393181171688e-06, "loss": 0.086, "num_tokens": 1874376.0, "reward": 0.7050000429153442, "reward_std": 0.5634121894836426, "rewards/reward_func/mean": 0.7050000429153442, "rewards/reward_func/std": 0.5634121894836426, "sampling/importance_sampling_ratio/max": 1.192623496055603, "sampling/importance_sampling_ratio/mean": 1.0461325645446777, "sampling/importance_sampling_ratio/min": 0.8841845393180847, "sampling/sampling_logp_difference/max": 0.2285858392715454, "sampling/sampling_logp_difference/mean": 0.018768103793263435, "step": 670, "step_time": 29.174343423044775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.16078607738018036, "epoch": 0.671, "frac_reward_zero_std": 0.0, "grad_norm": 1.3540736436843872, "kl": 0.02202623337507248, "learning_rate": 1.2970351387729875e-06, "loss": -0.527, "num_tokens": 1877738.0, "reward": 0.4724999964237213, "reward_std": 0.6097745299339294, "rewards/reward_func/mean": 0.4724999964237213, "rewards/reward_func/std": 0.6097745895385742, "sampling/importance_sampling_ratio/max": 1.7255882024765015, "sampling/importance_sampling_ratio/mean": 1.0588319301605225, "sampling/importance_sampling_ratio/min": 0.4830848276615143, "sampling/sampling_logp_difference/max": 0.4921238422393799, "sampling/sampling_logp_difference/mean": 0.01737845316529274, "step": 671, "step_time": 24.853901392023545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 62.5, "completions/mean_terminated_length": 62.5, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.15825623273849487, "epoch": 0.672, "frac_reward_zero_std": 0.0, "grad_norm": 0.8058205246925354, "kl": 0.009610535576939583, "learning_rate": 1.2899435779609681e-06, "loss": 0.1731, "num_tokens": 1880874.0, "reward": 0.48250001668930054, "reward_std": 0.5979060530662537, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5979060530662537, "sampling/importance_sampling_ratio/max": 1.1514559984207153, "sampling/importance_sampling_ratio/mean": 0.8971196413040161, "sampling/importance_sampling_ratio/min": 0.6232560276985168, "sampling/sampling_logp_difference/max": 0.5202617645263672, "sampling/sampling_logp_difference/mean": 0.016872987151145935, "step": 672, "step_time": 32.89026307495078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.18354015052318573, "epoch": 0.673, "frac_reward_zero_std": 1.0, "grad_norm": 0.007973631843924522, "kl": 0.014432860538363457, "learning_rate": 1.2828647100682263e-06, "loss": 0.0001, "num_tokens": 1884065.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3207707405090332, "sampling/importance_sampling_ratio/mean": 0.912696897983551, "sampling/importance_sampling_ratio/min": 0.6875709295272827, "sampling/sampling_logp_difference/max": 0.4234578609466553, "sampling/sampling_logp_difference/mean": 0.020133472979068756, "step": 673, "step_time": 17.80598710302729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.18049323558807373, "epoch": 0.674, "frac_reward_zero_std": 0.0, "grad_norm": 0.7679958939552307, "kl": 0.010193000547587872, "learning_rate": 1.275798609348738e-06, "loss": -0.3207, "num_tokens": 1886509.0, "reward": 0.4399999976158142, "reward_std": 0.5844085216522217, "rewards/reward_func/mean": 0.4399999976158142, "rewards/reward_func/std": 0.5844085812568665, "sampling/importance_sampling_ratio/max": 1.156139850616455, "sampling/importance_sampling_ratio/mean": 0.6962330341339111, "sampling/importance_sampling_ratio/min": 0.14326363801956177, "sampling/sampling_logp_difference/max": 1.455923080444336, "sampling/sampling_logp_difference/mean": 0.02460920624434948, "step": 674, "step_time": 30.253741153981537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.15432065725326538, "epoch": 0.675, "frac_reward_zero_std": 0.0, "grad_norm": 2.570892333984375, "kl": 0.017859335988759995, "learning_rate": 1.2687453499225547e-06, "loss": -0.5015, "num_tokens": 1889042.0, "reward": 0.22499999403953552, "reward_std": 0.5108489990234375, "rewards/reward_func/mean": 0.22499999403953552, "rewards/reward_func/std": 0.5108489394187927, "sampling/importance_sampling_ratio/max": 2.251554489135742, "sampling/importance_sampling_ratio/mean": 1.3541197776794434, "sampling/importance_sampling_ratio/min": 0.5651237368583679, "sampling/sampling_logp_difference/max": 0.4240577220916748, "sampling/sampling_logp_difference/mean": 0.02245471626520157, "step": 675, "step_time": 26.519882961991243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.1628585159778595, "epoch": 0.676, "frac_reward_zero_std": 0.0, "grad_norm": 0.7254642844200134, "kl": 0.05806570500135422, "learning_rate": 1.2617050057750322e-06, "loss": -0.4227, "num_tokens": 1891786.0, "reward": 0.737500011920929, "reward_std": 0.5050659775733948, "rewards/reward_func/mean": 0.737500011920929, "rewards/reward_func/std": 0.5050659775733948, "sampling/importance_sampling_ratio/max": 2.115140438079834, "sampling/importance_sampling_ratio/mean": 1.0242891311645508, "sampling/importance_sampling_ratio/min": 0.20064905285835266, "sampling/sampling_logp_difference/max": 0.6591830253601074, "sampling/sampling_logp_difference/mean": 0.02457844652235508, "step": 676, "step_time": 19.943034565018024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.16330550611019135, "epoch": 0.677, "frac_reward_zero_std": 0.0, "grad_norm": 0.7413411736488342, "kl": 0.007159217726439238, "learning_rate": 1.2546776507560468e-06, "loss": 0.1468, "num_tokens": 1894216.0, "reward": 0.4375, "reward_std": 0.6323699355125427, "rewards/reward_func/mean": 0.4375, "rewards/reward_func/std": 0.6323699355125427, "sampling/importance_sampling_ratio/max": 1.284358263015747, "sampling/importance_sampling_ratio/mean": 0.9655959606170654, "sampling/importance_sampling_ratio/min": 0.5000113248825073, "sampling/sampling_logp_difference/max": 0.4805019497871399, "sampling/sampling_logp_difference/mean": 0.012945675291121006, "step": 677, "step_time": 28.45717353798682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.1774313598871231, "epoch": 0.678, "frac_reward_zero_std": 0.0, "grad_norm": 1.1628450155258179, "kl": 0.012850012630224228, "learning_rate": 1.2476633585792287e-06, "loss": -0.1957, "num_tokens": 1896728.0, "reward": 0.4649999737739563, "reward_std": 0.596294105052948, "rewards/reward_func/mean": 0.4649999737739563, "rewards/reward_func/std": 0.596294105052948, "sampling/importance_sampling_ratio/max": 1.5097466707229614, "sampling/importance_sampling_ratio/mean": 1.2127716541290283, "sampling/importance_sampling_ratio/min": 0.498039186000824, "sampling/sampling_logp_difference/max": 0.6689786911010742, "sampling/sampling_logp_difference/mean": 0.01599772274494171, "step": 678, "step_time": 23.39260895602638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 79.0, "completions/max_terminated_length": 79.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.15781891345977783, "epoch": 0.679, "frac_reward_zero_std": 0.0, "grad_norm": 3.9718613624572754, "kl": 0.02338583767414093, "learning_rate": 1.2406622028211846e-06, "loss": 0.0532, "num_tokens": 1900014.0, "reward": 0.7275000214576721, "reward_std": 0.5383539795875549, "rewards/reward_func/mean": 0.7275000214576721, "rewards/reward_func/std": 0.5383539795875549, "sampling/importance_sampling_ratio/max": 2.530552864074707, "sampling/importance_sampling_ratio/mean": 1.50727117061615, "sampling/importance_sampling_ratio/min": 0.31091585755348206, "sampling/sampling_logp_difference/max": 0.7136077880859375, "sampling/sampling_logp_difference/mean": 0.023946184664964676, "step": 679, "step_time": 21.57341799803544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.16204150021076202, "epoch": 0.68, "frac_reward_zero_std": 0.0, "grad_norm": 1.456026315689087, "kl": 0.039873380213975906, "learning_rate": 1.2336742569207235e-06, "loss": -0.0357, "num_tokens": 1902728.0, "reward": 0.7250000238418579, "reward_std": 0.5301886796951294, "rewards/reward_func/mean": 0.7250000238418579, "rewards/reward_func/std": 0.5301886796951294, "sampling/importance_sampling_ratio/max": 1.3115283250808716, "sampling/importance_sampling_ratio/mean": 0.9447559118270874, "sampling/importance_sampling_ratio/min": 0.5883325338363647, "sampling/sampling_logp_difference/max": 1.0115026235580444, "sampling/sampling_logp_difference/mean": 0.024918431416153908, "step": 680, "step_time": 27.44724629999837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 60.5, "completions/mean_terminated_length": 60.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.22230394184589386, "epoch": 0.681, "frac_reward_zero_std": 0.0, "grad_norm": 1.2441941499710083, "kl": 0.016717134043574333, "learning_rate": 1.2266995941780934e-06, "loss": 0.0086, "num_tokens": 1905156.0, "reward": 0.4675000011920929, "reward_std": 0.6091728806495667, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.6091729402542114, "sampling/importance_sampling_ratio/max": 1.5115474462509155, "sampling/importance_sampling_ratio/mean": 1.0425636768341064, "sampling/importance_sampling_ratio/min": 0.6816808581352234, "sampling/sampling_logp_difference/max": 0.7789735794067383, "sampling/sampling_logp_difference/mean": 0.024120084941387177, "step": 681, "step_time": 17.231432911998127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 59.75, "completions/mean_terminated_length": 59.75, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "entropy": 0.20740243792533875, "epoch": 0.682, "frac_reward_zero_std": 0.0, "grad_norm": 1.6867090463638306, "kl": 0.01246858760714531, "learning_rate": 1.219738287754204e-06, "loss": 0.5166, "num_tokens": 1907527.0, "reward": 0.7350000143051147, "reward_std": 0.5233545303344727, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.5233545899391174, "sampling/importance_sampling_ratio/max": 1.7631571292877197, "sampling/importance_sampling_ratio/mean": 0.7843213677406311, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6379227638244629, "sampling/sampling_logp_difference/mean": 0.023944538086652756, "step": 682, "step_time": 19.123204567003995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.17561206221580505, "epoch": 0.683, "frac_reward_zero_std": 0.0, "grad_norm": 1.1406911611557007, "kl": 0.018169889226555824, "learning_rate": 1.2127904106698665e-06, "loss": 0.1794, "num_tokens": 1909908.0, "reward": 0.4400000274181366, "reward_std": 0.5973832607269287, "rewards/reward_func/mean": 0.4400000274181366, "rewards/reward_func/std": 0.5973832011222839, "sampling/importance_sampling_ratio/max": 1.8623545169830322, "sampling/importance_sampling_ratio/mean": 0.8905357122421265, "sampling/importance_sampling_ratio/min": 0.28850045800209045, "sampling/sampling_logp_difference/max": 0.5671961307525635, "sampling/sampling_logp_difference/mean": 0.021296625956892967, "step": 683, "step_time": 22.480978624953423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.15201327204704285, "epoch": 0.684, "frac_reward_zero_std": 0.0, "grad_norm": 0.931879997253418, "kl": 0.008392587304115295, "learning_rate": 1.2058560358050242e-06, "loss": -0.064, "num_tokens": 1912707.0, "reward": 0.4724999964237213, "reward_std": 0.6102117300033569, "rewards/reward_func/mean": 0.4724999964237213, "rewards/reward_func/std": 0.6102117300033569, "sampling/importance_sampling_ratio/max": 0.9025232195854187, "sampling/importance_sampling_ratio/mean": 0.7802497148513794, "sampling/importance_sampling_ratio/min": 0.6496914029121399, "sampling/sampling_logp_difference/max": 0.3881196975708008, "sampling/sampling_logp_difference/mean": 0.014654451981186867, "step": 684, "step_time": 29.235026963986456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.18260405957698822, "epoch": 0.685, "frac_reward_zero_std": 0.0, "grad_norm": 1.2494268417358398, "kl": 0.015494224615395069, "learning_rate": 1.1989352358979888e-06, "loss": -0.2135, "num_tokens": 1915807.0, "reward": 0.7200000286102295, "reward_std": 0.5467479825019836, "rewards/reward_func/mean": 0.7200000286102295, "rewards/reward_func/std": 0.5467479825019836, "sampling/importance_sampling_ratio/max": 2.045649290084839, "sampling/importance_sampling_ratio/mean": 1.041058897972107, "sampling/importance_sampling_ratio/min": 0.4365946054458618, "sampling/sampling_logp_difference/max": 0.6800916194915771, "sampling/sampling_logp_difference/mean": 0.02187693491578102, "step": 685, "step_time": 21.287946883006953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.15355905890464783, "epoch": 0.686, "frac_reward_zero_std": 0.0, "grad_norm": 1.42573881149292, "kl": 0.2112908810377121, "learning_rate": 1.192028083544675e-06, "loss": -0.1901, "num_tokens": 1918550.0, "reward": -0.10249999910593033, "reward_std": 0.079320028424263, "rewards/reward_func/mean": -0.10249999910593033, "rewards/reward_func/std": 0.079320028424263, "sampling/importance_sampling_ratio/max": 1.7652370929718018, "sampling/importance_sampling_ratio/mean": 0.8168801069259644, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8712925910949707, "sampling/sampling_logp_difference/mean": 0.020637063309550285, "step": 686, "step_time": 40.33678616001271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.19389846920967102, "epoch": 0.687, "frac_reward_zero_std": 0.0, "grad_norm": 0.8411192297935486, "kl": 0.01714112050831318, "learning_rate": 1.1851346511978427e-06, "loss": -0.3038, "num_tokens": 1921219.0, "reward": 0.9800000190734863, "reward_std": 0.0336650051176548, "rewards/reward_func/mean": 0.9800000190734863, "rewards/reward_func/std": 0.0336650125682354, "sampling/importance_sampling_ratio/max": 1.445432424545288, "sampling/importance_sampling_ratio/mean": 1.012347936630249, "sampling/importance_sampling_ratio/min": 0.4856460690498352, "sampling/sampling_logp_difference/max": 0.34787988662719727, "sampling/sampling_logp_difference/mean": 0.019954465329647064, "step": 687, "step_time": 15.59982443205081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.15489740669727325, "epoch": 0.688, "frac_reward_zero_std": 0.0, "grad_norm": 0.9113624095916748, "kl": 0.017412204295396805, "learning_rate": 1.178255011166337e-06, "loss": 0.0884, "num_tokens": 1924050.0, "reward": 0.7300000190734863, "reward_std": 0.5333542227745056, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5333541631698608, "sampling/importance_sampling_ratio/max": 0.8303588032722473, "sampling/importance_sampling_ratio/mean": 0.6948626041412354, "sampling/importance_sampling_ratio/min": 0.46059513092041016, "sampling/sampling_logp_difference/max": 0.5306453704833984, "sampling/sampling_logp_difference/mean": 0.014744197018444538, "step": 688, "step_time": 21.30548561300384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.19587302207946777, "epoch": 0.689, "frac_reward_zero_std": 0.0, "grad_norm": 2.994297742843628, "kl": 0.009213171899318695, "learning_rate": 1.171389235614324e-06, "loss": 0.2358, "num_tokens": 1927022.0, "reward": 0.7150000333786011, "reward_std": 0.5699999928474426, "rewards/reward_func/mean": 0.7150000333786011, "rewards/reward_func/std": 0.5699999928474426, "sampling/importance_sampling_ratio/max": 1.4069054126739502, "sampling/importance_sampling_ratio/mean": 0.9677623510360718, "sampling/importance_sampling_ratio/min": 0.47890836000442505, "sampling/sampling_logp_difference/max": 0.3979848623275757, "sampling/sampling_logp_difference/mean": 0.017313562333583832, "step": 689, "step_time": 25.7125615819823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.17625375092029572, "epoch": 0.69, "frac_reward_zero_std": 0.0, "grad_norm": 1.6779705286026, "kl": 0.013130989857017994, "learning_rate": 1.1645373965605424e-06, "loss": 0.0598, "num_tokens": 1929637.0, "reward": 0.45249998569488525, "reward_std": 0.6283510327339172, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.6283509731292725, "sampling/importance_sampling_ratio/max": 1.7670490741729736, "sampling/importance_sampling_ratio/mean": 1.5317835807800293, "sampling/importance_sampling_ratio/min": 1.1295089721679688, "sampling/sampling_logp_difference/max": 0.5055224895477295, "sampling/sampling_logp_difference/mean": 0.01866794191300869, "step": 690, "step_time": 29.388456568995025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.21710196137428284, "epoch": 0.691, "frac_reward_zero_std": 0.0, "grad_norm": 1.5798344612121582, "kl": 0.010813696309924126, "learning_rate": 1.1576995658775405e-06, "loss": 0.2285, "num_tokens": 1932614.0, "reward": 0.9950000047683716, "reward_std": 0.009999990463256836, "rewards/reward_func/mean": 0.9950000047683716, "rewards/reward_func/std": 0.009999990463256836, "sampling/importance_sampling_ratio/max": 1.9343171119689941, "sampling/importance_sampling_ratio/mean": 1.4603794813156128, "sampling/importance_sampling_ratio/min": 1.2002147436141968, "sampling/sampling_logp_difference/max": 0.2993013262748718, "sampling/sampling_logp_difference/mean": 0.022444991394877434, "step": 691, "step_time": 14.832698397978675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.15841224789619446, "epoch": 0.692, "frac_reward_zero_std": 0.0, "grad_norm": 1.020944356918335, "kl": 0.020345650613307953, "learning_rate": 1.1508758152909276e-06, "loss": 0.1105, "num_tokens": 1935811.0, "reward": 0.2175000011920929, "reward_std": 0.5220711827278137, "rewards/reward_func/mean": 0.2175000011920929, "rewards/reward_func/std": 0.5220711827278137, "sampling/importance_sampling_ratio/max": 1.2650750875473022, "sampling/importance_sampling_ratio/mean": 0.786872386932373, "sampling/importance_sampling_ratio/min": 0.3641090989112854, "sampling/sampling_logp_difference/max": 0.5222735404968262, "sampling/sampling_logp_difference/mean": 0.025538161396980286, "step": 692, "step_time": 32.957738024007995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.16953511536121368, "epoch": 0.693, "frac_reward_zero_std": 0.0, "grad_norm": 2.665311813354492, "kl": 0.0062560890801250935, "learning_rate": 1.1440662163786168e-06, "loss": 0.0771, "num_tokens": 1938623.0, "reward": 0.4700000286102295, "reward_std": 0.5891801714897156, "rewards/reward_func/mean": 0.4700000286102295, "rewards/reward_func/std": 0.5891802310943604, "sampling/importance_sampling_ratio/max": 1.1128435134887695, "sampling/importance_sampling_ratio/mean": 0.7925659418106079, "sampling/importance_sampling_ratio/min": 0.34047815203666687, "sampling/sampling_logp_difference/max": 0.4195902347564697, "sampling/sampling_logp_difference/mean": 0.013841532170772552, "step": 693, "step_time": 23.263072183995973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.21949127316474915, "epoch": 0.694, "frac_reward_zero_std": 0.0, "grad_norm": 1.048211693763733, "kl": 0.013864029198884964, "learning_rate": 1.1372708405700794e-06, "loss": -0.0181, "num_tokens": 1941091.0, "reward": 0.19249999523162842, "reward_std": 0.5419332385063171, "rewards/reward_func/mean": 0.19249999523162842, "rewards/reward_func/std": 0.5419332981109619, "sampling/importance_sampling_ratio/max": 1.0678907632827759, "sampling/importance_sampling_ratio/mean": 0.8189404010772705, "sampling/importance_sampling_ratio/min": 0.4544825553894043, "sampling/sampling_logp_difference/max": 0.672843337059021, "sampling/sampling_logp_difference/mean": 0.026752181351184845, "step": 694, "step_time": 30.027207756997086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 58.0, "completions/mean_terminated_length": 58.0, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.17363199591636658, "epoch": 0.695, "frac_reward_zero_std": 0.0, "grad_norm": 1.2954981327056885, "kl": 0.011540659703314304, "learning_rate": 1.130489759145593e-06, "loss": -0.1796, "num_tokens": 1943778.0, "reward": 0.45749998092651367, "reward_std": 0.6267575621604919, "rewards/reward_func/mean": 0.45749998092651367, "rewards/reward_func/std": 0.6267575621604919, "sampling/importance_sampling_ratio/max": 1.4664690494537354, "sampling/importance_sampling_ratio/mean": 0.9116808176040649, "sampling/importance_sampling_ratio/min": 0.4786515235900879, "sampling/sampling_logp_difference/max": 0.47525423765182495, "sampling/sampling_logp_difference/mean": 0.023723328486084938, "step": 695, "step_time": 22.443636220006738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.16085851192474365, "epoch": 0.696, "frac_reward_zero_std": 0.0, "grad_norm": 1.1584553718566895, "kl": 0.010659406892955303, "learning_rate": 1.1237230432354912e-06, "loss": 0.2284, "num_tokens": 1946571.0, "reward": 0.48000001907348633, "reward_std": 0.5894629955291748, "rewards/reward_func/mean": 0.48000001907348633, "rewards/reward_func/std": 0.5894630551338196, "sampling/importance_sampling_ratio/max": 1.4723174571990967, "sampling/importance_sampling_ratio/mean": 0.9917522668838501, "sampling/importance_sampling_ratio/min": 0.6992703676223755, "sampling/sampling_logp_difference/max": 0.5116240978240967, "sampling/sampling_logp_difference/mean": 0.017352299764752388, "step": 696, "step_time": 21.55389647500124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1589394509792328, "epoch": 0.697, "frac_reward_zero_std": 0.0, "grad_norm": 1.0688273906707764, "kl": 0.007772871293127537, "learning_rate": 1.116970763819424e-06, "loss": 0.0086, "num_tokens": 1949192.0, "reward": 0.2175000011920929, "reward_std": 0.5227092504501343, "rewards/reward_func/mean": 0.2175000011920929, "rewards/reward_func/std": 0.522709310054779, "sampling/importance_sampling_ratio/max": 1.2568421363830566, "sampling/importance_sampling_ratio/mean": 1.120989441871643, "sampling/importance_sampling_ratio/min": 1.014384150505066, "sampling/sampling_logp_difference/max": 0.22574341297149658, "sampling/sampling_logp_difference/mean": 0.014775886200368404, "step": 697, "step_time": 27.857453167962376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.18689307570457458, "epoch": 0.698, "frac_reward_zero_std": 0.0, "grad_norm": 3.302844285964966, "kl": 0.009897071868181229, "learning_rate": 1.1102329917256047e-06, "loss": 0.4338, "num_tokens": 1952179.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 2.5051963329315186, "sampling/importance_sampling_ratio/mean": 1.3562290668487549, "sampling/importance_sampling_ratio/min": 0.7542689442634583, "sampling/sampling_logp_difference/max": 0.4436211585998535, "sampling/sampling_logp_difference/mean": 0.022337360307574272, "step": 698, "step_time": 16.997008278034627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 57.5, "completions/mean_terminated_length": 57.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.16848671436309814, "epoch": 0.699, "frac_reward_zero_std": 0.0, "grad_norm": 0.6557225584983826, "kl": 0.02143358625471592, "learning_rate": 1.103509797630077e-06, "loss": 0.0836, "num_tokens": 1954821.0, "reward": 0.46000000834465027, "reward_std": 0.6066849827766418, "rewards/reward_func/mean": 0.46000000834465027, "rewards/reward_func/std": 0.6066849827766418, "sampling/importance_sampling_ratio/max": 0.772297203540802, "sampling/importance_sampling_ratio/mean": 0.7129390239715576, "sampling/importance_sampling_ratio/min": 0.6424605846405029, "sampling/sampling_logp_difference/max": 0.4757019877433777, "sampling/sampling_logp_difference/mean": 0.0152615150436759, "step": 699, "step_time": 25.717010673950426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.16029885411262512, "epoch": 0.7, "frac_reward_zero_std": 0.0, "grad_norm": 1.2565683126449585, "kl": 0.01652313396334648, "learning_rate": 1.0968012520559634e-06, "loss": -0.1297, "num_tokens": 1957638.0, "reward": 0.7174999713897705, "reward_std": 0.5649999976158142, "rewards/reward_func/mean": 0.7174999713897705, "rewards/reward_func/std": 0.5649999976158142, "sampling/importance_sampling_ratio/max": 1.1680848598480225, "sampling/importance_sampling_ratio/mean": 0.8021676540374756, "sampling/importance_sampling_ratio/min": 0.6033262610435486, "sampling/sampling_logp_difference/max": 0.3912663459777832, "sampling/sampling_logp_difference/mean": 0.014552041888237, "step": 700, "step_time": 22.502415989001747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.15743423998355865, "epoch": 0.701, "frac_reward_zero_std": 0.0, "grad_norm": 1.5367584228515625, "kl": 0.013473836705088615, "learning_rate": 1.0901074253727338e-06, "loss": 0.3004, "num_tokens": 1960537.0, "reward": 0.45500001311302185, "reward_std": 0.6212621927261353, "rewards/reward_func/mean": 0.45500001311302185, "rewards/reward_func/std": 0.6212621331214905, "sampling/importance_sampling_ratio/max": 1.7083156108856201, "sampling/importance_sampling_ratio/mean": 1.0699577331542969, "sampling/importance_sampling_ratio/min": 0.6667116284370422, "sampling/sampling_logp_difference/max": 0.9061718583106995, "sampling/sampling_logp_difference/mean": 0.02741251513361931, "step": 701, "step_time": 21.768594132037833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 59.5, "completions/mean_terminated_length": 59.5, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.2432118058204651, "epoch": 0.702, "frac_reward_zero_std": 0.0, "grad_norm": 1.15652596950531, "kl": 0.015364106744527817, "learning_rate": 1.0834283877954629e-06, "loss": 0.0804, "num_tokens": 1963629.0, "reward": 0.7250000238418579, "reward_std": 0.550000011920929, "rewards/reward_func/mean": 0.7250000238418579, "rewards/reward_func/std": 0.550000011920929, "sampling/importance_sampling_ratio/max": 1.525777816772461, "sampling/importance_sampling_ratio/mean": 0.8097450733184814, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.36809319257736206, "sampling/sampling_logp_difference/mean": 0.028614182025194168, "step": 702, "step_time": 29.559320070024114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1953570395708084, "epoch": 0.703, "frac_reward_zero_std": 1.0, "grad_norm": 0.015281864441931248, "kl": 0.016568204388022423, "learning_rate": 1.0767642093840933e-06, "loss": 0.0002, "num_tokens": 1966815.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4548747539520264, "sampling/importance_sampling_ratio/mean": 1.1091108322143555, "sampling/importance_sampling_ratio/min": 0.8349975347518921, "sampling/sampling_logp_difference/max": 0.6219499111175537, "sampling/sampling_logp_difference/mean": 0.016165658831596375, "step": 703, "step_time": 13.381469426036347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.18326662480831146, "epoch": 0.704, "frac_reward_zero_std": 0.0, "grad_norm": 1.8187443017959595, "kl": 0.009728511795401573, "learning_rate": 1.0701149600427044e-06, "loss": 0.2145, "num_tokens": 1969410.0, "reward": 0.7450000047683716, "reward_std": 0.5033554434776306, "rewards/reward_func/mean": 0.7450000047683716, "rewards/reward_func/std": 0.5033553838729858, "sampling/importance_sampling_ratio/max": 1.9205377101898193, "sampling/importance_sampling_ratio/mean": 1.2594337463378906, "sampling/importance_sampling_ratio/min": 0.8788582682609558, "sampling/sampling_logp_difference/max": 0.42977041006088257, "sampling/sampling_logp_difference/mean": 0.02028745971620083, "step": 704, "step_time": 22.03774317802163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.1742338240146637, "epoch": 0.705, "frac_reward_zero_std": 0.0, "grad_norm": 0.8777467012405396, "kl": 0.00959632359445095, "learning_rate": 1.0634807095187739e-06, "loss": -0.1506, "num_tokens": 1972063.0, "reward": 0.7400000095367432, "reward_std": 0.5067543983459473, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5067543983459473, "sampling/importance_sampling_ratio/max": 1.4602584838867188, "sampling/importance_sampling_ratio/mean": 0.9466754198074341, "sampling/importance_sampling_ratio/min": 0.599545955657959, "sampling/sampling_logp_difference/max": 0.5304362773895264, "sampling/sampling_logp_difference/mean": 0.019883565604686737, "step": 705, "step_time": 18.344800705031957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.188254252076149, "epoch": 0.706, "frac_reward_zero_std": 0.0, "grad_norm": 1.3150897026062012, "kl": 0.01690901629626751, "learning_rate": 1.0568615274024521e-06, "loss": 0.1779, "num_tokens": 1975212.0, "reward": 0.4775000214576721, "reward_std": 0.5984632968902588, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.5984632968902588, "sampling/importance_sampling_ratio/max": 1.231411337852478, "sampling/importance_sampling_ratio/mean": 0.8587811589241028, "sampling/importance_sampling_ratio/min": 0.48872289061546326, "sampling/sampling_logp_difference/max": 0.42327880859375, "sampling/sampling_logp_difference/mean": 0.01824348047375679, "step": 706, "step_time": 27.947045795968734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.15054745972156525, "epoch": 0.707, "frac_reward_zero_std": 0.0, "grad_norm": 1.0521990060806274, "kl": 0.008674677461385727, "learning_rate": 1.0502574831258259e-06, "loss": 0.06, "num_tokens": 1978215.0, "reward": 0.737500011920929, "reward_std": 0.5050659775733948, "rewards/reward_func/mean": 0.737500011920929, "rewards/reward_func/std": 0.5050659775733948, "sampling/importance_sampling_ratio/max": 1.3509514331817627, "sampling/importance_sampling_ratio/mean": 1.089831829071045, "sampling/importance_sampling_ratio/min": 0.8873059749603271, "sampling/sampling_logp_difference/max": 0.2947990894317627, "sampling/sampling_logp_difference/mean": 0.015538704581558704, "step": 707, "step_time": 32.55240538599901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.25, "completions/mean_terminated_length": 45.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.1661231815814972, "epoch": 0.708, "frac_reward_zero_std": 0.0, "grad_norm": 1.3954211473464966, "kl": 0.017934156581759453, "learning_rate": 1.043668645962195e-06, "loss": 0.1236, "num_tokens": 1981098.0, "reward": 0.2199999988079071, "reward_std": 0.5217917561531067, "rewards/reward_func/mean": 0.2199999988079071, "rewards/reward_func/std": 0.5217917561531067, "sampling/importance_sampling_ratio/max": 2.014948844909668, "sampling/importance_sampling_ratio/mean": 1.0221155881881714, "sampling/importance_sampling_ratio/min": 0.5755712985992432, "sampling/sampling_logp_difference/max": 0.5101752281188965, "sampling/sampling_logp_difference/mean": 0.01902727410197258, "step": 708, "step_time": 37.02133324398892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.17880955338478088, "epoch": 0.709, "frac_reward_zero_std": 0.0, "grad_norm": 0.940302312374115, "kl": 0.009082653559744358, "learning_rate": 1.037095085025345e-06, "loss": -0.066, "num_tokens": 1984076.0, "reward": 0.4775000214576721, "reward_std": 0.5805959701538086, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.5805959701538086, "sampling/importance_sampling_ratio/max": 1.2936863899230957, "sampling/importance_sampling_ratio/mean": 0.8493080735206604, "sampling/importance_sampling_ratio/min": 0.5143672823905945, "sampling/sampling_logp_difference/max": 0.42260313034057617, "sampling/sampling_logp_difference/mean": 0.0230729840695858, "step": 709, "step_time": 27.23126086796401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.16109369695186615, "epoch": 0.71, "frac_reward_zero_std": 0.0, "grad_norm": 0.8408161997795105, "kl": 0.018380463123321533, "learning_rate": 1.0305368692688175e-06, "loss": -0.0049, "num_tokens": 1986928.0, "reward": 0.4724999964237213, "reward_std": 0.6097745299339294, "rewards/reward_func/mean": 0.4724999964237213, "rewards/reward_func/std": 0.6097745895385742, "sampling/importance_sampling_ratio/max": 0.8472173810005188, "sampling/importance_sampling_ratio/mean": 0.7312625646591187, "sampling/importance_sampling_ratio/min": 0.6451421976089478, "sampling/sampling_logp_difference/max": 0.33403587341308594, "sampling/sampling_logp_difference/mean": 0.01686948724091053, "step": 710, "step_time": 23.09410350699909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.17019344866275787, "epoch": 0.711, "frac_reward_zero_std": 0.0, "grad_norm": 2.1232824325561523, "kl": 0.008030352182686329, "learning_rate": 1.0239940674851943e-06, "loss": 0.2092, "num_tokens": 1989864.0, "reward": 0.7325000166893005, "reward_std": 0.4894469678401947, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.4894469678401947, "sampling/importance_sampling_ratio/max": 1.70095956325531, "sampling/importance_sampling_ratio/mean": 1.2173515558242798, "sampling/importance_sampling_ratio/min": 0.6396232843399048, "sampling/sampling_logp_difference/max": 0.4416775703430176, "sampling/sampling_logp_difference/mean": 0.023577244952321053, "step": 711, "step_time": 26.35717175999889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.15705543756484985, "epoch": 0.712, "frac_reward_zero_std": 0.0, "grad_norm": 0.9585121870040894, "kl": 0.02179640345275402, "learning_rate": 1.0174667483053682e-06, "loss": 0.0803, "num_tokens": 1992676.0, "reward": 0.7300000190734863, "reward_std": 0.5400000214576721, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5399999618530273, "sampling/importance_sampling_ratio/max": 1.0074682235717773, "sampling/importance_sampling_ratio/mean": 0.8175646066665649, "sampling/importance_sampling_ratio/min": 0.49997568130493164, "sampling/sampling_logp_difference/max": 0.4557321071624756, "sampling/sampling_logp_difference/mean": 0.018549950793385506, "step": 712, "step_time": 19.704493640980218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 58.5, "completions/mean_terminated_length": 58.5, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.1879270225763321, "epoch": 0.713, "frac_reward_zero_std": 0.0, "grad_norm": 0.8939632177352905, "kl": 0.01124010980129242, "learning_rate": 1.0109549801978306e-06, "loss": -0.0037, "num_tokens": 1995400.0, "reward": 0.44749999046325684, "reward_std": 0.637985110282898, "rewards/reward_func/mean": 0.44749999046325684, "rewards/reward_func/std": 0.6379851698875427, "sampling/importance_sampling_ratio/max": 0.7486835718154907, "sampling/importance_sampling_ratio/mean": 0.7318408489227295, "sampling/importance_sampling_ratio/min": 0.7222962379455566, "sampling/sampling_logp_difference/max": 0.6675922870635986, "sampling/sampling_logp_difference/mean": 0.029690761119127274, "step": 713, "step_time": 33.028068289975636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.2051449865102768, "epoch": 0.714, "frac_reward_zero_std": 0.0, "grad_norm": 1.8255308866500854, "kl": 0.016956554725766182, "learning_rate": 1.0044588314679452e-06, "loss": 0.0688, "num_tokens": 1998202.0, "reward": 0.20999999344348907, "reward_std": 0.5283306837081909, "rewards/reward_func/mean": 0.20999999344348907, "rewards/reward_func/std": 0.5283307433128357, "sampling/importance_sampling_ratio/max": 2.4970903396606445, "sampling/importance_sampling_ratio/mean": 1.5178099870681763, "sampling/importance_sampling_ratio/min": 1.021748423576355, "sampling/sampling_logp_difference/max": 0.48185861110687256, "sampling/sampling_logp_difference/mean": 0.025594091042876244, "step": 714, "step_time": 24.760290696984157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.204759418964386, "epoch": 0.715, "frac_reward_zero_std": 0.0, "grad_norm": 1.919746994972229, "kl": 0.009655832313001156, "learning_rate": 9.979783702572413e-07, "loss": -0.2139, "num_tokens": 2001007.0, "reward": 0.45749998092651367, "reward_std": 0.6264383792877197, "rewards/reward_func/mean": 0.45749998092651367, "rewards/reward_func/std": 0.6264383792877197, "sampling/importance_sampling_ratio/max": 1.935180425643921, "sampling/importance_sampling_ratio/mean": 1.5050908327102661, "sampling/importance_sampling_ratio/min": 0.7599585056304932, "sampling/sampling_logp_difference/max": 0.533684253692627, "sampling/sampling_logp_difference/mean": 0.02052435837686062, "step": 715, "step_time": 21.559102697006892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.2009008526802063, "epoch": 0.716, "frac_reward_zero_std": 0.0, "grad_norm": 1.2267452478408813, "kl": 0.015681294724345207, "learning_rate": 9.915136645426885e-07, "loss": 0.0778, "num_tokens": 2003727.0, "reward": 0.4399999976158142, "reward_std": 0.6364484429359436, "rewards/reward_func/mean": 0.4399999976158142, "rewards/reward_func/std": 0.6364485025405884, "sampling/importance_sampling_ratio/max": 1.7718433141708374, "sampling/importance_sampling_ratio/mean": 1.1375638246536255, "sampling/importance_sampling_ratio/min": 0.8104740381240845, "sampling/sampling_logp_difference/max": 0.41234660148620605, "sampling/sampling_logp_difference/mean": 0.019192637875676155, "step": 716, "step_time": 38.716981267964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.15902605652809143, "epoch": 0.717, "frac_reward_zero_std": 0.0, "grad_norm": 1.32768714427948, "kl": 0.02186400629580021, "learning_rate": 9.850647821359918e-07, "loss": -0.0384, "num_tokens": 2006562.0, "reward": 0.17249999940395355, "reward_std": 0.552290678024292, "rewards/reward_func/mean": 0.17249999940395355, "rewards/reward_func/std": 0.552290678024292, "sampling/importance_sampling_ratio/max": 1.019228219985962, "sampling/importance_sampling_ratio/mean": 0.8152288794517517, "sampling/importance_sampling_ratio/min": 0.4933042526245117, "sampling/sampling_logp_difference/max": 0.6309623718261719, "sampling/sampling_logp_difference/mean": 0.01565525308251381, "step": 717, "step_time": 32.93862854200415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.17107419669628143, "epoch": 0.718, "frac_reward_zero_std": 0.0, "grad_norm": 1.196829080581665, "kl": 0.008283451199531555, "learning_rate": 9.786317906828745e-07, "loss": 0.0112, "num_tokens": 2009459.0, "reward": 0.7400000095367432, "reward_std": 0.5067543387413025, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5067543983459473, "sampling/importance_sampling_ratio/max": 1.1044892072677612, "sampling/importance_sampling_ratio/mean": 1.0267512798309326, "sampling/importance_sampling_ratio/min": 0.9220594763755798, "sampling/sampling_logp_difference/max": 0.30576276779174805, "sampling/sampling_logp_difference/mean": 0.015499727800488472, "step": 718, "step_time": 22.0018217159668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.16527995467185974, "epoch": 0.719, "frac_reward_zero_std": 0.0, "grad_norm": 1.2433176040649414, "kl": 0.018094314262270927, "learning_rate": 9.722147576623745e-07, "loss": 0.0437, "num_tokens": 2012217.0, "reward": 0.4449999928474426, "reward_std": 0.6408587694168091, "rewards/reward_func/mean": 0.4449999928474426, "rewards/reward_func/std": 0.6408588290214539, "sampling/importance_sampling_ratio/max": 1.8691997528076172, "sampling/importance_sampling_ratio/mean": 1.2755298614501953, "sampling/importance_sampling_ratio/min": 0.8203466534614563, "sampling/sampling_logp_difference/max": 0.35222768783569336, "sampling/sampling_logp_difference/mean": 0.019306136295199394, "step": 719, "step_time": 31.712265649985056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.16757917404174805, "epoch": 0.72, "frac_reward_zero_std": 0.0, "grad_norm": 1.0138911008834839, "kl": 0.011057587340474129, "learning_rate": 9.658137503861314e-07, "loss": 0.1386, "num_tokens": 2014933.0, "reward": 0.14750000834465027, "reward_std": 0.5515055656433105, "rewards/reward_func/mean": 0.14750000834465027, "rewards/reward_func/std": 0.5515055060386658, "sampling/importance_sampling_ratio/max": 1.6429708003997803, "sampling/importance_sampling_ratio/mean": 1.0646227598190308, "sampling/importance_sampling_ratio/min": 0.7856729030609131, "sampling/sampling_logp_difference/max": 0.42923521995544434, "sampling/sampling_logp_difference/mean": 0.015791606158018112, "step": 720, "step_time": 33.40644013299607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 58.75, "completions/mean_terminated_length": 58.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.18734370172023773, "epoch": 0.721, "frac_reward_zero_std": 0.0, "grad_norm": 1.225502371788025, "kl": 0.014365536160767078, "learning_rate": 9.594288359976817e-07, "loss": 0.0184, "num_tokens": 2017890.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 1.1519101858139038, "sampling/importance_sampling_ratio/mean": 0.752432107925415, "sampling/importance_sampling_ratio/min": 0.27390021085739136, "sampling/sampling_logp_difference/max": 0.5582789778709412, "sampling/sampling_logp_difference/mean": 0.021609732881188393, "step": 721, "step_time": 18.33411379001336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.17616461217403412, "epoch": 0.722, "frac_reward_zero_std": 0.0, "grad_norm": 0.8953106999397278, "kl": 0.009579099714756012, "learning_rate": 9.530600814717575e-07, "loss": 0.0198, "num_tokens": 2020533.0, "reward": 0.46000000834465027, "reward_std": 0.6269502639770508, "rewards/reward_func/mean": 0.46000000834465027, "rewards/reward_func/std": 0.6269502639770508, "sampling/importance_sampling_ratio/max": 1.0486232042312622, "sampling/importance_sampling_ratio/mean": 0.8259384036064148, "sampling/importance_sampling_ratio/min": 0.5922527313232422, "sampling/sampling_logp_difference/max": 0.35149848461151123, "sampling/sampling_logp_difference/mean": 0.018787343055009842, "step": 722, "step_time": 29.213955003011506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.13721592724323273, "epoch": 0.723, "frac_reward_zero_std": 0.0, "grad_norm": 0.8277876973152161, "kl": 0.005629024468362331, "learning_rate": 9.467075536135787e-07, "loss": -0.1305, "num_tokens": 2023093.0, "reward": 0.737500011920929, "reward_std": 0.4922313392162323, "rewards/reward_func/mean": 0.737500011920929, "rewards/reward_func/std": 0.4922313094139099, "sampling/importance_sampling_ratio/max": 1.0955877304077148, "sampling/importance_sampling_ratio/mean": 0.8334077596664429, "sampling/importance_sampling_ratio/min": 0.6076446771621704, "sampling/sampling_logp_difference/max": 0.5303812026977539, "sampling/sampling_logp_difference/mean": 0.013619763776659966, "step": 723, "step_time": 16.955107585003134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.21815365552902222, "epoch": 0.724, "frac_reward_zero_std": 0.0, "grad_norm": 1.217698574066162, "kl": 0.0071097747422754765, "learning_rate": 9.403713190581576e-07, "loss": -0.2755, "num_tokens": 2025322.0, "reward": 0.7325000166893005, "reward_std": 0.5283543467521667, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5283544063568115, "sampling/importance_sampling_ratio/max": 1.6860201358795166, "sampling/importance_sampling_ratio/mean": 1.1274690628051758, "sampling/importance_sampling_ratio/min": 0.7098177671432495, "sampling/sampling_logp_difference/max": 0.325131893157959, "sampling/sampling_logp_difference/mean": 0.017436789348721504, "step": 724, "step_time": 11.668815009994432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.15961281955242157, "epoch": 0.725, "frac_reward_zero_std": 0.0, "grad_norm": 1.154750943183899, "kl": 0.015952900052070618, "learning_rate": 9.340514442695953e-07, "loss": -0.326, "num_tokens": 2028158.0, "reward": 0.7124999761581421, "reward_std": 0.5486574769020081, "rewards/reward_func/mean": 0.7124999761581421, "rewards/reward_func/std": 0.5486574769020081, "sampling/importance_sampling_ratio/max": 2.691126585006714, "sampling/importance_sampling_ratio/mean": 1.3006467819213867, "sampling/importance_sampling_ratio/min": 0.4023272693157196, "sampling/sampling_logp_difference/max": 0.8266539573669434, "sampling/sampling_logp_difference/mean": 0.01811610534787178, "step": 725, "step_time": 24.94039690500358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 78.0, "completions/max_terminated_length": 78.0, "completions/mean_length": 62.75, "completions/mean_terminated_length": 62.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.16903364658355713, "epoch": 0.726, "frac_reward_zero_std": 0.0, "grad_norm": 0.9491844773292542, "kl": 0.010009784251451492, "learning_rate": 9.277479955403887e-07, "loss": 0.1796, "num_tokens": 2030679.0, "reward": 0.45249998569488525, "reward_std": 0.615379273891449, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.615379273891449, "sampling/importance_sampling_ratio/max": 1.6663516759872437, "sampling/importance_sampling_ratio/mean": 1.1396737098693848, "sampling/importance_sampling_ratio/min": 0.6469031572341919, "sampling/sampling_logp_difference/max": 0.3937532901763916, "sampling/sampling_logp_difference/mean": 0.016705503687262535, "step": 726, "step_time": 17.624534689995926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.17963173985481262, "epoch": 0.727, "frac_reward_zero_std": 0.0, "grad_norm": 0.5117601752281189, "kl": 0.015238159336149693, "learning_rate": 9.214610389907327e-07, "loss": -0.1873, "num_tokens": 2033320.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 0.7068102955818176, "sampling/importance_sampling_ratio/mean": 0.5739802122116089, "sampling/importance_sampling_ratio/min": 0.33175501227378845, "sampling/sampling_logp_difference/max": 0.5143416523933411, "sampling/sampling_logp_difference/mean": 0.01867019385099411, "step": 727, "step_time": 11.484074496023823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.16959020495414734, "epoch": 0.728, "frac_reward_zero_std": 0.0, "grad_norm": 1.4641830921173096, "kl": 0.00800903607159853, "learning_rate": 9.151906405678251e-07, "loss": 0.3752, "num_tokens": 2036105.0, "reward": -0.07249999791383743, "reward_std": 0.040311288088560104, "rewards/reward_func/mean": -0.07249999791383743, "rewards/reward_func/std": 0.040311288088560104, "sampling/importance_sampling_ratio/max": 1.6005078554153442, "sampling/importance_sampling_ratio/mean": 1.1864056587219238, "sampling/importance_sampling_ratio/min": 0.6224417686462402, "sampling/sampling_logp_difference/max": 0.4277052879333496, "sampling/sampling_logp_difference/mean": 0.015602022409439087, "step": 728, "step_time": 35.06643193698255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.17962051928043365, "epoch": 0.729, "frac_reward_zero_std": 0.0, "grad_norm": 1.0354877710342407, "kl": 0.006228611338883638, "learning_rate": 9.0893686604518e-07, "loss": 0.0252, "num_tokens": 2038571.0, "reward": 0.4424999952316284, "reward_std": 0.6443795561790466, "rewards/reward_func/mean": 0.4424999952316284, "rewards/reward_func/std": 0.6443795561790466, "sampling/importance_sampling_ratio/max": 1.4196176528930664, "sampling/importance_sampling_ratio/mean": 1.064631462097168, "sampling/importance_sampling_ratio/min": 0.7318459153175354, "sampling/sampling_logp_difference/max": 0.3618814945220947, "sampling/sampling_logp_difference/mean": 0.018481481820344925, "step": 729, "step_time": 22.281085506023373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 58.0, "completions/mean_terminated_length": 58.0, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.188942551612854, "epoch": 0.73, "frac_reward_zero_std": 0.0, "grad_norm": 1.402450442314148, "kl": 0.013719546608626842, "learning_rate": 9.026997810219313e-07, "loss": 0.3985, "num_tokens": 2041460.0, "reward": 0.48500001430511475, "reward_std": 0.5947268009185791, "rewards/reward_func/mean": 0.48500001430511475, "rewards/reward_func/std": 0.5947268605232239, "sampling/importance_sampling_ratio/max": 2.342262029647827, "sampling/importance_sampling_ratio/mean": 1.2136986255645752, "sampling/importance_sampling_ratio/min": 0.6102213859558105, "sampling/sampling_logp_difference/max": 0.7804973125457764, "sampling/sampling_logp_difference/mean": 0.023561185225844383, "step": 730, "step_time": 20.50828202604316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.17801183462142944, "epoch": 0.731, "frac_reward_zero_std": 0.0, "grad_norm": 4.140344142913818, "kl": 0.021650364622473717, "learning_rate": 8.964794509221508e-07, "loss": 0.4506, "num_tokens": 2044338.0, "reward": -0.029999999329447746, "reward_std": 0.03162277862429619, "rewards/reward_func/mean": -0.029999999329447746, "rewards/reward_func/std": 0.03162277489900589, "sampling/importance_sampling_ratio/max": 2.2819936275482178, "sampling/importance_sampling_ratio/mean": 1.059441328048706, "sampling/importance_sampling_ratio/min": 0.24718116223812103, "sampling/sampling_logp_difference/max": 0.4278266429901123, "sampling/sampling_logp_difference/mean": 0.023989928886294365, "step": 731, "step_time": 38.65887645998737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.13578319549560547, "epoch": 0.732, "frac_reward_zero_std": 0.0, "grad_norm": 0.8830834031105042, "kl": 0.02507774345576763, "learning_rate": 8.902759409941567e-07, "loss": -0.0219, "num_tokens": 2047359.0, "reward": 0.48500001430511475, "reward_std": 0.5951750874519348, "rewards/reward_func/mean": 0.48500001430511475, "rewards/reward_func/std": 0.5951750874519348, "sampling/importance_sampling_ratio/max": 1.2019600868225098, "sampling/importance_sampling_ratio/mean": 0.8898513913154602, "sampling/importance_sampling_ratio/min": 0.5592065453529358, "sampling/sampling_logp_difference/max": 0.7046303749084473, "sampling/sampling_logp_difference/mean": 0.014582839794456959, "step": 732, "step_time": 23.32031476602424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.1849280297756195, "epoch": 0.733, "frac_reward_zero_std": 0.0, "grad_norm": 1.3588628768920898, "kl": 0.01479425560683012, "learning_rate": 8.840893163098332e-07, "loss": -0.1918, "num_tokens": 2049961.0, "reward": 0.17500001192092896, "reward_std": 0.5534437298774719, "rewards/reward_func/mean": 0.17500001192092896, "rewards/reward_func/std": 0.5534437894821167, "sampling/importance_sampling_ratio/max": 1.1513025760650635, "sampling/importance_sampling_ratio/mean": 0.8153568506240845, "sampling/importance_sampling_ratio/min": 0.3637250065803528, "sampling/sampling_logp_difference/max": 0.329026460647583, "sampling/sampling_logp_difference/mean": 0.025158749893307686, "step": 733, "step_time": 27.58716698101489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.1709059327840805, "epoch": 0.734, "frac_reward_zero_std": 0.0, "grad_norm": 1.5014686584472656, "kl": 0.008804066106677055, "learning_rate": 8.779196417639465e-07, "loss": -0.1742, "num_tokens": 2053355.0, "reward": 0.4675000011920929, "reward_std": 0.610484778881073, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.6104848384857178, "sampling/importance_sampling_ratio/max": 1.5505198240280151, "sampling/importance_sampling_ratio/mean": 1.0989161729812622, "sampling/importance_sampling_ratio/min": 0.8893836736679077, "sampling/sampling_logp_difference/max": 0.2774237394332886, "sampling/sampling_logp_difference/mean": 0.016124853864312172, "step": 734, "step_time": 34.1527088660514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.18432560563087463, "epoch": 0.735, "frac_reward_zero_std": 0.0, "grad_norm": 0.7626345753669739, "kl": 0.015863532200455666, "learning_rate": 8.71766982073462e-07, "loss": 0.3399, "num_tokens": 2056280.0, "reward": 0.20000000298023224, "reward_std": 0.5349766612052917, "rewards/reward_func/mean": 0.20000000298023224, "rewards/reward_func/std": 0.5349766612052917, "sampling/importance_sampling_ratio/max": 1.4151264429092407, "sampling/importance_sampling_ratio/mean": 1.0339347124099731, "sampling/importance_sampling_ratio/min": 0.5459555387496948, "sampling/sampling_logp_difference/max": 0.7558444738388062, "sampling/sampling_logp_difference/mean": 0.022118931636214256, "step": 735, "step_time": 26.30608264199691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.13817165791988373, "epoch": 0.736, "frac_reward_zero_std": 0.0, "grad_norm": 0.9026435613632202, "kl": 0.015941379591822624, "learning_rate": 8.656314017768694e-07, "loss": -0.0277, "num_tokens": 2059680.0, "reward": 0.7275000214576721, "reward_std": 0.5317503213882446, "rewards/reward_func/mean": 0.7275000214576721, "rewards/reward_func/std": 0.5317502617835999, "sampling/importance_sampling_ratio/max": 1.0423253774642944, "sampling/importance_sampling_ratio/mean": 0.9157225489616394, "sampling/importance_sampling_ratio/min": 0.8246874809265137, "sampling/sampling_logp_difference/max": 0.3474726676940918, "sampling/sampling_logp_difference/mean": 0.019679585471749306, "step": 736, "step_time": 33.488151199999265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.18676094710826874, "epoch": 0.737, "frac_reward_zero_std": 0.0, "grad_norm": 1.5682297945022583, "kl": 0.028803769499063492, "learning_rate": 8.595129652335019e-07, "loss": 0.3274, "num_tokens": 2062471.0, "reward": 0.7450000047683716, "reward_std": 0.5099999904632568, "rewards/reward_func/mean": 0.7450000047683716, "rewards/reward_func/std": 0.5099999904632568, "sampling/importance_sampling_ratio/max": 1.5042445659637451, "sampling/importance_sampling_ratio/mean": 0.9616555571556091, "sampling/importance_sampling_ratio/min": 0.6271997690200806, "sampling/sampling_logp_difference/max": 0.5289497375488281, "sampling/sampling_logp_difference/mean": 0.02205747738480568, "step": 737, "step_time": 19.172225521993823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.17850393056869507, "epoch": 0.738, "frac_reward_zero_std": 0.0, "grad_norm": 2.7135050296783447, "kl": 0.013864757493138313, "learning_rate": 8.534117366228645e-07, "loss": -0.0509, "num_tokens": 2065508.0, "reward": 0.7400000095367432, "reward_std": 0.5000666379928589, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5000666975975037, "sampling/importance_sampling_ratio/max": 2.0345335006713867, "sampling/importance_sampling_ratio/mean": 1.325158953666687, "sampling/importance_sampling_ratio/min": 0.6851629614830017, "sampling/sampling_logp_difference/max": 0.506891131401062, "sampling/sampling_logp_difference/mean": 0.019138095900416374, "step": 738, "step_time": 28.703282848990057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.20194987952709198, "epoch": 0.739, "frac_reward_zero_std": 0.0, "grad_norm": 1.5897279977798462, "kl": 0.008492416702210903, "learning_rate": 8.473277799439569e-07, "loss": 0.2477, "num_tokens": 2068344.0, "reward": 0.49000000953674316, "reward_std": 0.5888972282409668, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5888972282409668, "sampling/importance_sampling_ratio/max": 1.8381931781768799, "sampling/importance_sampling_ratio/mean": 1.3536276817321777, "sampling/importance_sampling_ratio/min": 0.8069265484809875, "sampling/sampling_logp_difference/max": 0.3798189163208008, "sampling/sampling_logp_difference/mean": 0.021122204139828682, "step": 739, "step_time": 20.3152707790141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.16915154457092285, "epoch": 0.74, "frac_reward_zero_std": 0.0, "grad_norm": 0.931997537612915, "kl": 0.009465587325394154, "learning_rate": 8.412611590146069e-07, "loss": 0.1429, "num_tokens": 2070948.0, "reward": 0.42500001192092896, "reward_std": 0.6467611789703369, "rewards/reward_func/mean": 0.42500001192092896, "rewards/reward_func/std": 0.6467611789703369, "sampling/importance_sampling_ratio/max": 1.3508557081222534, "sampling/importance_sampling_ratio/mean": 0.9440776705741882, "sampling/importance_sampling_ratio/min": 0.34541571140289307, "sampling/sampling_logp_difference/max": 0.4881579279899597, "sampling/sampling_logp_difference/mean": 0.018185501918196678, "step": 740, "step_time": 24.673817569972016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.16276097297668457, "epoch": 0.741, "frac_reward_zero_std": 0.0, "grad_norm": 1.035072922706604, "kl": 0.010159646160900593, "learning_rate": 8.352119374707979e-07, "loss": 0.0764, "num_tokens": 2073681.0, "reward": 0.2224999964237213, "reward_std": 0.5202803611755371, "rewards/reward_func/mean": 0.2224999964237213, "rewards/reward_func/std": 0.5202804207801819, "sampling/importance_sampling_ratio/max": 1.195013403892517, "sampling/importance_sampling_ratio/mean": 0.8562400341033936, "sampling/importance_sampling_ratio/min": 0.7241944074630737, "sampling/sampling_logp_difference/max": 0.33409547805786133, "sampling/sampling_logp_difference/mean": 0.014942617155611515, "step": 741, "step_time": 33.18091492302483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.17472010850906372, "epoch": 0.742, "frac_reward_zero_std": 0.0, "grad_norm": 1.9077184200286865, "kl": 0.099124014377594, "learning_rate": 8.291801787660001e-07, "loss": 0.0992, "num_tokens": 2076292.0, "reward": 0.4649999737739563, "reward_std": 0.6182502508163452, "rewards/reward_func/mean": 0.4649999737739563, "rewards/reward_func/std": 0.6182502508163452, "sampling/importance_sampling_ratio/max": 1.7012168169021606, "sampling/importance_sampling_ratio/mean": 0.9267334938049316, "sampling/importance_sampling_ratio/min": 0.2755015790462494, "sampling/sampling_logp_difference/max": 1.3129488229751587, "sampling/sampling_logp_difference/mean": 0.026498261839151382, "step": 742, "step_time": 21.187537103018258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.17678618431091309, "epoch": 0.743, "frac_reward_zero_std": 0.0, "grad_norm": 0.6554853916168213, "kl": 0.012268660590052605, "learning_rate": 8.231659461705092e-07, "loss": 0.2714, "num_tokens": 2079337.0, "reward": 0.1824999898672104, "reward_std": 0.5499318242073059, "rewards/reward_func/mean": 0.1824999898672104, "rewards/reward_func/std": 0.5499318242073059, "sampling/importance_sampling_ratio/max": 1.3556013107299805, "sampling/importance_sampling_ratio/mean": 0.7827243208885193, "sampling/importance_sampling_ratio/min": 0.27323997020721436, "sampling/sampling_logp_difference/max": 0.4416499137878418, "sampling/sampling_logp_difference/mean": 0.021022938191890717, "step": 743, "step_time": 40.58132391201798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.18959416449069977, "epoch": 0.744, "frac_reward_zero_std": 0.0, "grad_norm": 0.8889655470848083, "kl": 0.009796516969799995, "learning_rate": 8.171693027707772e-07, "loss": 0.0825, "num_tokens": 2081781.0, "reward": 0.7325000166893005, "reward_std": 0.5283543467521667, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5283544063568115, "sampling/importance_sampling_ratio/max": 0.9077510833740234, "sampling/importance_sampling_ratio/mean": 0.6856824159622192, "sampling/importance_sampling_ratio/min": 0.2700147330760956, "sampling/sampling_logp_difference/max": 0.3900569677352905, "sampling/sampling_logp_difference/mean": 0.016983825713396072, "step": 744, "step_time": 13.218375284050126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.26604506373405457, "epoch": 0.745, "frac_reward_zero_std": 0.0, "grad_norm": 2.0891060829162598, "kl": 0.01171761192381382, "learning_rate": 8.111903114687591e-07, "loss": 0.2847, "num_tokens": 2084284.0, "reward": 0.7124999761581421, "reward_std": 0.5043395161628723, "rewards/reward_func/mean": 0.7124999761581421, "rewards/reward_func/std": 0.5043395161628723, "sampling/importance_sampling_ratio/max": 1.74489164352417, "sampling/importance_sampling_ratio/mean": 0.953168511390686, "sampling/importance_sampling_ratio/min": 0.5810538530349731, "sampling/sampling_logp_difference/max": 0.38275253772735596, "sampling/sampling_logp_difference/mean": 0.0226482842117548, "step": 745, "step_time": 21.99126341898227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.19541367888450623, "epoch": 0.746, "frac_reward_zero_std": 0.0, "grad_norm": 1.6770156621932983, "kl": 0.016987763345241547, "learning_rate": 8.052290349812419e-07, "loss": 0.439, "num_tokens": 2087049.0, "reward": 0.20000000298023224, "reward_std": 0.5359726548194885, "rewards/reward_func/mean": 0.20000000298023224, "rewards/reward_func/std": 0.5359726548194885, "sampling/importance_sampling_ratio/max": 2.390557289123535, "sampling/importance_sampling_ratio/mean": 1.569882869720459, "sampling/importance_sampling_ratio/min": 0.7815389633178711, "sampling/sampling_logp_difference/max": 0.815589189529419, "sampling/sampling_logp_difference/mean": 0.022378752008080482, "step": 746, "step_time": 26.513825125992298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.19290375709533691, "epoch": 0.747, "frac_reward_zero_std": 0.0, "grad_norm": 0.8766121864318848, "kl": 0.018344111740589142, "learning_rate": 7.992855358391968e-07, "loss": -0.0119, "num_tokens": 2089730.0, "reward": 0.17499999701976776, "reward_std": 0.5521775484085083, "rewards/reward_func/mean": 0.17499999701976776, "rewards/reward_func/std": 0.5521774888038635, "sampling/importance_sampling_ratio/max": 0.9800979495048523, "sampling/importance_sampling_ratio/mean": 0.5701234936714172, "sampling/importance_sampling_ratio/min": 0.2475348860025406, "sampling/sampling_logp_difference/max": 1.0528191328048706, "sampling/sampling_logp_difference/mean": 0.022044919431209564, "step": 747, "step_time": 38.006621173990425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.19260410964488983, "epoch": 0.748, "frac_reward_zero_std": 0.0, "grad_norm": 1.2305965423583984, "kl": 0.013900158926844597, "learning_rate": 7.933598763871156e-07, "loss": -0.2719, "num_tokens": 2092263.0, "reward": 0.1899999976158142, "reward_std": 0.5407402515411377, "rewards/reward_func/mean": 0.1899999976158142, "rewards/reward_func/std": 0.5407402515411377, "sampling/importance_sampling_ratio/max": 1.4424039125442505, "sampling/importance_sampling_ratio/mean": 0.9574822783470154, "sampling/importance_sampling_ratio/min": 0.47926896810531616, "sampling/sampling_logp_difference/max": 0.5303046703338623, "sampling/sampling_logp_difference/mean": 0.018329016864299774, "step": 748, "step_time": 25.547142967989203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.18603956699371338, "epoch": 0.749, "frac_reward_zero_std": 0.0, "grad_norm": 2.196472644805908, "kl": 0.024144070222973824, "learning_rate": 7.874521187823631e-07, "loss": 0.578, "num_tokens": 2095371.0, "reward": 0.4624999761581421, "reward_std": 0.6209871172904968, "rewards/reward_func/mean": 0.4624999761581421, "rewards/reward_func/std": 0.6209871768951416, "sampling/importance_sampling_ratio/max": 2.7265470027923584, "sampling/importance_sampling_ratio/mean": 1.1012520790100098, "sampling/importance_sampling_ratio/min": 0.3692186772823334, "sampling/sampling_logp_difference/max": 0.8040235042572021, "sampling/sampling_logp_difference/mean": 0.026813216507434845, "step": 749, "step_time": 26.709419016027823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.1836884766817093, "epoch": 0.75, "frac_reward_zero_std": 0.0, "grad_norm": 1.9060646295547485, "kl": 0.014748227782547474, "learning_rate": 7.815623249945215e-07, "loss": -0.1683, "num_tokens": 2098001.0, "reward": 0.9975000023841858, "reward_std": 0.004999995231628418, "rewards/reward_func/mean": 0.9975000023841858, "rewards/reward_func/std": 0.004999995231628418, "sampling/importance_sampling_ratio/max": 2.1657533645629883, "sampling/importance_sampling_ratio/mean": 1.079534888267517, "sampling/importance_sampling_ratio/min": 0.5228157043457031, "sampling/sampling_logp_difference/max": 0.5553015470504761, "sampling/sampling_logp_difference/mean": 0.02127106674015522, "step": 750, "step_time": 16.736328262020834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.15925879776477814, "epoch": 0.751, "frac_reward_zero_std": 0.0, "grad_norm": 2.1834208965301514, "kl": 0.019534096121788025, "learning_rate": 7.756905568047393e-07, "loss": 0.1172, "num_tokens": 2101291.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 1.5012420415878296, "sampling/importance_sampling_ratio/mean": 1.1168947219848633, "sampling/importance_sampling_ratio/min": 0.7359365820884705, "sampling/sampling_logp_difference/max": 0.317885160446167, "sampling/sampling_logp_difference/mean": 0.015462284907698631, "step": 751, "step_time": 17.293174117978197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.15503384172916412, "epoch": 0.752, "frac_reward_zero_std": 0.0, "grad_norm": 1.7492260932922363, "kl": 0.016820628196001053, "learning_rate": 7.698368758050878e-07, "loss": -0.354, "num_tokens": 2104510.0, "reward": 0.20250001549720764, "reward_std": 0.5337523818016052, "rewards/reward_func/mean": 0.20250001549720764, "rewards/reward_func/std": 0.53375244140625, "sampling/importance_sampling_ratio/max": 1.5708686113357544, "sampling/importance_sampling_ratio/mean": 0.871680498123169, "sampling/importance_sampling_ratio/min": 0.374078631401062, "sampling/sampling_logp_difference/max": 0.4539494514465332, "sampling/sampling_logp_difference/mean": 0.022727761417627335, "step": 752, "step_time": 40.5143657780136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 58.5, "completions/mean_terminated_length": 58.5, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.2260342389345169, "epoch": 0.753, "frac_reward_zero_std": 0.0, "grad_norm": 0.9871532917022705, "kl": 0.01694342866539955, "learning_rate": 7.640013433979093e-07, "loss": 0.2427, "num_tokens": 2107174.0, "reward": 0.42250001430511475, "reward_std": 0.6669520139694214, "rewards/reward_func/mean": 0.42250001430511475, "rewards/reward_func/std": 0.6669520139694214, "sampling/importance_sampling_ratio/max": 0.8810743689537048, "sampling/importance_sampling_ratio/mean": 0.59300696849823, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5747154951095581, "sampling/sampling_logp_difference/mean": 0.025310808792710304, "step": 753, "step_time": 27.64713804697385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.17111508548259735, "epoch": 0.754, "frac_reward_zero_std": 0.0, "grad_norm": 0.9241670370101929, "kl": 0.011174448765814304, "learning_rate": 7.58184020795179e-07, "loss": 0.1672, "num_tokens": 2110362.0, "reward": 0.2150000035762787, "reward_std": 0.49047595262527466, "rewards/reward_func/mean": 0.2150000035762787, "rewards/reward_func/std": 0.49047598242759705, "sampling/importance_sampling_ratio/max": 1.7409347295761108, "sampling/importance_sampling_ratio/mean": 0.9998546838760376, "sampling/importance_sampling_ratio/min": 0.5844720602035522, "sampling/sampling_logp_difference/max": 0.2992151379585266, "sampling/sampling_logp_difference/mean": 0.015162689611315727, "step": 754, "step_time": 35.14179689303273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 58.25, "completions/mean_terminated_length": 58.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.16719110310077667, "epoch": 0.755, "frac_reward_zero_std": 0.0, "grad_norm": 1.1052947044372559, "kl": 0.009931657463312149, "learning_rate": 7.523849690178567e-07, "loss": 0.0748, "num_tokens": 2113460.0, "reward": 0.2175000160932541, "reward_std": 0.5158407688140869, "rewards/reward_func/mean": 0.2175000160932541, "rewards/reward_func/std": 0.5158407688140869, "sampling/importance_sampling_ratio/max": 1.392272710800171, "sampling/importance_sampling_ratio/mean": 1.2095292806625366, "sampling/importance_sampling_ratio/min": 0.9626930952072144, "sampling/sampling_logp_difference/max": 0.4731113910675049, "sampling/sampling_logp_difference/mean": 0.016341708600521088, "step": 755, "step_time": 35.33032606699271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.15754979848861694, "epoch": 0.756, "frac_reward_zero_std": 0.0, "grad_norm": 0.7584160566329956, "kl": 0.02082926407456398, "learning_rate": 7.466042488952521e-07, "loss": 0.0374, "num_tokens": 2116550.0, "reward": -0.030000001192092896, "reward_std": 0.040824826806783676, "rewards/reward_func/mean": -0.030000001192092896, "rewards/reward_func/std": 0.040824830532073975, "sampling/importance_sampling_ratio/max": 0.7966128587722778, "sampling/importance_sampling_ratio/mean": 0.6172126531600952, "sampling/importance_sampling_ratio/min": 0.2728629410266876, "sampling/sampling_logp_difference/max": 0.6440171599388123, "sampling/sampling_logp_difference/mean": 0.023620599880814552, "step": 756, "step_time": 38.59035433497047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 61.5, "completions/mean_terminated_length": 61.5, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.17724810540676117, "epoch": 0.757, "frac_reward_zero_std": 0.0, "grad_norm": 0.857313334941864, "kl": 0.01343874167650938, "learning_rate": 7.408419210643847e-07, "loss": -0.1256, "num_tokens": 2119579.0, "reward": 0.4775000214576721, "reward_std": 0.5804237127304077, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.5804236531257629, "sampling/importance_sampling_ratio/max": 1.1267565488815308, "sampling/importance_sampling_ratio/mean": 0.8883934020996094, "sampling/importance_sampling_ratio/min": 0.4498753547668457, "sampling/sampling_logp_difference/max": 0.47825485467910767, "sampling/sampling_logp_difference/mean": 0.019619595259428024, "step": 757, "step_time": 25.899380012007896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.16971728205680847, "epoch": 0.758, "frac_reward_zero_std": 0.0, "grad_norm": 1.1160763502120972, "kl": 0.010076766833662987, "learning_rate": 7.350980459693455e-07, "loss": 0.1175, "num_tokens": 2122429.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 1.1028574705123901, "sampling/importance_sampling_ratio/mean": 0.8751924633979797, "sampling/importance_sampling_ratio/min": 0.5844590067863464, "sampling/sampling_logp_difference/max": 0.33286094665527344, "sampling/sampling_logp_difference/mean": 0.016627872362732887, "step": 758, "step_time": 21.73135227599414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.2247239053249359, "epoch": 0.759, "frac_reward_zero_std": 0.0, "grad_norm": 0.9031432867050171, "kl": 0.013609400019049644, "learning_rate": 7.293726838606674e-07, "loss": -0.0792, "num_tokens": 2125103.0, "reward": 0.7325000166893005, "reward_std": 0.5283543467521667, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5283544063568115, "sampling/importance_sampling_ratio/max": 1.6223313808441162, "sampling/importance_sampling_ratio/mean": 0.9199168682098389, "sampling/importance_sampling_ratio/min": 0.45400863885879517, "sampling/sampling_logp_difference/max": 0.2773076295852661, "sampling/sampling_logp_difference/mean": 0.020786341279745102, "step": 759, "step_time": 17.759921273041982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.15369828045368195, "epoch": 0.76, "frac_reward_zero_std": 0.0, "grad_norm": 1.7395877838134766, "kl": 0.013168664649128914, "learning_rate": 7.236658947946886e-07, "loss": 0.1011, "num_tokens": 2128117.0, "reward": 0.19499999284744263, "reward_std": 0.5395367741584778, "rewards/reward_func/mean": 0.19499999284744263, "rewards/reward_func/std": 0.5395368337631226, "sampling/importance_sampling_ratio/max": 2.3293864727020264, "sampling/importance_sampling_ratio/mean": 1.6012465953826904, "sampling/importance_sampling_ratio/min": 1.1115145683288574, "sampling/sampling_logp_difference/max": 0.6413042545318604, "sampling/sampling_logp_difference/mean": 0.014813265763223171, "step": 760, "step_time": 30.91305888001807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.14135727286338806, "epoch": 0.761, "frac_reward_zero_std": 0.0, "grad_norm": 1.1094361543655396, "kl": 0.02502884902060032, "learning_rate": 7.179777386329276e-07, "loss": -0.1925, "num_tokens": 2131110.0, "reward": 0.6924999952316284, "reward_std": 0.5886354446411133, "rewards/reward_func/mean": 0.6924999952316284, "rewards/reward_func/std": 0.5886354446411133, "sampling/importance_sampling_ratio/max": 1.060774564743042, "sampling/importance_sampling_ratio/mean": 0.9196078777313232, "sampling/importance_sampling_ratio/min": 0.5495688319206238, "sampling/sampling_logp_difference/max": 0.5127713680267334, "sampling/sampling_logp_difference/mean": 0.01406371034681797, "step": 761, "step_time": 25.955361855973024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.2058166265487671, "epoch": 0.762, "frac_reward_zero_std": 0.0, "grad_norm": 1.1769338846206665, "kl": 0.016208594664931297, "learning_rate": 7.123082750414487e-07, "loss": -0.1432, "num_tokens": 2133455.0, "reward": 0.48250001668930054, "reward_std": 0.5917980670928955, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5917980670928955, "sampling/importance_sampling_ratio/max": 1.6428663730621338, "sampling/importance_sampling_ratio/mean": 1.0470232963562012, "sampling/importance_sampling_ratio/min": 0.6223844885826111, "sampling/sampling_logp_difference/max": 0.31319427490234375, "sampling/sampling_logp_difference/mean": 0.023071369156241417, "step": 762, "step_time": 17.886447070981376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.19531789422035217, "epoch": 0.763, "frac_reward_zero_std": 0.0, "grad_norm": 1.0730407238006592, "kl": 0.16480080783367157, "learning_rate": 7.066575634902437e-07, "loss": -0.0055, "num_tokens": 2135953.0, "reward": 0.9925000071525574, "reward_std": 0.00957426242530346, "rewards/reward_func/mean": 0.9925000071525574, "rewards/reward_func/std": 0.00957426242530346, "sampling/importance_sampling_ratio/max": 0.962732195854187, "sampling/importance_sampling_ratio/mean": 0.8153795599937439, "sampling/importance_sampling_ratio/min": 0.546139657497406, "sampling/sampling_logp_difference/max": 0.35427331924438477, "sampling/sampling_logp_difference/mean": 0.01739596202969551, "step": 763, "step_time": 17.28462676802883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 61.5, "completions/mean_terminated_length": 61.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.20633232593536377, "epoch": 0.764, "frac_reward_zero_std": 0.0, "grad_norm": 1.8220878839492798, "kl": 0.009252580814063549, "learning_rate": 7.010256632526036e-07, "loss": -0.0916, "num_tokens": 2138996.0, "reward": 0.45749998092651367, "reward_std": 0.6275016665458679, "rewards/reward_func/mean": 0.45749998092651367, "rewards/reward_func/std": 0.6275016069412231, "sampling/importance_sampling_ratio/max": 2.0471394062042236, "sampling/importance_sampling_ratio/mean": 1.3246567249298096, "sampling/importance_sampling_ratio/min": 0.9938044548034668, "sampling/sampling_logp_difference/max": 0.29726123809814453, "sampling/sampling_logp_difference/mean": 0.014701912179589272, "step": 764, "step_time": 21.432103952975012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.14441938698291779, "epoch": 0.765, "frac_reward_zero_std": 0.0, "grad_norm": 2.108551502227783, "kl": 0.01378727052360773, "learning_rate": 6.95412633404495e-07, "loss": -0.1665, "num_tokens": 2142265.0, "reward": 0.24500000476837158, "reward_std": 0.5033554434776306, "rewards/reward_func/mean": 0.24500000476837158, "rewards/reward_func/std": 0.5033553838729858, "sampling/importance_sampling_ratio/max": 2.285527467727661, "sampling/importance_sampling_ratio/mean": 1.5754728317260742, "sampling/importance_sampling_ratio/min": 1.3068912029266357, "sampling/sampling_logp_difference/max": 0.47871720790863037, "sampling/sampling_logp_difference/mean": 0.016440121456980705, "step": 765, "step_time": 38.87439507001545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.17754623293876648, "epoch": 0.766, "frac_reward_zero_std": 0.0, "grad_norm": 1.2020010948181152, "kl": 0.01589268445968628, "learning_rate": 6.898185328239468e-07, "loss": 0.3525, "num_tokens": 2144810.0, "reward": 0.46000000834465027, "reward_std": 0.6235917806625366, "rewards/reward_func/mean": 0.46000000834465027, "rewards/reward_func/std": 0.6235917210578918, "sampling/importance_sampling_ratio/max": 1.8180670738220215, "sampling/importance_sampling_ratio/mean": 1.20233154296875, "sampling/importance_sampling_ratio/min": 0.663787841796875, "sampling/sampling_logp_difference/max": 0.41589564085006714, "sampling/sampling_logp_difference/mean": 0.01719684526324272, "step": 766, "step_time": 19.871508201991674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.16139961779117584, "epoch": 0.767, "frac_reward_zero_std": 0.0, "grad_norm": 0.8430211544036865, "kl": 0.009419786743819714, "learning_rate": 6.842434201904255e-07, "loss": -0.0295, "num_tokens": 2147820.0, "reward": 0.7250000238418579, "reward_std": 0.523672878742218, "rewards/reward_func/mean": 0.7250000238418579, "rewards/reward_func/std": 0.5236729383468628, "sampling/importance_sampling_ratio/max": 0.9945605397224426, "sampling/importance_sampling_ratio/mean": 0.7161622047424316, "sampling/importance_sampling_ratio/min": 0.2731669843196869, "sampling/sampling_logp_difference/max": 0.6408298015594482, "sampling/sampling_logp_difference/mean": 0.018986236304044724, "step": 767, "step_time": 27.392030903953128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.19481495022773743, "epoch": 0.768, "frac_reward_zero_std": 0.0, "grad_norm": 1.2760918140411377, "kl": 0.012536468915641308, "learning_rate": 6.78687353984226e-07, "loss": 0.2405, "num_tokens": 2150437.0, "reward": 0.7425000071525574, "reward_std": 0.5083552002906799, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.5083552002906799, "sampling/importance_sampling_ratio/max": 1.5039702653884888, "sampling/importance_sampling_ratio/mean": 0.914948582649231, "sampling/importance_sampling_ratio/min": 0.516736626625061, "sampling/sampling_logp_difference/max": 0.6249194145202637, "sampling/sampling_logp_difference/mean": 0.024361975491046906, "step": 768, "step_time": 19.28579560201615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.17333655059337616, "epoch": 0.769, "frac_reward_zero_std": 0.0, "grad_norm": 0.9733495712280273, "kl": 0.017303530126810074, "learning_rate": 6.731503924858518e-07, "loss": -0.2386, "num_tokens": 2153314.0, "reward": 0.42250001430511475, "reward_std": 0.6704414486885071, "rewards/reward_func/mean": 0.42250001430511475, "rewards/reward_func/std": 0.6704413890838623, "sampling/importance_sampling_ratio/max": 1.4091726541519165, "sampling/importance_sampling_ratio/mean": 0.8986155986785889, "sampling/importance_sampling_ratio/min": 0.5775488018989563, "sampling/sampling_logp_difference/max": 0.3481426239013672, "sampling/sampling_logp_difference/mean": 0.01685832254588604, "step": 769, "step_time": 28.981062715989538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.15905138850212097, "epoch": 0.77, "frac_reward_zero_std": 0.0, "grad_norm": 0.7885928153991699, "kl": 0.009373069740831852, "learning_rate": 6.676325937754102e-07, "loss": -0.1892, "num_tokens": 2156199.0, "reward": 0.7100000381469727, "reward_std": 0.5799999833106995, "rewards/reward_func/mean": 0.7100000381469727, "rewards/reward_func/std": 0.5799999833106995, "sampling/importance_sampling_ratio/max": 1.440505027770996, "sampling/importance_sampling_ratio/mean": 0.9580501914024353, "sampling/importance_sampling_ratio/min": 0.5778598785400391, "sampling/sampling_logp_difference/max": 0.44685041904449463, "sampling/sampling_logp_difference/mean": 0.016981616616249084, "step": 770, "step_time": 19.203623606998008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.17572003602981567, "epoch": 0.771, "frac_reward_zero_std": 0.0, "grad_norm": 1.2568960189819336, "kl": 0.01397181861102581, "learning_rate": 6.621340157319998e-07, "loss": 0.0737, "num_tokens": 2159544.0, "reward": 0.4975000023841858, "reward_std": 0.5802513957023621, "rewards/reward_func/mean": 0.4975000023841858, "rewards/reward_func/std": 0.5802513957023621, "sampling/importance_sampling_ratio/max": 1.3186613321304321, "sampling/importance_sampling_ratio/mean": 0.923529863357544, "sampling/importance_sampling_ratio/min": 0.4395921230316162, "sampling/sampling_logp_difference/max": 0.37542176246643066, "sampling/sampling_logp_difference/mean": 0.0232163667678833, "step": 771, "step_time": 29.682804643001873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.17877408862113953, "epoch": 0.772, "frac_reward_zero_std": 0.0, "grad_norm": 0.7722077369689941, "kl": 0.010325605049729347, "learning_rate": 6.566547160331e-07, "loss": 0.1852, "num_tokens": 2162267.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 1.1176972389221191, "sampling/importance_sampling_ratio/mean": 0.6664525866508484, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.38793087005615234, "sampling/sampling_logp_difference/mean": 0.0231131911277771, "step": 772, "step_time": 18.377510873950087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.174563929438591, "epoch": 0.773, "frac_reward_zero_std": 0.0, "grad_norm": 1.1849660873413086, "kl": 0.01368184108287096, "learning_rate": 6.511947521539738e-07, "loss": -0.1586, "num_tokens": 2164871.0, "reward": 0.7400000095367432, "reward_std": 0.5066885948181152, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5066885948181152, "sampling/importance_sampling_ratio/max": 2.161853313446045, "sampling/importance_sampling_ratio/mean": 1.455291986465454, "sampling/importance_sampling_ratio/min": 0.7266326546669006, "sampling/sampling_logp_difference/max": 0.5246192216873169, "sampling/sampling_logp_difference/mean": 0.021538708359003067, "step": 773, "step_time": 16.12982746202033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.12235935032367706, "epoch": 0.774, "frac_reward_zero_std": 0.0, "grad_norm": 0.7529805302619934, "kl": 0.006180343683809042, "learning_rate": 6.457541813670565e-07, "loss": -0.1506, "num_tokens": 2167783.0, "reward": 0.9925000071525574, "reward_std": 0.014999985694885254, "rewards/reward_func/mean": 0.9925000071525574, "rewards/reward_func/std": 0.014999986626207829, "sampling/importance_sampling_ratio/max": 1.426221251487732, "sampling/importance_sampling_ratio/mean": 0.9917997121810913, "sampling/importance_sampling_ratio/min": 0.716769814491272, "sampling/sampling_logp_difference/max": 0.3387885093688965, "sampling/sampling_logp_difference/mean": 0.010162458755075932, "step": 774, "step_time": 21.981031369999982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.2151685506105423, "epoch": 0.775, "frac_reward_zero_std": 0.0, "grad_norm": 1.2983440160751343, "kl": 0.01152323093265295, "learning_rate": 6.403330607413643e-07, "loss": -0.1665, "num_tokens": 2170625.0, "reward": 0.7024999856948853, "reward_std": 0.5298663973808289, "rewards/reward_func/mean": 0.7024999856948853, "rewards/reward_func/std": 0.5298663973808289, "sampling/importance_sampling_ratio/max": 1.7368923425674438, "sampling/importance_sampling_ratio/mean": 1.0499792098999023, "sampling/importance_sampling_ratio/min": 0.6125698089599609, "sampling/sampling_logp_difference/max": 0.2811737060546875, "sampling/sampling_logp_difference/mean": 0.018785523250699043, "step": 775, "step_time": 22.23468585399678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.19031566381454468, "epoch": 0.776, "frac_reward_zero_std": 0.0, "grad_norm": 1.291479229927063, "kl": 0.007776190992444754, "learning_rate": 6.349314471418849e-07, "loss": 0.1841, "num_tokens": 2173204.0, "reward": 0.4399999976158142, "reward_std": 0.6424432396888733, "rewards/reward_func/mean": 0.4399999976158142, "rewards/reward_func/std": 0.6424432396888733, "sampling/importance_sampling_ratio/max": 1.743950605392456, "sampling/importance_sampling_ratio/mean": 1.178384780883789, "sampling/importance_sampling_ratio/min": 0.5092048048973083, "sampling/sampling_logp_difference/max": 0.4105731248855591, "sampling/sampling_logp_difference/mean": 0.018567068502306938, "step": 776, "step_time": 24.8699814369902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.15377497673034668, "epoch": 0.777, "frac_reward_zero_std": 0.0, "grad_norm": 1.964449167251587, "kl": 0.0083929393440485, "learning_rate": 6.295493972289904e-07, "loss": 0.1439, "num_tokens": 2175386.0, "reward": 0.9674999713897705, "reward_std": 0.052519846707582474, "rewards/reward_func/mean": 0.9674999713897705, "rewards/reward_func/std": 0.052519846707582474, "sampling/importance_sampling_ratio/max": 1.8079571723937988, "sampling/importance_sampling_ratio/mean": 1.3296911716461182, "sampling/importance_sampling_ratio/min": 0.7503855228424072, "sampling/sampling_logp_difference/max": 0.4740368127822876, "sampling/sampling_logp_difference/mean": 0.016167402267456055, "step": 777, "step_time": 12.331017715972848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.2164464145898819, "epoch": 0.778, "frac_reward_zero_std": 0.0, "grad_norm": 1.1394131183624268, "kl": 0.030036717653274536, "learning_rate": 6.241869674578363e-07, "loss": 0.3418, "num_tokens": 2177976.0, "reward": 0.4625000059604645, "reward_std": 0.6175422072410583, "rewards/reward_func/mean": 0.4625000059604645, "rewards/reward_func/std": 0.6175422072410583, "sampling/importance_sampling_ratio/max": 1.1647049188613892, "sampling/importance_sampling_ratio/mean": 0.7272229790687561, "sampling/importance_sampling_ratio/min": 0.2476707249879837, "sampling/sampling_logp_difference/max": 1.0166096687316895, "sampling/sampling_logp_difference/mean": 0.026258310303092003, "step": 778, "step_time": 30.01188500097487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 61.75, "completions/mean_terminated_length": 61.75, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.19995911419391632, "epoch": 0.779, "frac_reward_zero_std": 0.0, "grad_norm": 1.3844139575958252, "kl": 0.008600587025284767, "learning_rate": 6.188442140777742e-07, "loss": -0.0912, "num_tokens": 2180344.0, "reward": 0.7100000381469727, "reward_std": 0.5799999833106995, "rewards/reward_func/mean": 0.7100000381469727, "rewards/reward_func/std": 0.5799999833106995, "sampling/importance_sampling_ratio/max": 2.4914073944091797, "sampling/importance_sampling_ratio/mean": 1.34405517578125, "sampling/importance_sampling_ratio/min": 0.6680753231048584, "sampling/sampling_logp_difference/max": 0.6787099838256836, "sampling/sampling_logp_difference/mean": 0.021124547347426414, "step": 779, "step_time": 13.96109197800979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.16854476928710938, "epoch": 0.78, "frac_reward_zero_std": 0.0, "grad_norm": 1.2078826427459717, "kl": 0.010544574819505215, "learning_rate": 6.135211931317594e-07, "loss": -0.1752, "num_tokens": 2183117.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 1.861361026763916, "sampling/importance_sampling_ratio/mean": 1.1615455150604248, "sampling/importance_sampling_ratio/min": 0.6297479271888733, "sampling/sampling_logp_difference/max": 0.5306280851364136, "sampling/sampling_logp_difference/mean": 0.020385798066854477, "step": 780, "step_time": 18.09878264099825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1957128942012787, "epoch": 0.781, "frac_reward_zero_std": 0.0, "grad_norm": 0.8542881608009338, "kl": 0.017616048455238342, "learning_rate": 6.082179604557617e-07, "loss": 0.046, "num_tokens": 2185507.0, "reward": 0.4599999785423279, "reward_std": 0.6178996562957764, "rewards/reward_func/mean": 0.4599999785423279, "rewards/reward_func/std": 0.6178997159004211, "sampling/importance_sampling_ratio/max": 1.1819839477539062, "sampling/importance_sampling_ratio/mean": 0.8506796360015869, "sampling/importance_sampling_ratio/min": 0.5027575492858887, "sampling/sampling_logp_difference/max": 0.5076675415039062, "sampling/sampling_logp_difference/mean": 0.02212216705083847, "step": 781, "step_time": 20.57440196996322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1546093374490738, "epoch": 0.782, "frac_reward_zero_std": 0.0, "grad_norm": 1.1687120199203491, "kl": 0.012353716418147087, "learning_rate": 6.029345716781837e-07, "loss": -0.0429, "num_tokens": 2188597.0, "reward": 0.48000001907348633, "reward_std": 0.5948108434677124, "rewards/reward_func/mean": 0.48000001907348633, "rewards/reward_func/std": 0.5948109030723572, "sampling/importance_sampling_ratio/max": 1.1749125719070435, "sampling/importance_sampling_ratio/mean": 0.9629591703414917, "sampling/importance_sampling_ratio/min": 0.8087829351425171, "sampling/sampling_logp_difference/max": 0.46068644523620605, "sampling/sampling_logp_difference/mean": 0.02025839127600193, "step": 782, "step_time": 30.160392530960962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 33.0, "completions/min_terminated_length": 33.0, "entropy": 0.1940281093120575, "epoch": 0.783, "frac_reward_zero_std": 0.0, "grad_norm": 1.5344480276107788, "kl": 0.017377682030200958, "learning_rate": 5.976710822192722e-07, "loss": 0.1011, "num_tokens": 2190858.0, "reward": 0.4624999761581421, "reward_std": 0.5921359658241272, "rewards/reward_func/mean": 0.4624999761581421, "rewards/reward_func/std": 0.592136025428772, "sampling/importance_sampling_ratio/max": 1.7589510679244995, "sampling/importance_sampling_ratio/mean": 1.0019781589508057, "sampling/importance_sampling_ratio/min": 0.5731030106544495, "sampling/sampling_logp_difference/max": 0.6189779043197632, "sampling/sampling_logp_difference/mean": 0.026062628254294395, "step": 783, "step_time": 16.444462491956074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.19138528406620026, "epoch": 0.784, "frac_reward_zero_std": 0.0, "grad_norm": 1.8585448265075684, "kl": 0.01925732009112835, "learning_rate": 5.924275472905425e-07, "loss": 0.2269, "num_tokens": 2193873.0, "reward": 0.7200000286102295, "reward_std": 0.5533534288406372, "rewards/reward_func/mean": 0.7200000286102295, "rewards/reward_func/std": 0.5533534288406372, "sampling/importance_sampling_ratio/max": 1.4855865240097046, "sampling/importance_sampling_ratio/mean": 0.9914582371711731, "sampling/importance_sampling_ratio/min": 0.532261312007904, "sampling/sampling_logp_difference/max": 0.351259708404541, "sampling/sampling_logp_difference/mean": 0.024614620953798294, "step": 784, "step_time": 20.17299226502655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 47.5, "completions/mean_terminated_length": 47.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.12624485790729523, "epoch": 0.785, "frac_reward_zero_std": 0.0, "grad_norm": 0.869793176651001, "kl": 0.019814154133200645, "learning_rate": 5.872040218941929e-07, "loss": 0.099, "num_tokens": 2196942.0, "reward": 0.7325000166893005, "reward_std": 0.5151941180229187, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5151941776275635, "sampling/importance_sampling_ratio/max": 1.3906809091567993, "sampling/importance_sampling_ratio/mean": 1.0369181632995605, "sampling/importance_sampling_ratio/min": 0.6651946902275085, "sampling/sampling_logp_difference/max": 0.6735460758209229, "sampling/sampling_logp_difference/mean": 0.017296234145760536, "step": 785, "step_time": 26.285939307010267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.21539698541164398, "epoch": 0.786, "frac_reward_zero_std": 0.0, "grad_norm": 0.9977689981460571, "kl": 0.018831906840205193, "learning_rate": 5.820005608225345e-07, "loss": 0.1015, "num_tokens": 2200104.0, "reward": 0.19999998807907104, "reward_std": 0.5016639232635498, "rewards/reward_func/mean": 0.19999998807907104, "rewards/reward_func/std": 0.5016639232635498, "sampling/importance_sampling_ratio/max": 1.024660348892212, "sampling/importance_sampling_ratio/mean": 0.8590462803840637, "sampling/importance_sampling_ratio/min": 0.7173623442649841, "sampling/sampling_logp_difference/max": 0.47890007495880127, "sampling/sampling_logp_difference/mean": 0.026286235079169273, "step": 786, "step_time": 32.92805120703997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.19545860588550568, "epoch": 0.787, "frac_reward_zero_std": 0.0, "grad_norm": 0.8692592978477478, "kl": 0.009537375532090664, "learning_rate": 5.768172186574123e-07, "loss": -0.1926, "num_tokens": 2202648.0, "reward": 0.48500001430511475, "reward_std": 0.5889255404472351, "rewards/reward_func/mean": 0.48500001430511475, "rewards/reward_func/std": 0.5889255404472351, "sampling/importance_sampling_ratio/max": 1.1382092237472534, "sampling/importance_sampling_ratio/mean": 0.9175537824630737, "sampling/importance_sampling_ratio/min": 0.6625953912734985, "sampling/sampling_logp_difference/max": 0.3489952087402344, "sampling/sampling_logp_difference/mean": 0.015568016096949577, "step": 787, "step_time": 23.46705276099965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.18042880296707153, "epoch": 0.788, "frac_reward_zero_std": 0.0, "grad_norm": 2.0212419033050537, "kl": 0.007243243977427483, "learning_rate": 5.716540497696307e-07, "loss": 0.3025, "num_tokens": 2205688.0, "reward": 0.4625000059604645, "reward_std": 0.6157583594322205, "rewards/reward_func/mean": 0.4625000059604645, "rewards/reward_func/std": 0.6157583594322205, "sampling/importance_sampling_ratio/max": 2.4834413528442383, "sampling/importance_sampling_ratio/mean": 1.4974021911621094, "sampling/importance_sampling_ratio/min": 1.0850346088409424, "sampling/sampling_logp_difference/max": 0.35714179277420044, "sampling/sampling_logp_difference/mean": 0.018049370497465134, "step": 788, "step_time": 36.234356241999194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.1859857589006424, "epoch": 0.789, "frac_reward_zero_std": 0.0, "grad_norm": 1.87863028049469, "kl": 0.018821505829691887, "learning_rate": 5.665111083183905e-07, "loss": 0.6013, "num_tokens": 2208356.0, "reward": -0.09000000357627869, "reward_std": 0.07527726888656616, "rewards/reward_func/mean": -0.09000000357627869, "rewards/reward_func/std": 0.07527726888656616, "sampling/importance_sampling_ratio/max": 2.7212724685668945, "sampling/importance_sampling_ratio/mean": 1.1685781478881836, "sampling/importance_sampling_ratio/min": 0.45494288206100464, "sampling/sampling_logp_difference/max": 0.43838202953338623, "sampling/sampling_logp_difference/mean": 0.021123006939888, "step": 789, "step_time": 33.50007220898988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.15837079286575317, "epoch": 0.79, "frac_reward_zero_std": 0.0, "grad_norm": 0.74587482213974, "kl": 0.02354729175567627, "learning_rate": 5.613884482507123e-07, "loss": -0.0185, "num_tokens": 2210650.0, "reward": 0.9925000071525574, "reward_std": 0.014999985694885254, "rewards/reward_func/mean": 0.9925000071525574, "rewards/reward_func/std": 0.014999986626207829, "sampling/importance_sampling_ratio/max": 1.0600758790969849, "sampling/importance_sampling_ratio/mean": 0.8555861711502075, "sampling/importance_sampling_ratio/min": 0.7122514843940735, "sampling/sampling_logp_difference/max": 0.43571996688842773, "sampling/sampling_logp_difference/mean": 0.01710636168718338, "step": 790, "step_time": 8.579650515981484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1980804204940796, "epoch": 0.791, "frac_reward_zero_std": 0.0, "grad_norm": 1.0128065347671509, "kl": 0.10143162310123444, "learning_rate": 5.562861233008774e-07, "loss": -0.1272, "num_tokens": 2213224.0, "reward": 0.9925000071525574, "reward_std": 0.00957426242530346, "rewards/reward_func/mean": 0.9925000071525574, "rewards/reward_func/std": 0.00957426242530346, "sampling/importance_sampling_ratio/max": 1.244162917137146, "sampling/importance_sampling_ratio/mean": 0.8024770021438599, "sampling/importance_sampling_ratio/min": 0.40055370330810547, "sampling/sampling_logp_difference/max": 0.7108635902404785, "sampling/sampling_logp_difference/mean": 0.021526839584112167, "step": 791, "step_time": 21.235105300031137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.14805449545383453, "epoch": 0.792, "frac_reward_zero_std": 0.0, "grad_norm": 0.7646735310554504, "kl": 0.013776770792901516, "learning_rate": 5.512041869898585e-07, "loss": 0.0187, "num_tokens": 2215922.0, "reward": 0.22500000894069672, "reward_std": 0.5177193880081177, "rewards/reward_func/mean": 0.22500000894069672, "rewards/reward_func/std": 0.5177193880081177, "sampling/importance_sampling_ratio/max": 1.3631681203842163, "sampling/importance_sampling_ratio/mean": 0.8914490938186646, "sampling/importance_sampling_ratio/min": 0.5417377352714539, "sampling/sampling_logp_difference/max": 0.47491908073425293, "sampling/sampling_logp_difference/mean": 0.018335452303290367, "step": 792, "step_time": 22.510953449003864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.17312388122081757, "epoch": 0.793, "frac_reward_zero_std": 0.0, "grad_norm": 2.5769407749176025, "kl": 0.018955403938889503, "learning_rate": 5.46142692624764e-07, "loss": 0.3057, "num_tokens": 2218952.0, "reward": 0.7300000190734863, "reward_std": 0.5333541631698608, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5333541631698608, "sampling/importance_sampling_ratio/max": 2.190645694732666, "sampling/importance_sampling_ratio/mean": 1.281541347503662, "sampling/importance_sampling_ratio/min": 0.5733336210250854, "sampling/sampling_logp_difference/max": 0.5450418591499329, "sampling/sampling_logp_difference/mean": 0.02337059937417507, "step": 793, "step_time": 16.344598419964314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "entropy": 0.13810628652572632, "epoch": 0.794, "frac_reward_zero_std": 0.0, "grad_norm": 0.8026781678199768, "kl": 0.009717956185340881, "learning_rate": 5.411016932982751e-07, "loss": -0.1976, "num_tokens": 2221941.0, "reward": 0.9975000023841858, "reward_std": 0.004999995231628418, "rewards/reward_func/mean": 0.9975000023841858, "rewards/reward_func/std": 0.004999995231628418, "sampling/importance_sampling_ratio/max": 0.7852121591567993, "sampling/importance_sampling_ratio/mean": 0.5874035358428955, "sampling/importance_sampling_ratio/min": 0.33051344752311707, "sampling/sampling_logp_difference/max": 0.44902557134628296, "sampling/sampling_logp_difference/mean": 0.01808769442141056, "step": 794, "step_time": 22.935984315990936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.19587481021881104, "epoch": 0.795, "frac_reward_zero_std": 0.0, "grad_norm": 1.1193466186523438, "kl": 0.007409246638417244, "learning_rate": 5.360812418880884e-07, "loss": 0.0313, "num_tokens": 2224276.0, "reward": 0.7174999713897705, "reward_std": 0.5649999976158142, "rewards/reward_func/mean": 0.7174999713897705, "rewards/reward_func/std": 0.5649999976158142, "sampling/importance_sampling_ratio/max": 1.4078540802001953, "sampling/importance_sampling_ratio/mean": 1.2140588760375977, "sampling/importance_sampling_ratio/min": 1.0870336294174194, "sampling/sampling_logp_difference/max": 0.2789750099182129, "sampling/sampling_logp_difference/mean": 0.013154975138604641, "step": 795, "step_time": 14.34664799098391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.20118220150470734, "epoch": 0.796, "frac_reward_zero_std": 0.0, "grad_norm": 2.013411521911621, "kl": 0.014776881784200668, "learning_rate": 5.310813910563645e-07, "loss": 0.2549, "num_tokens": 2226676.0, "reward": 0.7325000166893005, "reward_std": 0.5349999666213989, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5349999666213989, "sampling/importance_sampling_ratio/max": 1.8802326917648315, "sampling/importance_sampling_ratio/mean": 1.421309471130371, "sampling/importance_sampling_ratio/min": 0.8183515667915344, "sampling/sampling_logp_difference/max": 0.32749223709106445, "sampling/sampling_logp_difference/mean": 0.026124635711312294, "step": 796, "step_time": 13.164709123026114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.171716570854187, "epoch": 0.797, "frac_reward_zero_std": 0.0, "grad_norm": 0.6674391627311707, "kl": 0.010285746306180954, "learning_rate": 5.261021932491714e-07, "loss": -0.0442, "num_tokens": 2229782.0, "reward": 0.7400000095367432, "reward_std": 0.5001999735832214, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5001999735832214, "sampling/importance_sampling_ratio/max": 1.23018479347229, "sampling/importance_sampling_ratio/mean": 0.7658807039260864, "sampling/importance_sampling_ratio/min": 0.563362717628479, "sampling/sampling_logp_difference/max": 0.5829996466636658, "sampling/sampling_logp_difference/mean": 0.016382500529289246, "step": 797, "step_time": 27.418524406966753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.17473354935646057, "epoch": 0.798, "frac_reward_zero_std": 0.0, "grad_norm": 1.2395750284194946, "kl": 0.009930586442351341, "learning_rate": 5.211437006959396e-07, "loss": 0.2572, "num_tokens": 2232720.0, "reward": 0.4749999940395355, "reward_std": 0.6062177419662476, "rewards/reward_func/mean": 0.4749999940395355, "rewards/reward_func/std": 0.6062177419662476, "sampling/importance_sampling_ratio/max": 1.8504725694656372, "sampling/importance_sampling_ratio/mean": 1.0980055332183838, "sampling/importance_sampling_ratio/min": 0.6141704320907593, "sampling/sampling_logp_difference/max": 0.6101547479629517, "sampling/sampling_logp_difference/mean": 0.022570829838514328, "step": 798, "step_time": 27.857180500985123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 60.0, "completions/mean_terminated_length": 60.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.20046040415763855, "epoch": 0.799, "frac_reward_zero_std": 0.0, "grad_norm": 1.419909119606018, "kl": 0.009930077940225601, "learning_rate": 5.162059654089083e-07, "loss": -0.0302, "num_tokens": 2235532.0, "reward": 0.21250000596046448, "reward_std": 0.5267114639282227, "rewards/reward_func/mean": 0.21250000596046448, "rewards/reward_func/std": 0.5267115235328674, "sampling/importance_sampling_ratio/max": 1.5732817649841309, "sampling/importance_sampling_ratio/mean": 1.0418779850006104, "sampling/importance_sampling_ratio/min": 0.6842311024665833, "sampling/sampling_logp_difference/max": 0.43329477310180664, "sampling/sampling_logp_difference/mean": 0.017911871895194054, "step": 799, "step_time": 29.79952314402908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.17958156764507294, "epoch": 0.8, "frac_reward_zero_std": 0.0, "grad_norm": 0.9571763277053833, "kl": 0.011360651813447475, "learning_rate": 5.112890391825844e-07, "loss": -0.0715, "num_tokens": 2238683.0, "reward": 0.45500001311302185, "reward_std": 0.6319019198417664, "rewards/reward_func/mean": 0.45500001311302185, "rewards/reward_func/std": 0.6319018602371216, "sampling/importance_sampling_ratio/max": 1.2257215976715088, "sampling/importance_sampling_ratio/mean": 0.8390830755233765, "sampling/importance_sampling_ratio/min": 0.47878724336624146, "sampling/sampling_logp_difference/max": 0.4325673580169678, "sampling/sampling_logp_difference/mean": 0.019181597977876663, "step": 800, "step_time": 39.032570591021795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.16971011459827423, "epoch": 0.801, "frac_reward_zero_std": 0.0, "grad_norm": 1.5135771036148071, "kl": 0.011661847122013569, "learning_rate": 5.063929735931985e-07, "loss": -0.3873, "num_tokens": 2241589.0, "reward": 0.4675000011920929, "reward_std": 0.5531952381134033, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.5531952977180481, "sampling/importance_sampling_ratio/max": 1.4882134199142456, "sampling/importance_sampling_ratio/mean": 0.9066961407661438, "sampling/importance_sampling_ratio/min": 0.569215714931488, "sampling/sampling_logp_difference/max": 0.4510658383369446, "sampling/sampling_logp_difference/mean": 0.013913432136178017, "step": 801, "step_time": 25.026140147994738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.17882198095321655, "epoch": 0.802, "frac_reward_zero_std": 0.0, "grad_norm": 0.9199877381324768, "kl": 0.04159931838512421, "learning_rate": 5.015178199981602e-07, "loss": -0.2877, "num_tokens": 2244652.0, "reward": 0.7450000047683716, "reward_std": 0.4967561364173889, "rewards/reward_func/mean": 0.7450000047683716, "rewards/reward_func/std": 0.4967561364173889, "sampling/importance_sampling_ratio/max": 1.2649548053741455, "sampling/importance_sampling_ratio/mean": 0.7683274149894714, "sampling/importance_sampling_ratio/min": 0.3505207300186157, "sampling/sampling_logp_difference/max": 0.5531506538391113, "sampling/sampling_logp_difference/mean": 0.027807625010609627, "step": 802, "step_time": 28.3410996790044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.18011437356472015, "epoch": 0.803, "frac_reward_zero_std": 1.0, "grad_norm": 0.007815965451300144, "kl": 0.01194655243307352, "learning_rate": 4.966636295355254e-07, "loss": 0.0001, "num_tokens": 2247732.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.790849208831787, "sampling/importance_sampling_ratio/mean": 0.8822266459465027, "sampling/importance_sampling_ratio/min": 0.32599690556526184, "sampling/sampling_logp_difference/max": 0.4784144163131714, "sampling/sampling_logp_difference/mean": 0.02495655044913292, "step": 803, "step_time": 9.802588924998417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 59.5, "completions/mean_terminated_length": 59.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.2054775357246399, "epoch": 0.804, "frac_reward_zero_std": 0.0, "grad_norm": 1.6376335620880127, "kl": 0.0108869643881917, "learning_rate": 4.918304531234533e-07, "loss": 0.1983, "num_tokens": 2250589.0, "reward": 0.9975000023841858, "reward_std": 0.004999995231628418, "rewards/reward_func/mean": 0.9975000023841858, "rewards/reward_func/std": 0.004999995231628418, "sampling/importance_sampling_ratio/max": 1.4075301885604858, "sampling/importance_sampling_ratio/mean": 1.0960708856582642, "sampling/importance_sampling_ratio/min": 0.6361958980560303, "sampling/sampling_logp_difference/max": 0.37690234184265137, "sampling/sampling_logp_difference/mean": 0.017018061131238937, "step": 804, "step_time": 16.887071790988557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 62.75, "completions/mean_terminated_length": 62.75, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "entropy": 0.17534485459327698, "epoch": 0.805, "frac_reward_zero_std": 0.0, "grad_norm": 1.3343805074691772, "kl": 0.011893111281096935, "learning_rate": 4.870183414596794e-07, "loss": 0.0321, "num_tokens": 2253276.0, "reward": 0.4699999988079071, "reward_std": 0.6128621101379395, "rewards/reward_func/mean": 0.4699999988079071, "rewards/reward_func/std": 0.6128621101379395, "sampling/importance_sampling_ratio/max": 1.6565197706222534, "sampling/importance_sampling_ratio/mean": 1.133117437362671, "sampling/importance_sampling_ratio/min": 0.6268569231033325, "sampling/sampling_logp_difference/max": 0.38912510871887207, "sampling/sampling_logp_difference/mean": 0.01705281063914299, "step": 805, "step_time": 28.746815636986867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.2010265290737152, "epoch": 0.806, "frac_reward_zero_std": 0.0, "grad_norm": 1.5721911191940308, "kl": 0.02580397017300129, "learning_rate": 4.822273450209767e-07, "loss": 0.0834, "num_tokens": 2256173.0, "reward": 0.4775000214576721, "reward_std": 0.5976272225379944, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.5976272821426392, "sampling/importance_sampling_ratio/max": 1.2717344760894775, "sampling/importance_sampling_ratio/mean": 0.845128059387207, "sampling/importance_sampling_ratio/min": 0.5765133500099182, "sampling/sampling_logp_difference/max": 0.516434907913208, "sampling/sampling_logp_difference/mean": 0.02786729857325554, "step": 806, "step_time": 20.563251657004002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 48.75, "completions/mean_terminated_length": 48.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.1876489520072937, "epoch": 0.807, "frac_reward_zero_std": 0.0, "grad_norm": 1.5900763273239136, "kl": 0.013270639814436436, "learning_rate": 4.774575140626317e-07, "loss": 0.0489, "num_tokens": 2258405.0, "reward": 0.44749999046325684, "reward_std": 0.6271297335624695, "rewards/reward_func/mean": 0.44749999046325684, "rewards/reward_func/std": 0.6271297335624695, "sampling/importance_sampling_ratio/max": 1.3228853940963745, "sampling/importance_sampling_ratio/mean": 0.9809135794639587, "sampling/importance_sampling_ratio/min": 0.6786561608314514, "sampling/sampling_logp_difference/max": 1.0350041389465332, "sampling/sampling_logp_difference/mean": 0.02136646769940853, "step": 807, "step_time": 17.394484421005473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.19536280632019043, "epoch": 0.808, "frac_reward_zero_std": 0.0, "grad_norm": 1.0698084831237793, "kl": 0.017988262698054314, "learning_rate": 4.727088986179129e-07, "loss": 0.2756, "num_tokens": 2261178.0, "reward": 0.7049999833106995, "reward_std": 0.5507267713546753, "rewards/reward_func/mean": 0.7049999833106995, "rewards/reward_func/std": 0.5507268309593201, "sampling/importance_sampling_ratio/max": 1.415069341659546, "sampling/importance_sampling_ratio/mean": 0.7814751863479614, "sampling/importance_sampling_ratio/min": 0.09279520064592361, "sampling/sampling_logp_difference/max": 0.5583863258361816, "sampling/sampling_logp_difference/mean": 0.021592801436781883, "step": 808, "step_time": 28.89226758404402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 59.75, "completions/mean_terminated_length": 59.75, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1817227303981781, "epoch": 0.809, "frac_reward_zero_std": 0.0, "grad_norm": 0.7316394448280334, "kl": 0.011850952170789242, "learning_rate": 4.679815484975506e-07, "loss": -0.1293, "num_tokens": 2263912.0, "reward": 0.1850000023841858, "reward_std": 0.5460463762283325, "rewards/reward_func/mean": 0.1850000023841858, "rewards/reward_func/std": 0.5460464358329773, "sampling/importance_sampling_ratio/max": 1.369758129119873, "sampling/importance_sampling_ratio/mean": 0.9832720756530762, "sampling/importance_sampling_ratio/min": 0.6943590641021729, "sampling/sampling_logp_difference/max": 0.4842700958251953, "sampling/sampling_logp_difference/mean": 0.01901688240468502, "step": 809, "step_time": 27.848608533968218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 46.5, "completions/mean_terminated_length": 46.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.21461977064609528, "epoch": 0.81, "frac_reward_zero_std": 0.0, "grad_norm": 0.9072965383529663, "kl": 0.008500700816512108, "learning_rate": 4.632755132892094e-07, "loss": -0.3181, "num_tokens": 2266063.0, "reward": 0.7400000095367432, "reward_std": 0.5199999809265137, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5200000405311584, "sampling/importance_sampling_ratio/max": 2.1195013523101807, "sampling/importance_sampling_ratio/mean": 1.1718158721923828, "sampling/importance_sampling_ratio/min": 0.4169178903102875, "sampling/sampling_logp_difference/max": 0.4102039337158203, "sampling/sampling_logp_difference/mean": 0.017576338723301888, "step": 810, "step_time": 17.026407942001242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 57.5, "completions/mean_terminated_length": 57.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.21305222809314728, "epoch": 0.811, "frac_reward_zero_std": 0.0, "grad_norm": 0.9910611510276794, "kl": 0.010315888561308384, "learning_rate": 4.5859084235697236e-07, "loss": 0.2588, "num_tokens": 2269345.0, "reward": 0.48000001907348633, "reward_std": 0.601331889629364, "rewards/reward_func/mean": 0.48000001907348633, "rewards/reward_func/std": 0.601331889629364, "sampling/importance_sampling_ratio/max": 1.662583827972412, "sampling/importance_sampling_ratio/mean": 0.9407171010971069, "sampling/importance_sampling_ratio/min": 0.46772709488868713, "sampling/sampling_logp_difference/max": 0.5322847366333008, "sampling/sampling_logp_difference/mean": 0.023096144199371338, "step": 811, "step_time": 34.6031671789824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.18370580673217773, "epoch": 0.812, "frac_reward_zero_std": 0.0, "grad_norm": 0.9986130595207214, "kl": 0.02252900041639805, "learning_rate": 4.539275848408217e-07, "loss": 0.3904, "num_tokens": 2272720.0, "reward": 0.4950000047683716, "reward_std": 0.5831809639930725, "rewards/reward_func/mean": 0.4950000047683716, "rewards/reward_func/std": 0.5831809043884277, "sampling/importance_sampling_ratio/max": 1.0480772256851196, "sampling/importance_sampling_ratio/mean": 0.5856756567955017, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9911562204360962, "sampling/sampling_logp_difference/mean": 0.0303180068731308, "step": 812, "step_time": 23.473890161025338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.14672961831092834, "epoch": 0.813, "frac_reward_zero_std": 0.0, "grad_norm": 0.8458666205406189, "kl": 0.010714175179600716, "learning_rate": 4.4928578965612034e-07, "loss": -0.1443, "num_tokens": 2275595.0, "reward": 0.7450000047683716, "reward_std": 0.5033554434776306, "rewards/reward_func/mean": 0.7450000047683716, "rewards/reward_func/std": 0.5033553838729858, "sampling/importance_sampling_ratio/max": 1.0408309698104858, "sampling/importance_sampling_ratio/mean": 0.7613344788551331, "sampling/importance_sampling_ratio/min": 0.5826834440231323, "sampling/sampling_logp_difference/max": 0.4842512607574463, "sampling/sampling_logp_difference/mean": 0.015375719405710697, "step": 813, "step_time": 18.475114682049025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 58.0, "completions/mean_terminated_length": 58.0, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.17193692922592163, "epoch": 0.814, "frac_reward_zero_std": 0.0, "grad_norm": 0.9988762736320496, "kl": 0.011139345355331898, "learning_rate": 4.446655054931051e-07, "loss": -0.0201, "num_tokens": 2278277.0, "reward": 0.20000000298023224, "reward_std": 0.5349143147468567, "rewards/reward_func/mean": 0.20000000298023224, "rewards/reward_func/std": 0.5349143147468567, "sampling/importance_sampling_ratio/max": 1.010710597038269, "sampling/importance_sampling_ratio/mean": 0.8403879404067993, "sampling/importance_sampling_ratio/min": 0.4797182083129883, "sampling/sampling_logp_difference/max": 0.3223251700401306, "sampling/sampling_logp_difference/mean": 0.01814989373087883, "step": 814, "step_time": 32.06515252200188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.17054447531700134, "epoch": 0.815, "frac_reward_zero_std": 0.0, "grad_norm": 0.8063357472419739, "kl": 0.006983228959143162, "learning_rate": 4.400667808163689e-07, "loss": 0.0523, "num_tokens": 2281080.0, "reward": 0.45249998569488525, "reward_std": 0.632844090461731, "rewards/reward_func/mean": 0.45249998569488525, "rewards/reward_func/std": 0.6328441500663757, "sampling/importance_sampling_ratio/max": 0.9947839379310608, "sampling/importance_sampling_ratio/mean": 0.7957141399383545, "sampling/importance_sampling_ratio/min": 0.5442267656326294, "sampling/sampling_logp_difference/max": 0.6143076419830322, "sampling/sampling_logp_difference/mean": 0.01837220974266529, "step": 815, "step_time": 21.390265527006704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.205951988697052, "epoch": 0.816, "frac_reward_zero_std": 0.0, "grad_norm": 1.4754880666732788, "kl": 0.009705515578389168, "learning_rate": 4.354896638643591e-07, "loss": -0.2266, "num_tokens": 2283802.0, "reward": 0.45750001072883606, "reward_std": 0.6280326247215271, "rewards/reward_func/mean": 0.45750001072883606, "rewards/reward_func/std": 0.6280326843261719, "sampling/importance_sampling_ratio/max": 1.7561450004577637, "sampling/importance_sampling_ratio/mean": 1.0742089748382568, "sampling/importance_sampling_ratio/min": 0.7658206820487976, "sampling/sampling_logp_difference/max": 0.4200308322906494, "sampling/sampling_logp_difference/mean": 0.022033661603927612, "step": 816, "step_time": 21.581138708046637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 58.75, "completions/mean_terminated_length": 58.75, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.15711306035518646, "epoch": 0.817, "frac_reward_zero_std": 0.0, "grad_norm": 0.9345096945762634, "kl": 0.011637304909527302, "learning_rate": 4.3093420264886525e-07, "loss": -0.0048, "num_tokens": 2286997.0, "reward": 0.23499999940395355, "reward_std": 0.5104572772979736, "rewards/reward_func/mean": 0.23499999940395355, "rewards/reward_func/std": 0.5104572772979736, "sampling/importance_sampling_ratio/max": 0.9966399073600769, "sampling/importance_sampling_ratio/mean": 0.8099616169929504, "sampling/importance_sampling_ratio/min": 0.4820592999458313, "sampling/sampling_logp_difference/max": 0.3097100257873535, "sampling/sampling_logp_difference/mean": 0.014364638365805149, "step": 817, "step_time": 35.420583015016746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.17414146661758423, "epoch": 0.818, "frac_reward_zero_std": 1.0, "grad_norm": 0.009861983358860016, "kl": 0.011552774347364902, "learning_rate": 4.264004449545206e-07, "loss": 0.0001, "num_tokens": 2289482.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0347466468811035, "sampling/importance_sampling_ratio/mean": 1.3609652519226074, "sampling/importance_sampling_ratio/min": 0.777285635471344, "sampling/sampling_logp_difference/max": 0.33480191230773926, "sampling/sampling_logp_difference/mean": 0.019030015915632248, "step": 818, "step_time": 14.128807380970102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.1754721999168396, "epoch": 0.819, "frac_reward_zero_std": 0.0, "grad_norm": 1.4437003135681152, "kl": 0.018222562968730927, "learning_rate": 4.2188843833829874e-07, "loss": 0.1297, "num_tokens": 2291754.0, "reward": 0.7274999618530273, "reward_std": 0.5450000166893005, "rewards/reward_func/mean": 0.7274999618530273, "rewards/reward_func/std": 0.5450000166893005, "sampling/importance_sampling_ratio/max": 1.1184042692184448, "sampling/importance_sampling_ratio/mean": 0.9072989225387573, "sampling/importance_sampling_ratio/min": 0.47613978385925293, "sampling/sampling_logp_difference/max": 0.32324957847595215, "sampling/sampling_logp_difference/mean": 0.016209548339247704, "step": 819, "step_time": 16.407472643011715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.12643775343894958, "epoch": 0.82, "frac_reward_zero_std": 0.0, "grad_norm": 2.1231870651245117, "kl": 0.017428437247872353, "learning_rate": 4.1739823012901223e-07, "loss": -0.2071, "num_tokens": 2295038.0, "reward": 0.48250001668930054, "reward_std": 0.5864227414131165, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5864228010177612, "sampling/importance_sampling_ratio/max": 1.8391348123550415, "sampling/importance_sampling_ratio/mean": 1.3048007488250732, "sampling/importance_sampling_ratio/min": 0.829092800617218, "sampling/sampling_logp_difference/max": 1.211526870727539, "sampling/sampling_logp_difference/mean": 0.016960646957159042, "step": 820, "step_time": 35.23319572902983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 58.75, "completions/mean_terminated_length": 58.75, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.1962655484676361, "epoch": 0.821, "frac_reward_zero_std": 0.0, "grad_norm": 0.8454394936561584, "kl": 0.009563854895532131, "learning_rate": 4.129298674268226e-07, "loss": 0.0162, "num_tokens": 2298064.0, "reward": -0.08749999850988388, "reward_std": 0.0655108168721199, "rewards/reward_func/mean": -0.08749999850988388, "rewards/reward_func/std": 0.0655108168721199, "sampling/importance_sampling_ratio/max": 1.8635432720184326, "sampling/importance_sampling_ratio/mean": 1.0068652629852295, "sampling/importance_sampling_ratio/min": 0.5388798117637634, "sampling/sampling_logp_difference/max": 0.552926778793335, "sampling/sampling_logp_difference/mean": 0.018205897882580757, "step": 821, "step_time": 40.754630632989574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.17128358781337738, "epoch": 0.822, "frac_reward_zero_std": 0.0, "grad_norm": 1.2102605104446411, "kl": 0.02462276816368103, "learning_rate": 4.08483397102738e-07, "loss": 0.1069, "num_tokens": 2300810.0, "reward": 0.19750000536441803, "reward_std": 0.5369279980659485, "rewards/reward_func/mean": 0.19750000536441803, "rewards/reward_func/std": 0.5369279980659485, "sampling/importance_sampling_ratio/max": 1.1869491338729858, "sampling/importance_sampling_ratio/mean": 0.9190918207168579, "sampling/importance_sampling_ratio/min": 0.6545261740684509, "sampling/sampling_logp_difference/max": 0.4064096212387085, "sampling/sampling_logp_difference/mean": 0.017584379762411118, "step": 822, "step_time": 35.67830186098581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 44.0, "completions/mean_terminated_length": 44.0, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.17486220598220825, "epoch": 0.823, "frac_reward_zero_std": 0.0, "grad_norm": 1.2958502769470215, "kl": 0.008962174877524376, "learning_rate": 4.040588657981301e-07, "loss": -0.0166, "num_tokens": 2303791.0, "reward": 0.4775000214576721, "reward_std": 0.5925299525260925, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.5925298929214478, "sampling/importance_sampling_ratio/max": 1.2050940990447998, "sampling/importance_sampling_ratio/mean": 1.077024221420288, "sampling/importance_sampling_ratio/min": 0.9960001707077026, "sampling/sampling_logp_difference/max": 0.3881962299346924, "sampling/sampling_logp_difference/mean": 0.014588586986064911, "step": 823, "step_time": 24.134380882023834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.21331864595413208, "epoch": 0.824, "frac_reward_zero_std": 0.0, "grad_norm": 0.6838282942771912, "kl": 0.0070513272657990456, "learning_rate": 3.9965631992423704e-07, "loss": 0.1764, "num_tokens": 2306743.0, "reward": 0.2150000035762787, "reward_std": 0.5259594321250916, "rewards/reward_func/mean": 0.2150000035762787, "rewards/reward_func/std": 0.5259594321250916, "sampling/importance_sampling_ratio/max": 1.083331823348999, "sampling/importance_sampling_ratio/mean": 0.7108086943626404, "sampling/importance_sampling_ratio/min": 0.23756855726242065, "sampling/sampling_logp_difference/max": 0.4961552619934082, "sampling/sampling_logp_difference/mean": 0.023972108960151672, "step": 824, "step_time": 24.793997288041282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.23532478511333466, "epoch": 0.825, "frac_reward_zero_std": 1.0, "grad_norm": 0.006143078673630953, "kl": 0.0066858637146651745, "learning_rate": 3.952758056616826e-07, "loss": 0.0001, "num_tokens": 2309175.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4093302488327026, "sampling/importance_sampling_ratio/mean": 0.8465116024017334, "sampling/importance_sampling_ratio/min": 0.6270447373390198, "sampling/sampling_logp_difference/max": 0.42949986457824707, "sampling/sampling_logp_difference/mean": 0.020085223019123077, "step": 825, "step_time": 19.471903096011374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.1588001549243927, "epoch": 0.826, "frac_reward_zero_std": 0.0, "grad_norm": 0.9130637645721436, "kl": 0.009222378022968769, "learning_rate": 3.9091736895998907e-07, "loss": -0.1827, "num_tokens": 2312145.0, "reward": 0.22750000655651093, "reward_std": 0.5155821442604065, "rewards/reward_func/mean": 0.22750000655651093, "rewards/reward_func/std": 0.5155822038650513, "sampling/importance_sampling_ratio/max": 1.110325813293457, "sampling/importance_sampling_ratio/mean": 0.8857458829879761, "sampling/importance_sampling_ratio/min": 0.5572593808174133, "sampling/sampling_logp_difference/max": 0.4140195846557617, "sampling/sampling_logp_difference/mean": 0.014316546730697155, "step": 826, "step_time": 24.975291687995195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1863897442817688, "epoch": 0.827, "frac_reward_zero_std": 0.0, "grad_norm": 1.5767977237701416, "kl": 0.011625191196799278, "learning_rate": 3.8658105553709356e-07, "loss": 0.1528, "num_tokens": 2315370.0, "reward": 0.4424999952316284, "reward_std": 0.639289915561676, "rewards/reward_func/mean": 0.4424999952316284, "rewards/reward_func/std": 0.6392899751663208, "sampling/importance_sampling_ratio/max": 1.9419039487838745, "sampling/importance_sampling_ratio/mean": 1.3288265466690063, "sampling/importance_sampling_ratio/min": 0.9593133926391602, "sampling/sampling_logp_difference/max": 0.3129112720489502, "sampling/sampling_logp_difference/mean": 0.018183281645178795, "step": 827, "step_time": 40.15956154098967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1707930564880371, "epoch": 0.828, "frac_reward_zero_std": 0.0, "grad_norm": 0.9571285843849182, "kl": 0.003910840023308992, "learning_rate": 3.822669108788737e-07, "loss": 0.0516, "num_tokens": 2317885.0, "reward": -0.10250000655651093, "reward_std": 0.08655441552400589, "rewards/reward_func/mean": -0.10250000655651093, "rewards/reward_func/std": 0.08655441552400589, "sampling/importance_sampling_ratio/max": 1.1113779544830322, "sampling/importance_sampling_ratio/mean": 0.8932973146438599, "sampling/importance_sampling_ratio/min": 0.6883568167686462, "sampling/sampling_logp_difference/max": 0.5045347213745117, "sampling/sampling_logp_difference/mean": 0.016512233763933182, "step": 828, "step_time": 29.19581034797011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.20417889952659607, "epoch": 0.829, "frac_reward_zero_std": 0.0, "grad_norm": 1.477431297302246, "kl": 0.01053350418806076, "learning_rate": 3.77974980238664e-07, "loss": -0.3819, "num_tokens": 2320609.0, "reward": 0.45750001072883606, "reward_std": 0.6225953698158264, "rewards/reward_func/mean": 0.45750001072883606, "rewards/reward_func/std": 0.6225954294204712, "sampling/importance_sampling_ratio/max": 1.7963160276412964, "sampling/importance_sampling_ratio/mean": 1.186934232711792, "sampling/importance_sampling_ratio/min": 0.5153868198394775, "sampling/sampling_logp_difference/max": 0.3202238082885742, "sampling/sampling_logp_difference/mean": 0.021925056353211403, "step": 829, "step_time": 19.395187649992295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.15229056775569916, "epoch": 0.83, "frac_reward_zero_std": 0.0, "grad_norm": 1.8170478343963623, "kl": 0.03679286688566208, "learning_rate": 3.737053086367873e-07, "loss": -0.3456, "num_tokens": 2323731.0, "reward": 0.48250001668930054, "reward_std": 0.5982404947280884, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5982404351234436, "sampling/importance_sampling_ratio/max": 1.6604540348052979, "sampling/importance_sampling_ratio/mean": 1.1528222560882568, "sampling/importance_sampling_ratio/min": 0.7104343175888062, "sampling/sampling_logp_difference/max": 0.7907071113586426, "sampling/sampling_logp_difference/mean": 0.024626366794109344, "step": 830, "step_time": 30.26012890099082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.22032947838306427, "epoch": 0.831, "frac_reward_zero_std": 0.0, "grad_norm": 1.3279026746749878, "kl": 0.009642641060054302, "learning_rate": 3.6945794086007706e-07, "loss": -0.1607, "num_tokens": 2326701.0, "reward": 0.48250001668930054, "reward_std": 0.5634639859199524, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5634639859199524, "sampling/importance_sampling_ratio/max": 1.3973757028579712, "sampling/importance_sampling_ratio/mean": 0.9588946104049683, "sampling/importance_sampling_ratio/min": 0.2998386025428772, "sampling/sampling_logp_difference/max": 0.456146240234375, "sampling/sampling_logp_difference/mean": 0.0189543217420578, "step": 831, "step_time": 24.04522953403648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 60.0, "completions/mean_terminated_length": 60.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1644914597272873, "epoch": 0.832, "frac_reward_zero_std": 0.0, "grad_norm": 0.5662811398506165, "kl": 0.005434609483927488, "learning_rate": 3.6523292146141225e-07, "loss": -0.031, "num_tokens": 2329124.0, "reward": 0.7250000238418579, "reward_std": 0.543353796005249, "rewards/reward_func/mean": 0.7250000238418579, "rewards/reward_func/std": 0.543353796005249, "sampling/importance_sampling_ratio/max": 1.0592163801193237, "sampling/importance_sampling_ratio/mean": 0.847390353679657, "sampling/importance_sampling_ratio/min": 0.7142123579978943, "sampling/sampling_logp_difference/max": 0.2976806163787842, "sampling/sampling_logp_difference/mean": 0.013939842581748962, "step": 832, "step_time": 18.12831261003157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1935732364654541, "epoch": 0.833, "frac_reward_zero_std": 0.0, "grad_norm": 1.2526682615280151, "kl": 0.2149205058813095, "learning_rate": 3.610302947592473e-07, "loss": 0.0411, "num_tokens": 2332726.0, "reward": -0.05249999836087227, "reward_std": 0.04193248599767685, "rewards/reward_func/mean": -0.05249999836087227, "rewards/reward_func/std": 0.04193248599767685, "sampling/importance_sampling_ratio/max": 1.6759350299835205, "sampling/importance_sampling_ratio/mean": 0.9670582413673401, "sampling/importance_sampling_ratio/min": 0.4668562710285187, "sampling/sampling_logp_difference/max": 1.067018985748291, "sampling/sampling_logp_difference/mean": 0.037538912147283554, "step": 833, "step_time": 45.58688229799736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 57.75, "completions/mean_terminated_length": 57.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.17752604186534882, "epoch": 0.834, "frac_reward_zero_std": 0.0, "grad_norm": 0.6150750517845154, "kl": 0.007287648040801287, "learning_rate": 3.56850104837147e-07, "loss": -0.1813, "num_tokens": 2335190.0, "reward": 0.9975000023841858, "reward_std": 0.004999995231628418, "rewards/reward_func/mean": 0.9975000023841858, "rewards/reward_func/std": 0.004999995231628418, "sampling/importance_sampling_ratio/max": 1.406284213066101, "sampling/importance_sampling_ratio/mean": 0.8016417026519775, "sampling/importance_sampling_ratio/min": 0.4885151982307434, "sampling/sampling_logp_difference/max": 0.466930627822876, "sampling/sampling_logp_difference/mean": 0.015480021014809608, "step": 834, "step_time": 11.869403278979007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.22800211608409882, "epoch": 0.835, "frac_reward_zero_std": 0.0, "grad_norm": 0.9578028917312622, "kl": 0.022625835612416267, "learning_rate": 3.5269239554332565e-07, "loss": 0.3839, "num_tokens": 2338092.0, "reward": 0.20499999821186066, "reward_std": 0.5326974987983704, "rewards/reward_func/mean": 0.20499999821186066, "rewards/reward_func/std": 0.5326975584030151, "sampling/importance_sampling_ratio/max": 1.5992507934570312, "sampling/importance_sampling_ratio/mean": 0.8830766677856445, "sampling/importance_sampling_ratio/min": 0.31618911027908325, "sampling/sampling_logp_difference/max": 0.6922190189361572, "sampling/sampling_logp_difference/mean": 0.0238192118704319, "step": 835, "step_time": 31.512995455006603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.186680406332016, "epoch": 0.836, "frac_reward_zero_std": 0.0, "grad_norm": 1.543162226676941, "kl": 0.012540522031486034, "learning_rate": 3.485572104901869e-07, "loss": 0.1418, "num_tokens": 2340969.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 1.082902431488037, "sampling/importance_sampling_ratio/mean": 0.763536810874939, "sampling/importance_sampling_ratio/min": 0.48717835545539856, "sampling/sampling_logp_difference/max": 0.5528017282485962, "sampling/sampling_logp_difference/mean": 0.018483372405171394, "step": 836, "step_time": 26.28796013299143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.12877994775772095, "epoch": 0.837, "frac_reward_zero_std": 0.0, "grad_norm": 0.669474184513092, "kl": 0.009199704974889755, "learning_rate": 3.4444459305386507e-07, "loss": -0.0978, "num_tokens": 2344250.0, "reward": 0.44999998807907104, "reward_std": 0.5797125697135925, "rewards/reward_func/mean": 0.44999998807907104, "rewards/reward_func/std": 0.5797125697135925, "sampling/importance_sampling_ratio/max": 0.8778196573257446, "sampling/importance_sampling_ratio/mean": 0.7608557939529419, "sampling/importance_sampling_ratio/min": 0.595091700553894, "sampling/sampling_logp_difference/max": 0.29559987783432007, "sampling/sampling_logp_difference/mean": 0.013404346071183681, "step": 837, "step_time": 32.918676506960765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.1918548047542572, "epoch": 0.838, "frac_reward_zero_std": 0.0, "grad_norm": 1.533847689628601, "kl": 0.02322276309132576, "learning_rate": 3.403545863737706e-07, "loss": 0.2421, "num_tokens": 2347155.0, "reward": 0.7225000262260437, "reward_std": 0.5157760977745056, "rewards/reward_func/mean": 0.7225000262260437, "rewards/reward_func/std": 0.5157760977745056, "sampling/importance_sampling_ratio/max": 1.259711503982544, "sampling/importance_sampling_ratio/mean": 0.8380982279777527, "sampling/importance_sampling_ratio/min": 0.6516640186309814, "sampling/sampling_logp_difference/max": 0.4830503463745117, "sampling/sampling_logp_difference/mean": 0.029079409316182137, "step": 838, "step_time": 32.53770210099174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.16639743745326996, "epoch": 0.839, "frac_reward_zero_std": 0.0, "grad_norm": 0.8579061627388, "kl": 0.01332518458366394, "learning_rate": 3.362872333521389e-07, "loss": 0.3768, "num_tokens": 2350442.0, "reward": -0.032499998807907104, "reward_std": 0.0403112918138504, "rewards/reward_func/mean": -0.032499998807907104, "rewards/reward_func/std": 0.040311288088560104, "sampling/importance_sampling_ratio/max": 1.5169082880020142, "sampling/importance_sampling_ratio/mean": 0.8401793241500854, "sampling/importance_sampling_ratio/min": 0.3931258022785187, "sampling/sampling_logp_difference/max": 0.3263571262359619, "sampling/sampling_logp_difference/mean": 0.019800430163741112, "step": 839, "step_time": 33.64415624795947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.16395486891269684, "epoch": 0.84, "frac_reward_zero_std": 0.0, "grad_norm": 1.2737388610839844, "kl": 0.007574332877993584, "learning_rate": 3.322425766535778e-07, "loss": -0.0081, "num_tokens": 2352837.0, "reward": 0.17000001668930054, "reward_std": 0.534415602684021, "rewards/reward_func/mean": 0.17000001668930054, "rewards/reward_func/std": 0.534415602684021, "sampling/importance_sampling_ratio/max": 1.3868507146835327, "sampling/importance_sampling_ratio/mean": 1.0394642353057861, "sampling/importance_sampling_ratio/min": 0.7656762003898621, "sampling/sampling_logp_difference/max": 0.37166810035705566, "sampling/sampling_logp_difference/mean": 0.019453195855021477, "step": 840, "step_time": 23.48899394099135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.1775212436914444, "epoch": 0.841, "frac_reward_zero_std": 0.0, "grad_norm": 1.1793206930160522, "kl": 0.01670890301465988, "learning_rate": 3.2822065870462216e-07, "loss": 0.1732, "num_tokens": 2355509.0, "reward": 0.737500011920929, "reward_std": 0.5183547735214233, "rewards/reward_func/mean": 0.737500011920929, "rewards/reward_func/std": 0.5183547735214233, "sampling/importance_sampling_ratio/max": 1.0268076658248901, "sampling/importance_sampling_ratio/mean": 0.733163595199585, "sampling/importance_sampling_ratio/min": 0.45836731791496277, "sampling/sampling_logp_difference/max": 0.33691859245300293, "sampling/sampling_logp_difference/mean": 0.015842992812395096, "step": 841, "step_time": 24.684389399015345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.19227556884288788, "epoch": 0.842, "frac_reward_zero_std": 0.0, "grad_norm": 1.2484830617904663, "kl": 0.006040018051862717, "learning_rate": 3.2422152169328925e-07, "loss": 0.1588, "num_tokens": 2358229.0, "reward": 0.4975000023841858, "reward_std": 0.574478030204773, "rewards/reward_func/mean": 0.4975000023841858, "rewards/reward_func/std": 0.574478030204773, "sampling/importance_sampling_ratio/max": 1.4473868608474731, "sampling/importance_sampling_ratio/mean": 0.9413895606994629, "sampling/importance_sampling_ratio/min": 0.45780861377716064, "sampling/sampling_logp_difference/max": 0.49308204650878906, "sampling/sampling_logp_difference/mean": 0.018636401742696762, "step": 842, "step_time": 24.692696976009756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 60.0, "completions/mean_terminated_length": 60.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.18922726809978485, "epoch": 0.843, "frac_reward_zero_std": 0.0, "grad_norm": 1.003891944885254, "kl": 0.019801408052444458, "learning_rate": 3.2024520756863244e-07, "loss": 0.0142, "num_tokens": 2360939.0, "reward": 0.7274999618530273, "reward_std": 0.5450000166893005, "rewards/reward_func/mean": 0.7274999618530273, "rewards/reward_func/std": 0.5450000166893005, "sampling/importance_sampling_ratio/max": 1.0157593488693237, "sampling/importance_sampling_ratio/mean": 0.8257155418395996, "sampling/importance_sampling_ratio/min": 0.7257212400436401, "sampling/sampling_logp_difference/max": 0.35247159004211426, "sampling/sampling_logp_difference/mean": 0.014439178630709648, "step": 843, "step_time": 19.145142277993727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.180103600025177, "epoch": 0.844, "frac_reward_zero_std": 0.0, "grad_norm": 0.6468386650085449, "kl": 0.059285968542099, "learning_rate": 3.162917580403066e-07, "loss": 0.1341, "num_tokens": 2363390.0, "reward": 0.7275000214576721, "reward_std": 0.5383539795875549, "rewards/reward_func/mean": 0.7275000214576721, "rewards/reward_func/std": 0.5383539795875549, "sampling/importance_sampling_ratio/max": 1.1012475490570068, "sampling/importance_sampling_ratio/mean": 0.6091129779815674, "sampling/importance_sampling_ratio/min": 0.010757852345705032, "sampling/sampling_logp_difference/max": 3.6021478176116943, "sampling/sampling_logp_difference/mean": 0.038047343492507935, "step": 844, "step_time": 25.45046971400734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.17114479839801788, "epoch": 0.845, "frac_reward_zero_std": 0.0, "grad_norm": 1.1604118347167969, "kl": 0.00920295249670744, "learning_rate": 3.123612145781255e-07, "loss": 0.3682, "num_tokens": 2365762.0, "reward": 0.9925000071525574, "reward_std": 0.014999985694885254, "rewards/reward_func/mean": 0.9925000071525574, "rewards/reward_func/std": 0.014999986626207829, "sampling/importance_sampling_ratio/max": 1.4257279634475708, "sampling/importance_sampling_ratio/mean": 0.7272835969924927, "sampling/importance_sampling_ratio/min": 0.4257436990737915, "sampling/sampling_logp_difference/max": 0.47627711296081543, "sampling/sampling_logp_difference/mean": 0.018426435068249702, "step": 845, "step_time": 10.051446447032504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.14990216493606567, "epoch": 0.846, "frac_reward_zero_std": 0.0, "grad_norm": 0.5134677886962891, "kl": 0.009382078424096107, "learning_rate": 3.08453618411631e-07, "loss": 0.0609, "num_tokens": 2368461.0, "reward": 0.7325000166893005, "reward_std": 0.5350000262260437, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5349999666213989, "sampling/importance_sampling_ratio/max": 0.7793800234794617, "sampling/importance_sampling_ratio/mean": 0.6071128249168396, "sampling/importance_sampling_ratio/min": 0.39208662509918213, "sampling/sampling_logp_difference/max": 0.4311760663986206, "sampling/sampling_logp_difference/mean": 0.01838875189423561, "step": 846, "step_time": 21.29483149200678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.21665431559085846, "epoch": 0.847, "frac_reward_zero_std": 0.0, "grad_norm": 1.468215823173523, "kl": 0.014259794726967812, "learning_rate": 3.0456901052965726e-07, "loss": -0.0047, "num_tokens": 2371177.0, "reward": 0.699999988079071, "reward_std": 0.5800574421882629, "rewards/reward_func/mean": 0.699999988079071, "rewards/reward_func/std": 0.5800574421882629, "sampling/importance_sampling_ratio/max": 2.0398547649383545, "sampling/importance_sampling_ratio/mean": 1.336204171180725, "sampling/importance_sampling_ratio/min": 0.5635480880737305, "sampling/sampling_logp_difference/max": 0.4297794699668884, "sampling/sampling_logp_difference/mean": 0.02516762539744377, "step": 847, "step_time": 22.74143269698834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 71.0, "completions/max_terminated_length": 71.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.16123364865779877, "epoch": 0.848, "frac_reward_zero_std": 0.0, "grad_norm": 1.1822861433029175, "kl": 0.009443854913115501, "learning_rate": 3.0070743167990275e-07, "loss": -0.0799, "num_tokens": 2373811.0, "reward": 0.4724999964237213, "reward_std": 0.6038418412208557, "rewards/reward_func/mean": 0.4724999964237213, "rewards/reward_func/std": 0.6038419008255005, "sampling/importance_sampling_ratio/max": 1.709815263748169, "sampling/importance_sampling_ratio/mean": 1.166445255279541, "sampling/importance_sampling_ratio/min": 0.5984019041061401, "sampling/sampling_logp_difference/max": 0.6722922325134277, "sampling/sampling_logp_difference/mean": 0.01599198952317238, "step": 848, "step_time": 28.18608867697185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.2796138525009155, "epoch": 0.849, "frac_reward_zero_std": 0.0, "grad_norm": 1.6512898206710815, "kl": 0.016758890822529793, "learning_rate": 2.968689223685034e-07, "loss": 0.1503, "num_tokens": 2376360.0, "reward": 0.45749998092651367, "reward_std": 0.6270765066146851, "rewards/reward_func/mean": 0.45749998092651367, "rewards/reward_func/std": 0.6270765662193298, "sampling/importance_sampling_ratio/max": 1.5978471040725708, "sampling/importance_sampling_ratio/mean": 0.9322152137756348, "sampling/importance_sampling_ratio/min": 0.6283316612243652, "sampling/sampling_logp_difference/max": 0.3913869857788086, "sampling/sampling_logp_difference/mean": 0.027435243129730225, "step": 849, "step_time": 21.999671792029403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.1622805893421173, "epoch": 0.85, "frac_reward_zero_std": 0.0, "grad_norm": 1.412500023841858, "kl": 0.007405742537230253, "learning_rate": 2.9305352285960404e-07, "loss": -0.1426, "num_tokens": 2379680.0, "reward": 0.24000000953674316, "reward_std": 0.5067543387413025, "rewards/reward_func/mean": 0.24000000953674316, "rewards/reward_func/std": 0.5067543983459473, "sampling/importance_sampling_ratio/max": 1.5080679655075073, "sampling/importance_sampling_ratio/mean": 1.1415870189666748, "sampling/importance_sampling_ratio/min": 0.677692174911499, "sampling/sampling_logp_difference/max": 0.21403026580810547, "sampling/sampling_logp_difference/mean": 0.016235211864113808, "step": 850, "step_time": 29.752660113968886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.165657639503479, "epoch": 0.851, "frac_reward_zero_std": 0.0, "grad_norm": 1.3680304288864136, "kl": 0.006796839646995068, "learning_rate": 2.892612731749414e-07, "loss": -0.2616, "num_tokens": 2383119.0, "reward": 0.49000000953674316, "reward_std": 0.5888972282409668, "rewards/reward_func/mean": 0.49000000953674316, "rewards/reward_func/std": 0.5888972282409668, "sampling/importance_sampling_ratio/max": 1.2143021821975708, "sampling/importance_sampling_ratio/mean": 1.007341980934143, "sampling/importance_sampling_ratio/min": 0.7997197508811951, "sampling/sampling_logp_difference/max": 0.30791938304901123, "sampling/sampling_logp_difference/mean": 0.015207197517156601, "step": 851, "step_time": 35.45942432299489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 45.75, "completions/mean_terminated_length": 45.75, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.16966894268989563, "epoch": 0.852, "frac_reward_zero_std": 0.0, "grad_norm": 1.278833270072937, "kl": 0.01502284687012434, "learning_rate": 2.8549221309341903e-07, "loss": -0.0892, "num_tokens": 2386203.0, "reward": 0.4749999940395355, "reward_std": 0.6015812158584595, "rewards/reward_func/mean": 0.4749999940395355, "rewards/reward_func/std": 0.6015812158584595, "sampling/importance_sampling_ratio/max": 1.5108919143676758, "sampling/importance_sampling_ratio/mean": 1.161525011062622, "sampling/importance_sampling_ratio/min": 0.7567739486694336, "sampling/sampling_logp_difference/max": 0.49666595458984375, "sampling/sampling_logp_difference/mean": 0.02126007340848446, "step": 852, "step_time": 32.071394263010006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.18095619976520538, "epoch": 0.853, "frac_reward_zero_std": 0.0, "grad_norm": 0.9422032833099365, "kl": 0.009996765293180943, "learning_rate": 2.8174638215069494e-07, "loss": 0.2559, "num_tokens": 2389449.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 1.1240458488464355, "sampling/importance_sampling_ratio/mean": 0.6376754641532898, "sampling/importance_sampling_ratio/min": 0.44907501339912415, "sampling/sampling_logp_difference/max": 0.5306485891342163, "sampling/sampling_logp_difference/mean": 0.02526061050593853, "step": 853, "step_time": 17.366863403993193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.18978972733020782, "epoch": 0.854, "frac_reward_zero_std": 0.0, "grad_norm": 1.2462257146835327, "kl": 0.01231365092098713, "learning_rate": 2.780238196387619e-07, "loss": 0.1348, "num_tokens": 2392298.0, "reward": 0.4775000214576721, "reward_std": 0.6040074825286865, "rewards/reward_func/mean": 0.4775000214576721, "rewards/reward_func/std": 0.6040074825286865, "sampling/importance_sampling_ratio/max": 1.9387389421463013, "sampling/importance_sampling_ratio/mean": 1.2450857162475586, "sampling/importance_sampling_ratio/min": 0.3843546509742737, "sampling/sampling_logp_difference/max": 0.35591620206832886, "sampling/sampling_logp_difference/mean": 0.026126228272914886, "step": 854, "step_time": 21.978632806043606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.15857350826263428, "epoch": 0.855, "frac_reward_zero_std": 0.0, "grad_norm": 1.2062512636184692, "kl": 0.01693621091544628, "learning_rate": 2.743245646055398e-07, "loss": 0.2889, "num_tokens": 2395268.0, "reward": 0.987500011920929, "reward_std": 0.02499999664723873, "rewards/reward_func/mean": 0.987500011920929, "rewards/reward_func/std": 0.025000005960464478, "sampling/importance_sampling_ratio/max": 1.2723414897918701, "sampling/importance_sampling_ratio/mean": 0.8478846549987793, "sampling/importance_sampling_ratio/min": 0.513663649559021, "sampling/sampling_logp_difference/max": 0.41994762420654297, "sampling/sampling_logp_difference/mean": 0.01691858097910881, "step": 855, "step_time": 25.634181850997265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.11950673162937164, "epoch": 0.856, "frac_reward_zero_std": 0.0, "grad_norm": 0.48881202936172485, "kl": 0.012126228772103786, "learning_rate": 2.706486558544644e-07, "loss": -0.0807, "num_tokens": 2398305.0, "reward": 0.7325000166893005, "reward_std": 0.508486270904541, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5084863305091858, "sampling/importance_sampling_ratio/max": 1.1293208599090576, "sampling/importance_sampling_ratio/mean": 0.7066723704338074, "sampling/importance_sampling_ratio/min": 0.5500060319900513, "sampling/sampling_logp_difference/max": 0.48597288131713867, "sampling/sampling_logp_difference/mean": 0.01614673063158989, "step": 856, "step_time": 20.926950644992758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.16225463151931763, "epoch": 0.857, "frac_reward_zero_std": 0.0, "grad_norm": 1.354940414428711, "kl": 0.017896318808197975, "learning_rate": 2.6699613194407724e-07, "loss": 0.1783, "num_tokens": 2401154.0, "reward": 0.7174999713897705, "reward_std": 0.4814128577709198, "rewards/reward_func/mean": 0.7174999713897705, "rewards/reward_func/std": 0.4814128577709198, "sampling/importance_sampling_ratio/max": 1.7239042520523071, "sampling/importance_sampling_ratio/mean": 0.9752792716026306, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.7297461032867432, "sampling/sampling_logp_difference/mean": 0.021161388605833054, "step": 857, "step_time": 35.985530426958576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.17592734098434448, "epoch": 0.858, "frac_reward_zero_std": 0.0, "grad_norm": 1.1126450300216675, "kl": 0.008447921834886074, "learning_rate": 2.633670311876277e-07, "loss": -0.0888, "num_tokens": 2403349.0, "reward": 0.42000001668930054, "reward_std": 0.6701740622520447, "rewards/reward_func/mean": 0.42000001668930054, "rewards/reward_func/std": 0.6701741218566895, "sampling/importance_sampling_ratio/max": 1.2816358804702759, "sampling/importance_sampling_ratio/mean": 0.8961262106895447, "sampling/importance_sampling_ratio/min": 0.5940454006195068, "sampling/sampling_logp_difference/max": 0.32509803771972656, "sampling/sampling_logp_difference/mean": 0.013038203120231628, "step": 858, "step_time": 21.325295454997104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.16509617865085602, "epoch": 0.859, "frac_reward_zero_std": 0.0, "grad_norm": 1.0125917196273804, "kl": 0.01618064008653164, "learning_rate": 2.5976139165266367e-07, "loss": -0.0695, "num_tokens": 2406521.0, "reward": 0.4975000023841858, "reward_std": 0.5802513957023621, "rewards/reward_func/mean": 0.4975000023841858, "rewards/reward_func/std": 0.5802513957023621, "sampling/importance_sampling_ratio/max": 1.0414167642593384, "sampling/importance_sampling_ratio/mean": 0.8598008155822754, "sampling/importance_sampling_ratio/min": 0.5939289331436157, "sampling/sampling_logp_difference/max": 0.7001156806945801, "sampling/sampling_logp_difference/mean": 0.022162873297929764, "step": 859, "step_time": 25.550344446033705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.15506242215633392, "epoch": 0.86, "frac_reward_zero_std": 0.0, "grad_norm": 0.5608531832695007, "kl": 0.02194199524819851, "learning_rate": 2.5617925116063926e-07, "loss": 0.2624, "num_tokens": 2409228.0, "reward": 0.2149999886751175, "reward_std": 0.5249444842338562, "rewards/reward_func/mean": 0.2149999886751175, "rewards/reward_func/std": 0.5249444842338562, "sampling/importance_sampling_ratio/max": 1.7682006359100342, "sampling/importance_sampling_ratio/mean": 0.7009803056716919, "sampling/importance_sampling_ratio/min": 0.20947977900505066, "sampling/sampling_logp_difference/max": 1.2303390502929688, "sampling/sampling_logp_difference/mean": 0.028136592358350754, "step": 860, "step_time": 25.325990209006704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.17698627710342407, "epoch": 0.861, "frac_reward_zero_std": 0.0, "grad_norm": 0.76371169090271, "kl": 0.013147013261914253, "learning_rate": 2.52620647286512e-07, "loss": -0.1406, "num_tokens": 2412064.0, "reward": 0.48250001668930054, "reward_std": 0.5982404947280884, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5982404351234436, "sampling/importance_sampling_ratio/max": 0.849077045917511, "sampling/importance_sampling_ratio/mean": 0.6719473600387573, "sampling/importance_sampling_ratio/min": 0.5164012908935547, "sampling/sampling_logp_difference/max": 0.49473118782043457, "sampling/sampling_logp_difference/mean": 0.014568660408258438, "step": 861, "step_time": 27.166167220042553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.1506022959947586, "epoch": 0.862, "frac_reward_zero_std": 0.0, "grad_norm": 1.6407166719436646, "kl": 0.021196290850639343, "learning_rate": 2.4908561735835306e-07, "loss": 0.34, "num_tokens": 2415471.0, "reward": 0.4925000071525574, "reward_std": 0.5860247611999512, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5860247015953064, "sampling/importance_sampling_ratio/max": 1.657211184501648, "sampling/importance_sampling_ratio/mean": 1.1751480102539062, "sampling/importance_sampling_ratio/min": 0.7663894295692444, "sampling/sampling_logp_difference/max": 0.3319249153137207, "sampling/sampling_logp_difference/mean": 0.020298969000577927, "step": 862, "step_time": 23.264881934039295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.18631719052791595, "epoch": 0.863, "frac_reward_zero_std": 0.0, "grad_norm": 0.9092467427253723, "kl": 0.00855103600770235, "learning_rate": 2.455741984569543e-07, "loss": -0.1272, "num_tokens": 2418171.0, "reward": 0.9925000071525574, "reward_std": 0.014999985694885254, "rewards/reward_func/mean": 0.9925000071525574, "rewards/reward_func/std": 0.014999986626207829, "sampling/importance_sampling_ratio/max": 1.4410194158554077, "sampling/importance_sampling_ratio/mean": 0.8903515338897705, "sampling/importance_sampling_ratio/min": 0.4063568413257599, "sampling/sampling_logp_difference/max": 0.46382153034210205, "sampling/sampling_logp_difference/mean": 0.018474414944648743, "step": 863, "step_time": 22.841054462944157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1891644448041916, "epoch": 0.864, "frac_reward_zero_std": 0.0, "grad_norm": 1.3630229234695435, "kl": 0.012984142638742924, "learning_rate": 2.420864274154372e-07, "loss": -0.2425, "num_tokens": 2421012.0, "reward": 0.1850000023841858, "reward_std": 0.5477529764175415, "rewards/reward_func/mean": 0.1850000023841858, "rewards/reward_func/std": 0.5477529764175415, "sampling/importance_sampling_ratio/max": 1.2752861976623535, "sampling/importance_sampling_ratio/mean": 0.9424954652786255, "sampling/importance_sampling_ratio/min": 0.7542344927787781, "sampling/sampling_logp_difference/max": 0.3134586811065674, "sampling/sampling_logp_difference/mean": 0.015593142248690128, "step": 864, "step_time": 27.60715255897958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.22132880985736847, "epoch": 0.865, "frac_reward_zero_std": 1.0, "grad_norm": 0.0209751445800066, "kl": 0.026738613843917847, "learning_rate": 2.386223408188704e-07, "loss": 0.0003, "num_tokens": 2423753.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 0.7020701169967651, "sampling/importance_sampling_ratio/mean": 0.37373390793800354, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6394176483154297, "sampling/sampling_logp_difference/mean": 0.032533902674913406, "step": 865, "step_time": 13.416073839995079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.2077866941690445, "epoch": 0.866, "frac_reward_zero_std": 0.0, "grad_norm": 1.467648983001709, "kl": 0.010890993289649487, "learning_rate": 2.3518197500388278e-07, "loss": 0.0789, "num_tokens": 2426551.0, "reward": 0.4650000035762787, "reward_std": 0.561575174331665, "rewards/reward_func/mean": 0.4650000035762787, "rewards/reward_func/std": 0.561575174331665, "sampling/importance_sampling_ratio/max": 2.1942172050476074, "sampling/importance_sampling_ratio/mean": 1.4760031700134277, "sampling/importance_sampling_ratio/min": 0.909105658531189, "sampling/sampling_logp_difference/max": 0.5286828279495239, "sampling/sampling_logp_difference/mean": 0.02224583737552166, "step": 866, "step_time": 23.49216454301495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.18499703705310822, "epoch": 0.867, "frac_reward_zero_std": 0.0, "grad_norm": 0.5360456705093384, "kl": 0.007631050422787666, "learning_rate": 2.3176536605828443e-07, "loss": -0.0918, "num_tokens": 2429540.0, "reward": 0.7450000047683716, "reward_std": 0.5099999904632568, "rewards/reward_func/mean": 0.7450000047683716, "rewards/reward_func/std": 0.5099999904632568, "sampling/importance_sampling_ratio/max": 0.8913956880569458, "sampling/importance_sampling_ratio/mean": 0.6784557104110718, "sampling/importance_sampling_ratio/min": 0.46579423546791077, "sampling/sampling_logp_difference/max": 0.5747342109680176, "sampling/sampling_logp_difference/mean": 0.01934453658759594, "step": 867, "step_time": 26.39598606695654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.19343514740467072, "epoch": 0.868, "frac_reward_zero_std": 0.0, "grad_norm": 1.7998617887496948, "kl": 0.009057797491550446, "learning_rate": 2.2837254982068568e-07, "loss": 0.3389, "num_tokens": 2431951.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 2.112736463546753, "sampling/importance_sampling_ratio/mean": 1.4234766960144043, "sampling/importance_sampling_ratio/min": 0.9283630847930908, "sampling/sampling_logp_difference/max": 0.25321924686431885, "sampling/sampling_logp_difference/mean": 0.014189847745001316, "step": 868, "step_time": 10.6184523460106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.18004736304283142, "epoch": 0.869, "frac_reward_zero_std": 0.0, "grad_norm": 2.905263662338257, "kl": 0.011783437803387642, "learning_rate": 2.2500356188012413e-07, "loss": 0.3301, "num_tokens": 2434474.0, "reward": 0.6850000023841858, "reward_std": 0.5005663633346558, "rewards/reward_func/mean": 0.6850000023841858, "rewards/reward_func/std": 0.5005663633346558, "sampling/importance_sampling_ratio/max": 2.496553659439087, "sampling/importance_sampling_ratio/mean": 1.4958081245422363, "sampling/importance_sampling_ratio/min": 0.4620645344257355, "sampling/sampling_logp_difference/max": 0.5712859630584717, "sampling/sampling_logp_difference/mean": 0.019840184599161148, "step": 869, "step_time": 21.124880692048464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1944282054901123, "epoch": 0.87, "frac_reward_zero_std": 0.0, "grad_norm": 0.9354277849197388, "kl": 0.016179008409380913, "learning_rate": 2.2165843757568807e-07, "loss": -0.197, "num_tokens": 2437008.0, "reward": 0.7325000166893005, "reward_std": 0.5084207653999329, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5084207653999329, "sampling/importance_sampling_ratio/max": 2.120872735977173, "sampling/importance_sampling_ratio/mean": 0.8408769369125366, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9974706172943115, "sampling/sampling_logp_difference/mean": 0.022908521816134453, "step": 870, "step_time": 30.65613580099307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.18700894713401794, "epoch": 0.871, "frac_reward_zero_std": 0.0, "grad_norm": 0.5492638349533081, "kl": 0.0148142259567976, "learning_rate": 2.1833721199614992e-07, "loss": -0.0661, "num_tokens": 2439726.0, "reward": 0.7224999666213989, "reward_std": 0.5094032883644104, "rewards/reward_func/mean": 0.7224999666213989, "rewards/reward_func/std": 0.5094032883644104, "sampling/importance_sampling_ratio/max": 0.683012843132019, "sampling/importance_sampling_ratio/mean": 0.5436038970947266, "sampling/importance_sampling_ratio/min": 0.3585931360721588, "sampling/sampling_logp_difference/max": 0.5609478950500488, "sampling/sampling_logp_difference/mean": 0.02425682730972767, "step": 871, "step_time": 16.54579179303255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1687999963760376, "epoch": 0.872, "frac_reward_zero_std": 0.0, "grad_norm": 2.947537660598755, "kl": 0.01699940860271454, "learning_rate": 2.15039919979593e-07, "loss": 0.2491, "num_tokens": 2442604.0, "reward": 0.699999988079071, "reward_std": 0.5867424607276917, "rewards/reward_func/mean": 0.699999988079071, "rewards/reward_func/std": 0.5867424607276917, "sampling/importance_sampling_ratio/max": 2.2133004665374756, "sampling/importance_sampling_ratio/mean": 1.6138312816619873, "sampling/importance_sampling_ratio/min": 0.5003395676612854, "sampling/sampling_logp_difference/max": 0.6861424446105957, "sampling/sampling_logp_difference/mean": 0.028040388599038124, "step": 872, "step_time": 16.541730776021723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 56.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.20657598972320557, "epoch": 0.873, "frac_reward_zero_std": 0.0, "grad_norm": 1.7772443294525146, "kl": 0.014086953364312649, "learning_rate": 2.1176659611305133e-07, "loss": -0.3028, "num_tokens": 2445860.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5773502588272095, "sampling/importance_sampling_ratio/max": 1.573192834854126, "sampling/importance_sampling_ratio/mean": 1.0226027965545654, "sampling/importance_sampling_ratio/min": 0.48407629132270813, "sampling/sampling_logp_difference/max": 0.7500073909759521, "sampling/sampling_logp_difference/mean": 0.023664440959692, "step": 873, "step_time": 30.737320263986476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.18743224442005157, "epoch": 0.874, "frac_reward_zero_std": 0.0, "grad_norm": 1.2787818908691406, "kl": 0.008258462883532047, "learning_rate": 2.0851727473214317e-07, "loss": 0.1422, "num_tokens": 2448263.0, "reward": 0.4375, "reward_std": 0.6282448768615723, "rewards/reward_func/mean": 0.4375, "rewards/reward_func/std": 0.6282448768615723, "sampling/importance_sampling_ratio/max": 1.6630975008010864, "sampling/importance_sampling_ratio/mean": 1.1302821636199951, "sampling/importance_sampling_ratio/min": 0.863597571849823, "sampling/sampling_logp_difference/max": 0.4663163423538208, "sampling/sampling_logp_difference/mean": 0.01843806728720665, "step": 874, "step_time": 18.76195049803937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.2160964012145996, "epoch": 0.875, "frac_reward_zero_std": 0.0, "grad_norm": 3.7848427295684814, "kl": 0.016163500025868416, "learning_rate": 2.0529198992071202e-07, "loss": 0.7651, "num_tokens": 2451024.0, "reward": 0.9950000047683716, "reward_std": 0.009999990463256836, "rewards/reward_func/mean": 0.9950000047683716, "rewards/reward_func/std": 0.009999990463256836, "sampling/importance_sampling_ratio/max": 2.9236931800842285, "sampling/importance_sampling_ratio/mean": 1.3442708253860474, "sampling/importance_sampling_ratio/min": 0.5945703983306885, "sampling/sampling_logp_difference/max": 0.5726927518844604, "sampling/sampling_logp_difference/mean": 0.020161457359790802, "step": 875, "step_time": 17.45886747498298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 59.25, "completions/mean_terminated_length": 59.25, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.18215207755565643, "epoch": 0.876, "frac_reward_zero_std": 0.0, "grad_norm": 0.8797324299812317, "kl": 0.04453187435865402, "learning_rate": 2.020907755104698e-07, "loss": -0.2606, "num_tokens": 2453625.0, "reward": 0.9900000095367432, "reward_std": 0.019999999552965164, "rewards/reward_func/mean": 0.9900000095367432, "rewards/reward_func/std": 0.02000001072883606, "sampling/importance_sampling_ratio/max": 1.8095370531082153, "sampling/importance_sampling_ratio/mean": 1.1018614768981934, "sampling/importance_sampling_ratio/min": 0.5741012692451477, "sampling/sampling_logp_difference/max": 0.6543464660644531, "sampling/sampling_logp_difference/mean": 0.020336154848337173, "step": 876, "step_time": 11.849067734961864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1823144406080246, "epoch": 0.877, "frac_reward_zero_std": 0.0, "grad_norm": 1.757820725440979, "kl": 0.02728825993835926, "learning_rate": 1.9891366508064003e-07, "loss": -0.3071, "num_tokens": 2456262.0, "reward": 0.4674999713897705, "reward_std": 0.6152167916297913, "rewards/reward_func/mean": 0.4674999713897705, "rewards/reward_func/std": 0.6152167916297913, "sampling/importance_sampling_ratio/max": 2.0679214000701904, "sampling/importance_sampling_ratio/mean": 1.4455291032791138, "sampling/importance_sampling_ratio/min": 0.6010313630104065, "sampling/sampling_logp_difference/max": 0.641878604888916, "sampling/sampling_logp_difference/mean": 0.019574081525206566, "step": 877, "step_time": 23.662216279946733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.15621539950370789, "epoch": 0.878, "frac_reward_zero_std": 0.0, "grad_norm": 1.217118263244629, "kl": 0.013284009881317616, "learning_rate": 1.9576069195760883e-07, "loss": -0.2297, "num_tokens": 2458936.0, "reward": 0.17750000953674316, "reward_std": 0.5485967397689819, "rewards/reward_func/mean": 0.17750000953674316, "rewards/reward_func/std": 0.5485966801643372, "sampling/importance_sampling_ratio/max": 1.1171051263809204, "sampling/importance_sampling_ratio/mean": 0.7048035264015198, "sampling/importance_sampling_ratio/min": 0.38375580310821533, "sampling/sampling_logp_difference/max": 0.5130540132522583, "sampling/sampling_logp_difference/mean": 0.019543346017599106, "step": 878, "step_time": 23.618925157003105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1731119155883789, "epoch": 0.879, "frac_reward_zero_std": 0.0, "grad_norm": 0.8267610669136047, "kl": 0.026203131303191185, "learning_rate": 1.926318892145712e-07, "loss": 0.1625, "num_tokens": 2462050.0, "reward": 0.4950000047683716, "reward_std": 0.5831237435340881, "rewards/reward_func/mean": 0.4950000047683716, "rewards/reward_func/std": 0.5831238031387329, "sampling/importance_sampling_ratio/max": 1.1664992570877075, "sampling/importance_sampling_ratio/mean": 0.7916139960289001, "sampling/importance_sampling_ratio/min": 0.39211246371269226, "sampling/sampling_logp_difference/max": 0.8612241744995117, "sampling/sampling_logp_difference/mean": 0.024210838600993156, "step": 879, "step_time": 19.22094821400242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.18681423366069794, "epoch": 0.88, "frac_reward_zero_std": 0.0, "grad_norm": 0.9253535866737366, "kl": 0.010282611474394798, "learning_rate": 1.8952728967118805e-07, "loss": -0.1138, "num_tokens": 2464787.0, "reward": 0.2224999964237213, "reward_std": 0.5119488835334778, "rewards/reward_func/mean": 0.2224999964237213, "rewards/reward_func/std": 0.5119489431381226, "sampling/importance_sampling_ratio/max": 0.9359728693962097, "sampling/importance_sampling_ratio/mean": 0.7078923583030701, "sampling/importance_sampling_ratio/min": 0.3885251581668854, "sampling/sampling_logp_difference/max": 0.4999876022338867, "sampling/sampling_logp_difference/mean": 0.01991511695086956, "step": 880, "step_time": 33.46488549700007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.16452932357788086, "epoch": 0.881, "frac_reward_zero_std": 0.0, "grad_norm": 1.189112901687622, "kl": 0.005882726516574621, "learning_rate": 1.864469258932397e-07, "loss": 0.0721, "num_tokens": 2467473.0, "reward": 0.7400000095367432, "reward_std": 0.5133549571037292, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.513355016708374, "sampling/importance_sampling_ratio/max": 1.1392372846603394, "sampling/importance_sampling_ratio/mean": 0.9576743841171265, "sampling/importance_sampling_ratio/min": 0.5530322790145874, "sampling/sampling_logp_difference/max": 0.3571441173553467, "sampling/sampling_logp_difference/mean": 0.01491897739470005, "step": 881, "step_time": 15.558518597041257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.1552337408065796, "epoch": 0.882, "frac_reward_zero_std": 0.0, "grad_norm": 0.7109888792037964, "kl": 0.009903616271913052, "learning_rate": 1.8339083019228405e-07, "loss": 0.174, "num_tokens": 2470658.0, "reward": 0.4950000047683716, "reward_std": 0.5773791074752808, "rewards/reward_func/mean": 0.4950000047683716, "rewards/reward_func/std": 0.5773791670799255, "sampling/importance_sampling_ratio/max": 1.207619547843933, "sampling/importance_sampling_ratio/mean": 0.762014627456665, "sampling/importance_sampling_ratio/min": 0.08834663033485413, "sampling/sampling_logp_difference/max": 0.8467772006988525, "sampling/sampling_logp_difference/mean": 0.02456793375313282, "step": 882, "step_time": 23.45834313199157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 48.25, "completions/mean_terminated_length": 48.25, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1790204644203186, "epoch": 0.883, "frac_reward_zero_std": 0.0, "grad_norm": 1.4729374647140503, "kl": 0.006987810134887695, "learning_rate": 1.803590346253195e-07, "loss": -0.0512, "num_tokens": 2473340.0, "reward": 0.48750001192092896, "reward_std": 0.5921359658241272, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.592136025428772, "sampling/importance_sampling_ratio/max": 1.1181201934814453, "sampling/importance_sampling_ratio/mean": 1.0030815601348877, "sampling/importance_sampling_ratio/min": 0.8332188129425049, "sampling/sampling_logp_difference/max": 0.37771010398864746, "sampling/sampling_logp_difference/mean": 0.01761620305478573, "step": 883, "step_time": 29.227340273035225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 59.25, "completions/mean_terminated_length": 59.25, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.180127814412117, "epoch": 0.884, "frac_reward_zero_std": 0.0, "grad_norm": 1.4420219659805298, "kl": 0.01028655655682087, "learning_rate": 1.7735157099444594e-07, "loss": -0.1869, "num_tokens": 2476012.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.5773502588272095, "sampling/importance_sampling_ratio/max": 1.7068969011306763, "sampling/importance_sampling_ratio/mean": 1.1267677545547485, "sampling/importance_sampling_ratio/min": 0.8424950838088989, "sampling/sampling_logp_difference/max": 0.5154504776000977, "sampling/sampling_logp_difference/mean": 0.017424389719963074, "step": 884, "step_time": 14.768310878949706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.15297441184520721, "epoch": 0.885, "frac_reward_zero_std": 0.0, "grad_norm": 0.8255749344825745, "kl": 0.010701932944357395, "learning_rate": 1.7436847084653458e-07, "loss": -0.0385, "num_tokens": 2478687.0, "reward": 0.9975000023841858, "reward_std": 0.004999995231628418, "rewards/reward_func/mean": 0.9975000023841858, "rewards/reward_func/std": 0.004999995231628418, "sampling/importance_sampling_ratio/max": 0.8881320357322693, "sampling/importance_sampling_ratio/mean": 0.7205005884170532, "sampling/importance_sampling_ratio/min": 0.4162541329860687, "sampling/sampling_logp_difference/max": 0.581369161605835, "sampling/sampling_logp_difference/mean": 0.020469412207603455, "step": 885, "step_time": 14.893060111033265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.2043767124414444, "epoch": 0.886, "frac_reward_zero_std": 0.0, "grad_norm": 1.2387460470199585, "kl": 0.02092367596924305, "learning_rate": 1.7140976547289438e-07, "loss": 0.0749, "num_tokens": 2482179.0, "reward": 0.24000000953674316, "reward_std": 0.5066885948181152, "rewards/reward_func/mean": 0.24000000953674316, "rewards/reward_func/std": 0.5066885948181152, "sampling/importance_sampling_ratio/max": 1.2210900783538818, "sampling/importance_sampling_ratio/mean": 1.0079691410064697, "sampling/importance_sampling_ratio/min": 0.768981397151947, "sampling/sampling_logp_difference/max": 0.5036883354187012, "sampling/sampling_logp_difference/mean": 0.020835360512137413, "step": 886, "step_time": 42.84650323097594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.14825642108917236, "epoch": 0.887, "frac_reward_zero_std": 0.0, "grad_norm": 0.5784486532211304, "kl": 0.01739327609539032, "learning_rate": 1.6847548590894435e-07, "loss": -0.1255, "num_tokens": 2484772.0, "reward": 0.6899999976158142, "reward_std": 0.5737014412879944, "rewards/reward_func/mean": 0.6899999976158142, "rewards/reward_func/std": 0.5737014412879944, "sampling/importance_sampling_ratio/max": 1.518114686012268, "sampling/importance_sampling_ratio/mean": 0.7533466219902039, "sampling/importance_sampling_ratio/min": 0.1653176248073578, "sampling/sampling_logp_difference/max": 0.8092077970504761, "sampling/sampling_logp_difference/mean": 0.02420375868678093, "step": 887, "step_time": 19.143702121975366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.16579365730285645, "epoch": 0.888, "frac_reward_zero_std": 0.0, "grad_norm": 1.1091192960739136, "kl": 0.012846623547375202, "learning_rate": 1.6556566293388893e-07, "loss": -0.0025, "num_tokens": 2487312.0, "reward": 0.7325000166893005, "reward_std": 0.5084863305091858, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.5084863305091858, "sampling/importance_sampling_ratio/max": 1.2256481647491455, "sampling/importance_sampling_ratio/mean": 0.9665192365646362, "sampling/importance_sampling_ratio/min": 0.5391331315040588, "sampling/sampling_logp_difference/max": 0.32639122009277344, "sampling/sampling_logp_difference/mean": 0.015468714758753777, "step": 888, "step_time": 20.681648927973583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.16897249221801758, "epoch": 0.889, "frac_reward_zero_std": 0.0, "grad_norm": 0.8623301982879639, "kl": 0.025700649246573448, "learning_rate": 1.6268032707039362e-07, "loss": -0.0834, "num_tokens": 2490302.0, "reward": 0.7450000047683716, "reward_std": 0.5099999904632568, "rewards/reward_func/mean": 0.7450000047683716, "rewards/reward_func/std": 0.5099999904632568, "sampling/importance_sampling_ratio/max": 0.9438831210136414, "sampling/importance_sampling_ratio/mean": 0.5920731425285339, "sampling/importance_sampling_ratio/min": 0.3415127098560333, "sampling/sampling_logp_difference/max": 0.7291260957717896, "sampling/sampling_logp_difference/mean": 0.026099834591150284, "step": 889, "step_time": 33.02018096699612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.17846408486366272, "epoch": 0.89, "frac_reward_zero_std": 0.0, "grad_norm": 0.778485894203186, "kl": 0.010645871050655842, "learning_rate": 1.5981950858426715e-07, "loss": 0.0581, "num_tokens": 2493169.0, "reward": 0.7200000286102295, "reward_std": 0.5533534288406372, "rewards/reward_func/mean": 0.7200000286102295, "rewards/reward_func/std": 0.5533534288406372, "sampling/importance_sampling_ratio/max": 1.088789463043213, "sampling/importance_sampling_ratio/mean": 0.8295964598655701, "sampling/importance_sampling_ratio/min": 0.5614679455757141, "sampling/sampling_logp_difference/max": 0.4389670491218567, "sampling/sampling_logp_difference/mean": 0.016154268756508827, "step": 890, "step_time": 15.43386735196691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.17301303148269653, "epoch": 0.891, "frac_reward_zero_std": 0.0, "grad_norm": 1.202226996421814, "kl": 0.0064941952005028725, "learning_rate": 1.5698323748414123e-07, "loss": -0.2223, "num_tokens": 2495990.0, "reward": 0.47749999165534973, "reward_std": 0.6033447980880737, "rewards/reward_func/mean": 0.47749999165534973, "rewards/reward_func/std": 0.6033448576927185, "sampling/importance_sampling_ratio/max": 1.90369713306427, "sampling/importance_sampling_ratio/mean": 1.0678374767303467, "sampling/importance_sampling_ratio/min": 0.7373954057693481, "sampling/sampling_logp_difference/max": 0.45085835456848145, "sampling/sampling_logp_difference/mean": 0.016806060448288918, "step": 891, "step_time": 21.234188842994627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.17676737904548645, "epoch": 0.892, "frac_reward_zero_std": 0.0, "grad_norm": 0.9815454483032227, "kl": 0.010478569194674492, "learning_rate": 1.5417154352115742e-07, "loss": -0.0426, "num_tokens": 2498312.0, "reward": 0.6949999928474426, "reward_std": 0.5330103039741516, "rewards/reward_func/mean": 0.6949999928474426, "rewards/reward_func/std": 0.5330103635787964, "sampling/importance_sampling_ratio/max": 1.2848477363586426, "sampling/importance_sampling_ratio/mean": 1.0999393463134766, "sampling/importance_sampling_ratio/min": 0.7612694501876831, "sampling/sampling_logp_difference/max": 0.38756728172302246, "sampling/sampling_logp_difference/mean": 0.016511203721165657, "step": 892, "step_time": 21.70559118001256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.19306680560112, "epoch": 0.893, "frac_reward_zero_std": 0.0, "grad_norm": 1.079400897026062, "kl": 0.009146623313426971, "learning_rate": 1.5138445618865543e-07, "loss": 0.0752, "num_tokens": 2501502.0, "reward": 0.4925000071525574, "reward_std": 0.5860247015953064, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5860247015953064, "sampling/importance_sampling_ratio/max": 1.6614584922790527, "sampling/importance_sampling_ratio/mean": 1.0476223230361938, "sampling/importance_sampling_ratio/min": 0.8211492300033569, "sampling/sampling_logp_difference/max": 0.3724271059036255, "sampling/sampling_logp_difference/mean": 0.019510159268975258, "step": 893, "step_time": 32.512725042004604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.19767546653747559, "epoch": 0.894, "frac_reward_zero_std": 0.0, "grad_norm": 3.1151063442230225, "kl": 0.009232638403773308, "learning_rate": 1.48622004721862e-07, "loss": 0.5262, "num_tokens": 2504130.0, "reward": 0.737500011920929, "reward_std": 0.511753499507904, "rewards/reward_func/mean": 0.737500011920929, "rewards/reward_func/std": 0.511753499507904, "sampling/importance_sampling_ratio/max": 2.457738161087036, "sampling/importance_sampling_ratio/mean": 1.3262336254119873, "sampling/importance_sampling_ratio/min": 0.8517658710479736, "sampling/sampling_logp_difference/max": 0.6590301990509033, "sampling/sampling_logp_difference/mean": 0.01958930306136608, "step": 894, "step_time": 18.383160633966327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.19747808575630188, "epoch": 0.895, "frac_reward_zero_std": 0.0, "grad_norm": 0.7430298924446106, "kl": 0.005201663821935654, "learning_rate": 1.458842180975864e-07, "loss": -0.154, "num_tokens": 2506882.0, "reward": 0.9775000214576721, "reward_std": 0.03862208500504494, "rewards/reward_func/mean": 0.9775000214576721, "rewards/reward_func/std": 0.03862209618091583, "sampling/importance_sampling_ratio/max": 1.7250767946243286, "sampling/importance_sampling_ratio/mean": 1.0212225914001465, "sampling/importance_sampling_ratio/min": 0.6372237801551819, "sampling/sampling_logp_difference/max": 0.5026307106018066, "sampling/sampling_logp_difference/mean": 0.016069158911705017, "step": 895, "step_time": 20.139847173006274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.2134302407503128, "epoch": 0.896, "frac_reward_zero_std": 0.0, "grad_norm": 1.0690686702728271, "kl": 0.014967448078095913, "learning_rate": 1.4317112503391433e-07, "loss": -0.0911, "num_tokens": 2509831.0, "reward": 0.21250000596046448, "reward_std": 0.5209846496582031, "rewards/reward_func/mean": 0.21250000596046448, "rewards/reward_func/std": 0.5209846496582031, "sampling/importance_sampling_ratio/max": 0.9317091107368469, "sampling/importance_sampling_ratio/mean": 0.7741988301277161, "sampling/importance_sampling_ratio/min": 0.5902265906333923, "sampling/sampling_logp_difference/max": 0.4744560718536377, "sampling/sampling_logp_difference/mean": 0.02237253077328205, "step": 896, "step_time": 36.28402536496287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1644335687160492, "epoch": 0.897, "frac_reward_zero_std": 0.0, "grad_norm": 0.9327253699302673, "kl": 0.010979549959301949, "learning_rate": 1.4048275398990896e-07, "loss": -0.0408, "num_tokens": 2512717.0, "reward": 0.4925000071525574, "reward_std": 0.5745360851287842, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5745360851287842, "sampling/importance_sampling_ratio/max": 1.1944853067398071, "sampling/importance_sampling_ratio/mean": 0.8993010520935059, "sampling/importance_sampling_ratio/min": 0.5409674644470215, "sampling/sampling_logp_difference/max": 0.40773069858551025, "sampling/sampling_logp_difference/mean": 0.015442769974470139, "step": 897, "step_time": 22.171142959967256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 50.5, "completions/mean_terminated_length": 50.5, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.163870707154274, "epoch": 0.898, "frac_reward_zero_std": 0.0, "grad_norm": 1.1058582067489624, "kl": 0.007325515151023865, "learning_rate": 1.378191331653095e-07, "loss": 0.2229, "num_tokens": 2515323.0, "reward": 0.7400000095367432, "reward_std": 0.5133549571037292, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5133549571037292, "sampling/importance_sampling_ratio/max": 1.0454844236373901, "sampling/importance_sampling_ratio/mean": 0.7904455065727234, "sampling/importance_sampling_ratio/min": 0.4242042005062103, "sampling/sampling_logp_difference/max": 0.5364947319030762, "sampling/sampling_logp_difference/mean": 0.016825763508677483, "step": 898, "step_time": 14.715842578967568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 62.25, "completions/mean_terminated_length": 62.25, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "entropy": 0.19786080718040466, "epoch": 0.899, "frac_reward_zero_std": 0.0, "grad_norm": 0.8579813838005066, "kl": 0.010892621241509914, "learning_rate": 1.3518029050023862e-07, "loss": -0.0964, "num_tokens": 2518289.0, "reward": 0.737500011920929, "reward_std": 0.5249999761581421, "rewards/reward_func/mean": 0.737500011920929, "rewards/reward_func/std": 0.5250000357627869, "sampling/importance_sampling_ratio/max": 1.1443867683410645, "sampling/importance_sampling_ratio/mean": 1.0425585508346558, "sampling/importance_sampling_ratio/min": 0.8979613780975342, "sampling/sampling_logp_difference/max": 0.3586440086364746, "sampling/sampling_logp_difference/mean": 0.02215377800166607, "step": 899, "step_time": 26.91094323201105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.17444714903831482, "epoch": 0.9, "frac_reward_zero_std": 0.0, "grad_norm": 2.7653493881225586, "kl": 0.008227554149925709, "learning_rate": 1.32566253674907e-07, "loss": 0.145, "num_tokens": 2521074.0, "reward": 0.9975000023841858, "reward_std": 0.004999995231628418, "rewards/reward_func/mean": 0.9975000023841858, "rewards/reward_func/std": 0.004999995231628418, "sampling/importance_sampling_ratio/max": 1.0130928754806519, "sampling/importance_sampling_ratio/mean": 0.7396594882011414, "sampling/importance_sampling_ratio/min": 0.560860276222229, "sampling/sampling_logp_difference/max": 0.2428438663482666, "sampling/sampling_logp_difference/mean": 0.012215055525302887, "step": 900, "step_time": 18.3344317250303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.11433558166027069, "epoch": 0.901, "frac_reward_zero_std": 0.0, "grad_norm": 0.5804887413978577, "kl": 0.01429828628897667, "learning_rate": 1.2997705010932394e-07, "loss": 0.0661, "num_tokens": 2523949.0, "reward": 0.7450000047683716, "reward_std": 0.5099999904632568, "rewards/reward_func/mean": 0.7450000047683716, "rewards/reward_func/std": 0.5099999904632568, "sampling/importance_sampling_ratio/max": 0.8686103820800781, "sampling/importance_sampling_ratio/mean": 0.6810564994812012, "sampling/importance_sampling_ratio/min": 0.5040551424026489, "sampling/sampling_logp_difference/max": 0.601809024810791, "sampling/sampling_logp_difference/mean": 0.01641123741865158, "step": 901, "step_time": 27.763212127960287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1802811622619629, "epoch": 0.902, "frac_reward_zero_std": 0.0, "grad_norm": 2.3574025630950928, "kl": 0.012082415632903576, "learning_rate": 1.2741270696300962e-07, "loss": -0.0454, "num_tokens": 2527655.0, "reward": 0.4950000047683716, "reward_std": 0.5773791074752808, "rewards/reward_func/mean": 0.4950000047683716, "rewards/reward_func/std": 0.5773791670799255, "sampling/importance_sampling_ratio/max": 2.0554757118225098, "sampling/importance_sampling_ratio/mean": 1.3245303630828857, "sampling/importance_sampling_ratio/min": 0.5577307939529419, "sampling/sampling_logp_difference/max": 0.376781702041626, "sampling/sampling_logp_difference/mean": 0.026066191494464874, "step": 902, "step_time": 38.527266351971775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.16428142786026, "epoch": 0.903, "frac_reward_zero_std": 0.0, "grad_norm": 0.7906016111373901, "kl": 0.01040832232683897, "learning_rate": 1.2487325113471034e-07, "loss": 0.0393, "num_tokens": 2530499.0, "reward": 0.4650000035762787, "reward_std": 0.5780715346336365, "rewards/reward_func/mean": 0.4650000035762787, "rewards/reward_func/std": 0.5780714750289917, "sampling/importance_sampling_ratio/max": 0.8877028822898865, "sampling/importance_sampling_ratio/mean": 0.7055122256278992, "sampling/importance_sampling_ratio/min": 0.5284835696220398, "sampling/sampling_logp_difference/max": 1.175581693649292, "sampling/sampling_logp_difference/mean": 0.019426770508289337, "step": 903, "step_time": 24.80221261101542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.18620719015598297, "epoch": 0.904, "frac_reward_zero_std": 0.0, "grad_norm": 1.2570902109146118, "kl": 0.02420058473944664, "learning_rate": 1.223587092621162e-07, "loss": 0.0084, "num_tokens": 2533807.0, "reward": 0.75, "reward_std": 0.5, "rewards/reward_func/mean": 0.75, "rewards/reward_func/std": 0.5, "sampling/importance_sampling_ratio/max": 0.9070218205451965, "sampling/importance_sampling_ratio/mean": 0.6714438199996948, "sampling/importance_sampling_ratio/min": 0.2709958851337433, "sampling/sampling_logp_difference/max": 0.5930250287055969, "sampling/sampling_logp_difference/mean": 0.024358540773391724, "step": 904, "step_time": 32.014386316994205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.20406393706798553, "epoch": 0.905, "frac_reward_zero_std": 0.0, "grad_norm": 1.0972397327423096, "kl": 0.019740739837288857, "learning_rate": 1.1986910772158106e-07, "loss": 0.1678, "num_tokens": 2536878.0, "reward": 0.19499999284744263, "reward_std": 0.53724604845047, "rewards/reward_func/mean": 0.19499999284744263, "rewards/reward_func/std": 0.53724604845047, "sampling/importance_sampling_ratio/max": 1.6062026023864746, "sampling/importance_sampling_ratio/mean": 1.018428087234497, "sampling/importance_sampling_ratio/min": 0.537709653377533, "sampling/sampling_logp_difference/max": 0.5726181268692017, "sampling/sampling_logp_difference/mean": 0.028355227783322334, "step": 905, "step_time": 28.060112708015367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 58.25, "completions/mean_terminated_length": 58.25, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.20361456274986267, "epoch": 0.906, "frac_reward_zero_std": 0.0, "grad_norm": 2.099759817123413, "kl": 0.018394675105810165, "learning_rate": 1.1740447262784782e-07, "loss": -0.1996, "num_tokens": 2539869.0, "reward": 0.4350000023841858, "reward_std": 0.6466580629348755, "rewards/reward_func/mean": 0.4350000023841858, "rewards/reward_func/std": 0.6466580629348755, "sampling/importance_sampling_ratio/max": 2.2624478340148926, "sampling/importance_sampling_ratio/mean": 1.5161234140396118, "sampling/importance_sampling_ratio/min": 0.8960333466529846, "sampling/sampling_logp_difference/max": 0.5580052137374878, "sampling/sampling_logp_difference/mean": 0.026495207101106644, "step": 906, "step_time": 40.91487336799037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.1643884778022766, "epoch": 0.907, "frac_reward_zero_std": 0.0, "grad_norm": 0.5478804707527161, "kl": 0.016903769224882126, "learning_rate": 1.1496482983377189e-07, "loss": -0.2446, "num_tokens": 2542337.0, "reward": 0.7350000143051147, "reward_std": 0.5233545899391174, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.5233545899391174, "sampling/importance_sampling_ratio/max": 1.6779882907867432, "sampling/importance_sampling_ratio/mean": 0.9607352018356323, "sampling/importance_sampling_ratio/min": 0.42130976915359497, "sampling/sampling_logp_difference/max": 0.5931775569915771, "sampling/sampling_logp_difference/mean": 0.019127285107970238, "step": 907, "step_time": 12.933350970968604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.2063135951757431, "epoch": 0.908, "frac_reward_zero_std": 0.0, "grad_norm": 1.1989037990570068, "kl": 0.017035244032740593, "learning_rate": 1.125502049300517e-07, "loss": 0.0021, "num_tokens": 2545123.0, "reward": 0.4699999988079071, "reward_std": 0.6119912266731262, "rewards/reward_func/mean": 0.4699999988079071, "rewards/reward_func/std": 0.6119912266731262, "sampling/importance_sampling_ratio/max": 1.0458905696868896, "sampling/importance_sampling_ratio/mean": 0.9481939673423767, "sampling/importance_sampling_ratio/min": 0.7635882496833801, "sampling/sampling_logp_difference/max": 0.23912334442138672, "sampling/sampling_logp_difference/mean": 0.019931582733988762, "step": 908, "step_time": 23.008077480015345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.145658940076828, "epoch": 0.909, "frac_reward_zero_std": 0.0, "grad_norm": 0.7371108531951904, "kl": 0.01708037592470646, "learning_rate": 1.1016062324496007e-07, "loss": -0.0265, "num_tokens": 2547581.0, "reward": 0.7400000095367432, "reward_std": 0.5001999735832214, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5001999735832214, "sampling/importance_sampling_ratio/max": 1.225244164466858, "sampling/importance_sampling_ratio/mean": 0.8548307418823242, "sampling/importance_sampling_ratio/min": 0.5429481863975525, "sampling/sampling_logp_difference/max": 0.4792482852935791, "sampling/sampling_logp_difference/mean": 0.01723451167345047, "step": 909, "step_time": 14.232584331010003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1658802330493927, "epoch": 0.91, "frac_reward_zero_std": 0.0, "grad_norm": 1.828809380531311, "kl": 0.01064710970968008, "learning_rate": 1.0779610984407773e-07, "loss": -0.0654, "num_tokens": 2550870.0, "reward": 0.44999998807907104, "reward_std": 0.6351377964019775, "rewards/reward_func/mean": 0.44999998807907104, "rewards/reward_func/std": 0.6351377964019775, "sampling/importance_sampling_ratio/max": 1.2774412631988525, "sampling/importance_sampling_ratio/mean": 0.9552534222602844, "sampling/importance_sampling_ratio/min": 0.5611850619316101, "sampling/sampling_logp_difference/max": 0.25612592697143555, "sampling/sampling_logp_difference/mean": 0.014152579940855503, "step": 910, "step_time": 34.918630864005536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.15069162845611572, "epoch": 0.911, "frac_reward_zero_std": 0.0, "grad_norm": 1.9571788311004639, "kl": 0.010387495160102844, "learning_rate": 1.054566895300324e-07, "loss": 0.4626, "num_tokens": 2553673.0, "reward": 0.7400000095367432, "reward_std": 0.5067543387413025, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5067543983459473, "sampling/importance_sampling_ratio/max": 1.9957287311553955, "sampling/importance_sampling_ratio/mean": 1.0585459470748901, "sampling/importance_sampling_ratio/min": 0.6162201166152954, "sampling/sampling_logp_difference/max": 0.427293062210083, "sampling/sampling_logp_difference/mean": 0.014001214876770973, "step": 911, "step_time": 19.529199623968452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.2186240404844284, "epoch": 0.912, "frac_reward_zero_std": 0.0, "grad_norm": 1.2644553184509277, "kl": 0.015319534577429295, "learning_rate": 1.0314238684223515e-07, "loss": 0.0557, "num_tokens": 2556212.0, "reward": 0.1875, "reward_std": 0.5448776483535767, "rewards/reward_func/mean": 0.1875, "rewards/reward_func/std": 0.5448776483535767, "sampling/importance_sampling_ratio/max": 1.4915682077407837, "sampling/importance_sampling_ratio/mean": 0.9650976657867432, "sampling/importance_sampling_ratio/min": 0.46030107140541077, "sampling/sampling_logp_difference/max": 0.42666518688201904, "sampling/sampling_logp_difference/mean": 0.021374935284256935, "step": 912, "step_time": 25.304939417983405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.1870374083518982, "epoch": 0.913, "frac_reward_zero_std": 0.0, "grad_norm": 1.3616061210632324, "kl": 0.01852884516119957, "learning_rate": 1.0085322605662668e-07, "loss": 0.4222, "num_tokens": 2558865.0, "reward": 0.7224999666213989, "reward_std": 0.5483536124229431, "rewards/reward_func/mean": 0.7224999666213989, "rewards/reward_func/std": 0.5483536124229431, "sampling/importance_sampling_ratio/max": 1.9145963191986084, "sampling/importance_sampling_ratio/mean": 1.074430227279663, "sampling/importance_sampling_ratio/min": 0.6627939939498901, "sampling/sampling_logp_difference/max": 0.48944807052612305, "sampling/sampling_logp_difference/mean": 0.023457471281290054, "step": 913, "step_time": 28.166375430999324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.17751184105873108, "epoch": 0.914, "frac_reward_zero_std": 0.0, "grad_norm": 1.2660775184631348, "kl": 0.01302144955843687, "learning_rate": 9.858923118542003e-08, "loss": 0.2114, "num_tokens": 2561885.0, "reward": 0.7425000071525574, "reward_std": 0.5016888380050659, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.5016888380050659, "sampling/importance_sampling_ratio/max": 0.9896434545516968, "sampling/importance_sampling_ratio/mean": 0.7217333316802979, "sampling/importance_sampling_ratio/min": 0.4005754888057709, "sampling/sampling_logp_difference/max": 0.5207786560058594, "sampling/sampling_logp_difference/mean": 0.022219877690076828, "step": 914, "step_time": 21.4612360370229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.20032328367233276, "epoch": 0.915, "frac_reward_zero_std": 1.0, "grad_norm": 0.008858255110681057, "kl": 0.012292904779314995, "learning_rate": 9.635042597685024e-08, "loss": 0.0001, "num_tokens": 2564730.0, "reward": 1.0, "reward_std": 0.0, "rewards/reward_func/mean": 1.0, "rewards/reward_func/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4761626720428467, "sampling/importance_sampling_ratio/mean": 0.8136147260665894, "sampling/importance_sampling_ratio/min": 0.39488837122917175, "sampling/sampling_logp_difference/max": 0.5099375247955322, "sampling/sampling_logp_difference/mean": 0.021624604240059853, "step": 915, "step_time": 14.058131219993811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 58.25, "completions/mean_terminated_length": 58.25, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.18031403422355652, "epoch": 0.916, "frac_reward_zero_std": 0.0, "grad_norm": 1.005853295326233, "kl": 0.014428880997002125, "learning_rate": 9.413683391492456e-08, "loss": 0.1739, "num_tokens": 2567466.0, "reward": 0.23000000417232513, "reward_std": 0.5069516897201538, "rewards/reward_func/mean": 0.23000000417232513, "rewards/reward_func/std": 0.5069516897201538, "sampling/importance_sampling_ratio/max": 1.4600939750671387, "sampling/importance_sampling_ratio/mean": 1.1059681177139282, "sampling/importance_sampling_ratio/min": 0.7283660769462585, "sampling/sampling_logp_difference/max": 0.3184471130371094, "sampling/sampling_logp_difference/mean": 0.017616255208849907, "step": 916, "step_time": 34.3167498880066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.18121924996376038, "epoch": 0.917, "frac_reward_zero_std": 0.0, "grad_norm": 0.8015077710151672, "kl": 0.00418004859238863, "learning_rate": 9.194847821917624e-08, "loss": -0.0049, "num_tokens": 2569907.0, "reward": 0.7200000286102295, "reward_std": 0.5533534288406372, "rewards/reward_func/mean": 0.7200000286102295, "rewards/reward_func/std": 0.5533534288406372, "sampling/importance_sampling_ratio/max": 1.0482330322265625, "sampling/importance_sampling_ratio/mean": 0.9389315843582153, "sampling/importance_sampling_ratio/min": 0.8206279277801514, "sampling/sampling_logp_difference/max": 0.3340888023376465, "sampling/sampling_logp_difference/mean": 0.010968475602567196, "step": 917, "step_time": 17.937460044049658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 55.25, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.1772686094045639, "epoch": 0.918, "frac_reward_zero_std": 0.0, "grad_norm": 1.175344467163086, "kl": 0.033087823539972305, "learning_rate": 8.978538184442137e-08, "loss": -0.0554, "num_tokens": 2573065.0, "reward": 0.23750001192092896, "reward_std": 0.5016888380050659, "rewards/reward_func/mean": 0.23750001192092896, "rewards/reward_func/std": 0.5016888380050659, "sampling/importance_sampling_ratio/max": 0.9661633372306824, "sampling/importance_sampling_ratio/mean": 0.7382379174232483, "sampling/importance_sampling_ratio/min": 0.4945218563079834, "sampling/sampling_logp_difference/max": 1.3311738967895508, "sampling/sampling_logp_difference/mean": 0.027402378618717194, "step": 918, "step_time": 28.220914450008422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.5, "completions/mean_terminated_length": 54.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.21350699663162231, "epoch": 0.919, "frac_reward_zero_std": 0.0, "grad_norm": 0.9660778641700745, "kl": 0.014310196042060852, "learning_rate": 8.764756748051661e-08, "loss": 0.2837, "num_tokens": 2575704.0, "reward": 0.20250000059604645, "reward_std": 0.5330650806427002, "rewards/reward_func/mean": 0.20250000059604645, "rewards/reward_func/std": 0.5330650806427002, "sampling/importance_sampling_ratio/max": 1.8726282119750977, "sampling/importance_sampling_ratio/mean": 1.1237609386444092, "sampling/importance_sampling_ratio/min": 0.7029232382774353, "sampling/sampling_logp_difference/max": 0.4606945514678955, "sampling/sampling_logp_difference/mean": 0.021394915878772736, "step": 919, "step_time": 27.901702003960963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.15217752754688263, "epoch": 0.92, "frac_reward_zero_std": 0.0, "grad_norm": 0.6883860230445862, "kl": 0.0072504435665905476, "learning_rate": 8.553505755212383e-08, "loss": -0.2287, "num_tokens": 2578783.0, "reward": 0.4950000047683716, "reward_std": 0.5831809639930725, "rewards/reward_func/mean": 0.4950000047683716, "rewards/reward_func/std": 0.5831809043884277, "sampling/importance_sampling_ratio/max": 1.1464307308197021, "sampling/importance_sampling_ratio/mean": 0.8924072980880737, "sampling/importance_sampling_ratio/min": 0.5202386975288391, "sampling/sampling_logp_difference/max": 0.30778300762176514, "sampling/sampling_logp_difference/mean": 0.01506855245679617, "step": 920, "step_time": 33.93807952298084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.14090043306350708, "epoch": 0.921, "frac_reward_zero_std": 0.0, "grad_norm": 0.8395913243293762, "kl": 0.01654968410730362, "learning_rate": 8.344787421847216e-08, "loss": -0.0512, "num_tokens": 2581708.0, "reward": 0.7300000190734863, "reward_std": 0.5399999618530273, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5399999618530273, "sampling/importance_sampling_ratio/max": 1.2446871995925903, "sampling/importance_sampling_ratio/mean": 0.8225692510604858, "sampling/importance_sampling_ratio/min": 0.558769941329956, "sampling/sampling_logp_difference/max": 0.388421893119812, "sampling/sampling_logp_difference/mean": 0.017449436709284782, "step": 921, "step_time": 21.01245054102037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.17869456112384796, "epoch": 0.922, "frac_reward_zero_std": 0.0, "grad_norm": 0.7214816808700562, "kl": 0.008024708367884159, "learning_rate": 8.138603937312722e-08, "loss": 0.1769, "num_tokens": 2584169.0, "reward": 0.21250000596046448, "reward_std": 0.5255711078643799, "rewards/reward_func/mean": 0.21250000596046448, "rewards/reward_func/std": 0.5255711674690247, "sampling/importance_sampling_ratio/max": 1.8020139932632446, "sampling/importance_sampling_ratio/mean": 0.9173101782798767, "sampling/importance_sampling_ratio/min": 0.4788658618927002, "sampling/sampling_logp_difference/max": 0.44510912895202637, "sampling/sampling_logp_difference/mean": 0.019088655710220337, "step": 922, "step_time": 20.026782102009747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.19499889016151428, "epoch": 0.923, "frac_reward_zero_std": 0.0, "grad_norm": 1.3483039140701294, "kl": 0.011548785492777824, "learning_rate": 7.934957464376059e-08, "loss": 0.2457, "num_tokens": 2586749.0, "reward": 0.4724999964237213, "reward_std": 0.5987417101860046, "rewards/reward_func/mean": 0.4724999964237213, "rewards/reward_func/std": 0.5987417697906494, "sampling/importance_sampling_ratio/max": 1.2007627487182617, "sampling/importance_sampling_ratio/mean": 0.9890279173851013, "sampling/importance_sampling_ratio/min": 0.5706509947776794, "sampling/sampling_logp_difference/max": 0.4427284002304077, "sampling/sampling_logp_difference/mean": 0.019043611362576485, "step": 923, "step_time": 29.437689348997083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 58.5, "completions/mean_terminated_length": 58.5, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.14362652599811554, "epoch": 0.924, "frac_reward_zero_std": 0.0, "grad_norm": 2.643855333328247, "kl": 0.025141166523098946, "learning_rate": 7.733850139192395e-08, "loss": 0.4442, "num_tokens": 2590227.0, "reward": 0.9975000023841858, "reward_std": 0.004999995231628418, "rewards/reward_func/mean": 0.9975000023841858, "rewards/reward_func/std": 0.004999995231628418, "sampling/importance_sampling_ratio/max": 2.313176155090332, "sampling/importance_sampling_ratio/mean": 1.1793012619018555, "sampling/importance_sampling_ratio/min": 0.33150434494018555, "sampling/sampling_logp_difference/max": 0.8359098434448242, "sampling/sampling_logp_difference/mean": 0.023788778111338615, "step": 924, "step_time": 18.363438031054102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.2107991874217987, "epoch": 0.925, "frac_reward_zero_std": 0.0, "grad_norm": 1.392457127571106, "kl": 0.010058220475912094, "learning_rate": 7.535284071282456e-08, "loss": 0.0108, "num_tokens": 2593088.0, "reward": 0.7475000023841858, "reward_std": 0.4983556270599365, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.4983556270599365, "sampling/importance_sampling_ratio/max": 1.5532426834106445, "sampling/importance_sampling_ratio/mean": 1.2364990711212158, "sampling/importance_sampling_ratio/min": 0.7275170087814331, "sampling/sampling_logp_difference/max": 0.44884419441223145, "sampling/sampling_logp_difference/mean": 0.027916453778743744, "step": 925, "step_time": 16.87041204998968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.16754765808582306, "epoch": 0.926, "frac_reward_zero_std": 0.0, "grad_norm": 0.759800910949707, "kl": 0.006207658909261227, "learning_rate": 7.339261343510207e-08, "loss": 0.0665, "num_tokens": 2596124.0, "reward": 0.2224999964237213, "reward_std": 0.518676221370697, "rewards/reward_func/mean": 0.2224999964237213, "rewards/reward_func/std": 0.518676221370697, "sampling/importance_sampling_ratio/max": 1.2094924449920654, "sampling/importance_sampling_ratio/mean": 0.8306597471237183, "sampling/importance_sampling_ratio/min": 0.5585423111915588, "sampling/sampling_logp_difference/max": 0.34546518325805664, "sampling/sampling_logp_difference/mean": 0.017543092370033264, "step": 926, "step_time": 36.00333207799122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.1769437938928604, "epoch": 0.927, "frac_reward_zero_std": 0.0, "grad_norm": 2.0319082736968994, "kl": 0.00691180769354105, "learning_rate": 7.145784012061424e-08, "loss": -0.3151, "num_tokens": 2598843.0, "reward": 0.4925000071525574, "reward_std": 0.5860247015953064, "rewards/reward_func/mean": 0.4925000071525574, "rewards/reward_func/std": 0.5860247015953064, "sampling/importance_sampling_ratio/max": 2.3194620609283447, "sampling/importance_sampling_ratio/mean": 1.176166296005249, "sampling/importance_sampling_ratio/min": 0.689520537853241, "sampling/sampling_logp_difference/max": 0.6964869499206543, "sampling/sampling_logp_difference/mean": 0.01907685585319996, "step": 927, "step_time": 22.442898111999966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.18492965400218964, "epoch": 0.928, "frac_reward_zero_std": 0.0, "grad_norm": 0.8374162316322327, "kl": 0.0254388265311718, "learning_rate": 6.954854106421715e-08, "loss": -0.2557, "num_tokens": 2601378.0, "reward": 0.7400000095367432, "reward_std": 0.5067543387413025, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5067543983459473, "sampling/importance_sampling_ratio/max": 1.6302472352981567, "sampling/importance_sampling_ratio/mean": 0.9240012168884277, "sampling/importance_sampling_ratio/min": 0.4317037761211395, "sampling/sampling_logp_difference/max": 0.31932520866394043, "sampling/sampling_logp_difference/mean": 0.015971792861819267, "step": 928, "step_time": 16.671337324019987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.13488560914993286, "epoch": 0.929, "frac_reward_zero_std": 0.0, "grad_norm": 1.1040288209915161, "kl": 0.022899333387613297, "learning_rate": 6.766473629355453e-08, "loss": -0.2027, "num_tokens": 2604041.0, "reward": 0.7250000238418579, "reward_std": 0.543353796005249, "rewards/reward_func/mean": 0.7250000238418579, "rewards/reward_func/std": 0.543353796005249, "sampling/importance_sampling_ratio/max": 1.4865798950195312, "sampling/importance_sampling_ratio/mean": 0.9618017077445984, "sampling/importance_sampling_ratio/min": 0.5308647155761719, "sampling/sampling_logp_difference/max": 0.6950149536132812, "sampling/sampling_logp_difference/mean": 0.01799299195408821, "step": 929, "step_time": 22.693090984015726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 77.0, "completions/max_terminated_length": 77.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.21891376376152039, "epoch": 0.93, "frac_reward_zero_std": 0.0, "grad_norm": 1.08028244972229, "kl": 0.014089684002101421, "learning_rate": 6.580644556884703e-08, "loss": 0.0544, "num_tokens": 2606850.0, "reward": 0.46000000834465027, "reward_std": 0.6248733401298523, "rewards/reward_func/mean": 0.46000000834465027, "rewards/reward_func/std": 0.6248733401298523, "sampling/importance_sampling_ratio/max": 1.0528634786605835, "sampling/importance_sampling_ratio/mean": 0.6258430480957031, "sampling/importance_sampling_ratio/min": 0.28754159808158875, "sampling/sampling_logp_difference/max": 0.627814769744873, "sampling/sampling_logp_difference/mean": 0.0288833100348711, "step": 930, "step_time": 26.43638706096681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 45.5, "completions/mean_terminated_length": 45.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.17616288363933563, "epoch": 0.931, "frac_reward_zero_std": 0.0, "grad_norm": 0.9554207921028137, "kl": 0.04741973802447319, "learning_rate": 6.397368838268497e-08, "loss": 0.1973, "num_tokens": 2609737.0, "reward": 0.20499999821186066, "reward_std": 0.5316954255104065, "rewards/reward_func/mean": 0.20499999821186066, "rewards/reward_func/std": 0.5316954255104065, "sampling/importance_sampling_ratio/max": 0.7709299921989441, "sampling/importance_sampling_ratio/mean": 0.5791674852371216, "sampling/importance_sampling_ratio/min": 0.2073347270488739, "sampling/sampling_logp_difference/max": 1.2865241765975952, "sampling/sampling_logp_difference/mean": 0.0269815381616354, "step": 931, "step_time": 25.351928464020602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 53.25, "completions/mean_terminated_length": 53.25, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.2172524482011795, "epoch": 0.932, "frac_reward_zero_std": 0.0, "grad_norm": 1.1331610679626465, "kl": 0.03654748946428299, "learning_rate": 6.216648395982377e-08, "loss": 0.0206, "num_tokens": 2612920.0, "reward": 0.7200000286102295, "reward_std": 0.48083260655403137, "rewards/reward_func/mean": 0.7200000286102295, "rewards/reward_func/std": 0.480832576751709, "sampling/importance_sampling_ratio/max": 0.797945499420166, "sampling/importance_sampling_ratio/mean": 0.6741539835929871, "sampling/importance_sampling_ratio/min": 0.47907134890556335, "sampling/sampling_logp_difference/max": 0.6732114553451538, "sampling/sampling_logp_difference/mean": 0.027315547689795494, "step": 932, "step_time": 31.617669166997075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 69.0, "completions/max_terminated_length": 69.0, "completions/mean_length": 62.5, "completions/mean_terminated_length": 62.5, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.23949222266674042, "epoch": 0.933, "frac_reward_zero_std": 0.0, "grad_norm": 1.3444561958312988, "kl": 0.01180461049079895, "learning_rate": 6.038485125698296e-08, "loss": -0.3617, "num_tokens": 2615361.0, "reward": 0.4724999964237213, "reward_std": 0.6097745299339294, "rewards/reward_func/mean": 0.4724999964237213, "rewards/reward_func/std": 0.6097745895385742, "sampling/importance_sampling_ratio/max": 2.0560591220855713, "sampling/importance_sampling_ratio/mean": 0.9776567816734314, "sampling/importance_sampling_ratio/min": 0.28266316652297974, "sampling/sampling_logp_difference/max": 0.769321084022522, "sampling/sampling_logp_difference/mean": 0.025995362550020218, "step": 933, "step_time": 19.758495582966134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.2122965008020401, "epoch": 0.934, "frac_reward_zero_std": 0.0, "grad_norm": 0.934613823890686, "kl": 0.013374313712120056, "learning_rate": 5.862880896264689e-08, "loss": -0.2606, "num_tokens": 2618134.0, "reward": 0.7300000190734863, "reward_std": 0.48812565207481384, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.48812568187713623, "sampling/importance_sampling_ratio/max": 1.449279546737671, "sampling/importance_sampling_ratio/mean": 0.8840512633323669, "sampling/importance_sampling_ratio/min": 0.4978399872779846, "sampling/sampling_logp_difference/max": 0.6694231033325195, "sampling/sampling_logp_difference/mean": 0.021857792511582375, "step": 934, "step_time": 23.48470835702028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 48.0, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.17245744168758392, "epoch": 0.935, "frac_reward_zero_std": 0.0, "grad_norm": 0.9897481203079224, "kl": 0.006558006163686514, "learning_rate": 5.6898375496867444e-08, "loss": 0.0546, "num_tokens": 2621167.0, "reward": 0.9925000071525574, "reward_std": 0.00957426242530346, "rewards/reward_func/mean": 0.9925000071525574, "rewards/reward_func/std": 0.00957426242530346, "sampling/importance_sampling_ratio/max": 1.7692008018493652, "sampling/importance_sampling_ratio/mean": 1.048946738243103, "sampling/importance_sampling_ratio/min": 0.5507159233093262, "sampling/sampling_logp_difference/max": 0.7558201551437378, "sampling/sampling_logp_difference/mean": 0.027090176939964294, "step": 935, "step_time": 28.094421893998515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "entropy": 0.16092617809772491, "epoch": 0.936, "frac_reward_zero_std": 0.0, "grad_norm": 1.0825340747833252, "kl": 0.008682183921337128, "learning_rate": 5.519356901107359e-08, "loss": 0.0363, "num_tokens": 2624510.0, "reward": 0.7350000143051147, "reward_std": 0.5233545899391174, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.5233545899391174, "sampling/importance_sampling_ratio/max": 1.3980042934417725, "sampling/importance_sampling_ratio/mean": 1.1464983224868774, "sampling/importance_sampling_ratio/min": 1.008982539176941, "sampling/sampling_logp_difference/max": 0.574749231338501, "sampling/sampling_logp_difference/mean": 0.022331232205033302, "step": 936, "step_time": 26.313102012034506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.18584692478179932, "epoch": 0.937, "frac_reward_zero_std": 0.0, "grad_norm": 1.4439955949783325, "kl": 0.01079943124204874, "learning_rate": 5.3514407387877946e-08, "loss": -0.0262, "num_tokens": 2627060.0, "reward": 0.7050000429153442, "reward_std": 0.5300629138946533, "rewards/reward_func/mean": 0.7050000429153442, "rewards/reward_func/std": 0.5300629138946533, "sampling/importance_sampling_ratio/max": 1.7478950023651123, "sampling/importance_sampling_ratio/mean": 1.2674968242645264, "sampling/importance_sampling_ratio/min": 0.5967512130737305, "sampling/sampling_logp_difference/max": 0.5355415344238281, "sampling/sampling_logp_difference/mean": 0.029074301943182945, "step": 937, "step_time": 24.065505895996466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.2037249505519867, "epoch": 0.938, "frac_reward_zero_std": 0.0, "grad_norm": 1.1504424810409546, "kl": 0.013578691519796848, "learning_rate": 5.186090824089218e-08, "loss": -0.0794, "num_tokens": 2629837.0, "reward": 0.9975000023841858, "reward_std": 0.004999995231628418, "rewards/reward_func/mean": 0.9975000023841858, "rewards/reward_func/std": 0.004999995231628418, "sampling/importance_sampling_ratio/max": 1.4203205108642578, "sampling/importance_sampling_ratio/mean": 0.9351270198822021, "sampling/importance_sampling_ratio/min": 0.7251021862030029, "sampling/sampling_logp_difference/max": 0.6045564413070679, "sampling/sampling_logp_difference/mean": 0.0227554552257061, "step": 938, "step_time": 23.786719072959386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 61.0, "completions/max_terminated_length": 61.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.14837254583835602, "epoch": 0.939, "frac_reward_zero_std": 0.0, "grad_norm": 0.8044763803482056, "kl": 0.02026260457932949, "learning_rate": 5.023308891453915e-08, "loss": 0.2306, "num_tokens": 2633227.0, "reward": 0.23250000178813934, "reward_std": 0.512079119682312, "rewards/reward_func/mean": 0.23250000178813934, "rewards/reward_func/std": 0.512079119682312, "sampling/importance_sampling_ratio/max": 1.3292521238327026, "sampling/importance_sampling_ratio/mean": 0.764962911605835, "sampling/importance_sampling_ratio/min": 0.3978922367095947, "sampling/sampling_logp_difference/max": 0.6411890983581543, "sampling/sampling_logp_difference/mean": 0.023921946063637733, "step": 939, "step_time": 30.130391088023316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 58.0, "completions/mean_terminated_length": 58.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.19525086879730225, "epoch": 0.94, "frac_reward_zero_std": 0.0, "grad_norm": 1.673285961151123, "kl": 0.008912093006074429, "learning_rate": 4.8630966483873834e-08, "loss": 0.0902, "num_tokens": 2636197.0, "reward": 0.4424999952316284, "reward_std": 0.6484018564224243, "rewards/reward_func/mean": 0.4424999952316284, "rewards/reward_func/std": 0.6484019160270691, "sampling/importance_sampling_ratio/max": 2.1329214572906494, "sampling/importance_sampling_ratio/mean": 1.510030746459961, "sampling/importance_sampling_ratio/min": 1.0331834554672241, "sampling/sampling_logp_difference/max": 0.312838077545166, "sampling/sampling_logp_difference/mean": 0.022833071649074554, "step": 940, "step_time": 39.004038679995574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "entropy": 0.1482611447572708, "epoch": 0.941, "frac_reward_zero_std": 0.0, "grad_norm": 0.8069556951522827, "kl": 0.015681635588407516, "learning_rate": 4.705455775440237e-08, "loss": 0.0929, "num_tokens": 2638831.0, "reward": 0.7224999666213989, "reward_std": 0.5550000071525574, "rewards/reward_func/mean": 0.7224999666213989, "rewards/reward_func/std": 0.5550000071525574, "sampling/importance_sampling_ratio/max": 1.1581465005874634, "sampling/importance_sampling_ratio/mean": 0.7917067408561707, "sampling/importance_sampling_ratio/min": 0.3518144190311432, "sampling/sampling_logp_difference/max": 0.7096753120422363, "sampling/sampling_logp_difference/mean": 0.017478279769420624, "step": 941, "step_time": 16.332119441009127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 56.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.15849155187606812, "epoch": 0.942, "frac_reward_zero_std": 0.0, "grad_norm": 1.9494924545288086, "kl": 0.012438765726983547, "learning_rate": 4.5503879261906134e-08, "loss": -0.3572, "num_tokens": 2641389.0, "reward": 0.19499999284744263, "reward_std": 0.530440092086792, "rewards/reward_func/mean": 0.19499999284744263, "rewards/reward_func/std": 0.530440092086792, "sampling/importance_sampling_ratio/max": 1.6654763221740723, "sampling/importance_sampling_ratio/mean": 0.9383490085601807, "sampling/importance_sampling_ratio/min": 0.6652641892433167, "sampling/sampling_logp_difference/max": 0.42760300636291504, "sampling/sampling_logp_difference/mean": 0.01552487351000309, "step": 942, "step_time": 28.212723647011444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 56.75, "completions/mean_terminated_length": 56.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.17265716195106506, "epoch": 0.943, "frac_reward_zero_std": 0.0, "grad_norm": 1.0694712400436401, "kl": 0.01448520552366972, "learning_rate": 4.397894727226931e-08, "loss": 0.0677, "num_tokens": 2644061.0, "reward": 0.4675000011920929, "reward_std": 0.6171641945838928, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.6171642541885376, "sampling/importance_sampling_ratio/max": 1.441677212715149, "sampling/importance_sampling_ratio/mean": 0.8101711273193359, "sampling/importance_sampling_ratio/min": 0.28117048740386963, "sampling/sampling_logp_difference/max": 0.4225585460662842, "sampling/sampling_logp_difference/mean": 0.02110959403216839, "step": 943, "step_time": 21.679403031012043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 49.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.16215163469314575, "epoch": 0.944, "frac_reward_zero_std": 0.0, "grad_norm": 0.7921115756034851, "kl": 0.005248079542070627, "learning_rate": 4.247977778130602e-08, "loss": -0.0718, "num_tokens": 2647093.0, "reward": 0.4950000047683716, "reward_std": 0.5831237435340881, "rewards/reward_func/mean": 0.4950000047683716, "rewards/reward_func/std": 0.5831238031387329, "sampling/importance_sampling_ratio/max": 0.9830775260925293, "sampling/importance_sampling_ratio/mean": 0.6630570292472839, "sampling/importance_sampling_ratio/min": 0.5440065860748291, "sampling/sampling_logp_difference/max": 0.5548315048217773, "sampling/sampling_logp_difference/mean": 0.02361236698925495, "step": 944, "step_time": 34.43920037901262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 49.25, "completions/mean_terminated_length": 49.25, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.1933690309524536, "epoch": 0.945, "frac_reward_zero_std": 0.0, "grad_norm": 1.7078641653060913, "kl": 0.013330507092177868, "learning_rate": 4.100638651459543e-08, "loss": 0.2571, "num_tokens": 2649865.0, "reward": 0.2224999964237213, "reward_std": 0.5192542672157288, "rewards/reward_func/mean": 0.2224999964237213, "rewards/reward_func/std": 0.5192543268203735, "sampling/importance_sampling_ratio/max": 2.384737730026245, "sampling/importance_sampling_ratio/mean": 1.1281895637512207, "sampling/importance_sampling_ratio/min": 0.3522225022315979, "sampling/sampling_logp_difference/max": 0.5812180042266846, "sampling/sampling_logp_difference/mean": 0.01747589185833931, "step": 945, "step_time": 23.78452030097833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.158755823969841, "epoch": 0.946, "frac_reward_zero_std": 0.0, "grad_norm": 2.2925760746002197, "kl": 0.016519293189048767, "learning_rate": 3.955878892731441e-08, "loss": 0.468, "num_tokens": 2653068.0, "reward": -0.04749999940395355, "reward_std": 0.04425306245684624, "rewards/reward_func/mean": -0.04749999940395355, "rewards/reward_func/std": 0.04425306245684624, "sampling/importance_sampling_ratio/max": 2.926091194152832, "sampling/importance_sampling_ratio/mean": 1.322880506515503, "sampling/importance_sampling_ratio/min": 0.5722647309303284, "sampling/sampling_logp_difference/max": 0.4417976140975952, "sampling/sampling_logp_difference/mean": 0.019868051633238792, "step": 946, "step_time": 31.283426257024985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 47.25, "completions/mean_terminated_length": 47.25, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "entropy": 0.24776135385036469, "epoch": 0.947, "frac_reward_zero_std": 0.0, "grad_norm": 3.7177891731262207, "kl": 0.021586380898952484, "learning_rate": 3.813700020407707e-08, "loss": -1.2517, "num_tokens": 2656028.0, "reward": 0.16249999403953552, "reward_std": 0.5634639859199524, "rewards/reward_func/mean": 0.16249999403953552, "rewards/reward_func/std": 0.5634639859199524, "sampling/importance_sampling_ratio/max": 2.986578941345215, "sampling/importance_sampling_ratio/mean": 1.6280866861343384, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.6852164268493652, "sampling/sampling_logp_difference/mean": 0.029879365116357803, "step": 947, "step_time": 38.020860222983174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.16758981347084045, "epoch": 0.948, "frac_reward_zero_std": 0.0, "grad_norm": 1.421286940574646, "kl": 0.014573115855455399, "learning_rate": 3.67410352587741e-08, "loss": 0.033, "num_tokens": 2658868.0, "reward": 0.7350000143051147, "reward_std": 0.5233545899391174, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.5233545899391174, "sampling/importance_sampling_ratio/max": 0.958949863910675, "sampling/importance_sampling_ratio/mean": 0.8320696353912354, "sampling/importance_sampling_ratio/min": 0.675715982913971, "sampling/sampling_logp_difference/max": 0.6103723049163818, "sampling/sampling_logp_difference/mean": 0.015552335418760777, "step": 948, "step_time": 24.519882424967363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.18223461508750916, "epoch": 0.949, "frac_reward_zero_std": 0.0, "grad_norm": 1.7406944036483765, "kl": 0.011923124082386494, "learning_rate": 3.537090873441701e-08, "loss": 0.4079, "num_tokens": 2661641.0, "reward": 0.7300000190734863, "reward_std": 0.5333541631698608, "rewards/reward_func/mean": 0.7300000190734863, "rewards/reward_func/std": 0.5333541631698608, "sampling/importance_sampling_ratio/max": 1.519023060798645, "sampling/importance_sampling_ratio/mean": 0.9634592533111572, "sampling/importance_sampling_ratio/min": 0.4951988756656647, "sampling/sampling_logp_difference/max": 0.3886615037918091, "sampling/sampling_logp_difference/mean": 0.019645163789391518, "step": 949, "step_time": 17.262400261999574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 49.75, "completions/mean_terminated_length": 49.75, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.16543002426624298, "epoch": 0.95, "frac_reward_zero_std": 0.0, "grad_norm": 0.6319556832313538, "kl": 0.018827809020876884, "learning_rate": 3.4026635002984974e-08, "loss": 0.0608, "num_tokens": 2664848.0, "reward": 0.9925000071525574, "reward_std": 0.00957426242530346, "rewards/reward_func/mean": 0.9925000071525574, "rewards/reward_func/std": 0.00957426242530346, "sampling/importance_sampling_ratio/max": 0.7187905311584473, "sampling/importance_sampling_ratio/mean": 0.47611093521118164, "sampling/importance_sampling_ratio/min": 0.1334999054670334, "sampling/sampling_logp_difference/max": 1.6143503189086914, "sampling/sampling_logp_difference/mean": 0.03185888007283211, "step": 950, "step_time": 27.811286953976378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.18115456402301788, "epoch": 0.951, "frac_reward_zero_std": 0.0, "grad_norm": 1.1766563653945923, "kl": 0.009134453721344471, "learning_rate": 3.270822816527325e-08, "loss": 0.1391, "num_tokens": 2667455.0, "reward": 0.7400000095367432, "reward_std": 0.5199999809265137, "rewards/reward_func/mean": 0.7400000095367432, "rewards/reward_func/std": 0.5200000405311584, "sampling/importance_sampling_ratio/max": 1.188248634338379, "sampling/importance_sampling_ratio/mean": 0.9454361796379089, "sampling/importance_sampling_ratio/min": 0.7433153390884399, "sampling/sampling_logp_difference/max": 0.3555464744567871, "sampling/sampling_logp_difference/mean": 0.021259984001517296, "step": 951, "step_time": 16.764946300012525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.1782076358795166, "epoch": 0.952, "frac_reward_zero_std": 0.0, "grad_norm": 0.8439856767654419, "kl": 0.011744711548089981, "learning_rate": 3.141570205074607e-08, "loss": 0.1523, "num_tokens": 2670003.0, "reward": 0.48750001192092896, "reward_std": 0.5919107794761658, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5919107794761658, "sampling/importance_sampling_ratio/max": 0.8308660984039307, "sampling/importance_sampling_ratio/mean": 0.5725545287132263, "sampling/importance_sampling_ratio/min": 0.3764490485191345, "sampling/sampling_logp_difference/max": 0.6665728092193604, "sampling/sampling_logp_difference/mean": 0.024411041289567947, "step": 952, "step_time": 18.07115240301937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 52.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.15385833382606506, "epoch": 0.953, "frac_reward_zero_std": 0.0, "grad_norm": 0.6784666776657104, "kl": 0.010828756727278233, "learning_rate": 3.014907021739011e-08, "loss": -0.0564, "num_tokens": 2672729.0, "reward": 0.4975000023841858, "reward_std": 0.574478030204773, "rewards/reward_func/mean": 0.4975000023841858, "rewards/reward_func/std": 0.574478030204773, "sampling/importance_sampling_ratio/max": 1.2597355842590332, "sampling/importance_sampling_ratio/mean": 0.8071574568748474, "sampling/importance_sampling_ratio/min": 0.5271415710449219, "sampling/sampling_logp_difference/max": 0.3629075288772583, "sampling/sampling_logp_difference/mean": 0.016061943024396896, "step": 953, "step_time": 17.49672523100162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 53.0, "completions/max_terminated_length": 53.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.20606380701065063, "epoch": 0.954, "frac_reward_zero_std": 0.0, "grad_norm": 0.915627658367157, "kl": 0.01614759862422943, "learning_rate": 2.8908345951574045e-08, "loss": -0.0168, "num_tokens": 2675309.0, "reward": 0.19500000774860382, "reward_std": 0.540401041507721, "rewards/reward_func/mean": 0.19500000774860382, "rewards/reward_func/std": 0.5404011011123657, "sampling/importance_sampling_ratio/max": 1.2868716716766357, "sampling/importance_sampling_ratio/mean": 0.926769495010376, "sampling/importance_sampling_ratio/min": 0.3840569853782654, "sampling/sampling_logp_difference/max": 0.3604767322540283, "sampling/sampling_logp_difference/mean": 0.018085254356265068, "step": 954, "step_time": 35.028648834035266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 54.0, "completions/mean_terminated_length": 54.0, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.16685505211353302, "epoch": 0.955, "frac_reward_zero_std": 0.0, "grad_norm": 1.233354926109314, "kl": 0.02578306570649147, "learning_rate": 2.7693542267908934e-08, "loss": -0.2138, "num_tokens": 2678837.0, "reward": 0.4749999940395355, "reward_std": 0.6070969104766846, "rewards/reward_func/mean": 0.4749999940395355, "rewards/reward_func/std": 0.6070969104766846, "sampling/importance_sampling_ratio/max": 1.4243724346160889, "sampling/importance_sampling_ratio/mean": 0.9638741612434387, "sampling/importance_sampling_ratio/min": 0.35751873254776, "sampling/sampling_logp_difference/max": 0.5959920883178711, "sampling/sampling_logp_difference/mean": 0.020861629396677017, "step": 955, "step_time": 25.975105747987982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 56.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.18495547771453857, "epoch": 0.956, "frac_reward_zero_std": 0.0, "grad_norm": 0.8719369769096375, "kl": 0.016174882650375366, "learning_rate": 2.6504671909109993e-08, "loss": -0.0961, "num_tokens": 2681253.0, "reward": 0.7275000214576721, "reward_std": 0.5383539795875549, "rewards/reward_func/mean": 0.7275000214576721, "rewards/reward_func/std": 0.5383539795875549, "sampling/importance_sampling_ratio/max": 1.4162179231643677, "sampling/importance_sampling_ratio/mean": 0.9496273994445801, "sampling/importance_sampling_ratio/min": 0.6745224595069885, "sampling/sampling_logp_difference/max": 0.41546475887298584, "sampling/sampling_logp_difference/mean": 0.017267737537622452, "step": 956, "step_time": 11.54723428201396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.20128807425498962, "epoch": 0.957, "frac_reward_zero_std": 0.0, "grad_norm": 1.3634074926376343, "kl": 0.021706249564886093, "learning_rate": 2.534174734586503e-08, "loss": -0.0563, "num_tokens": 2684031.0, "reward": 0.7350000143051147, "reward_std": 0.4908156394958496, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.490815669298172, "sampling/importance_sampling_ratio/max": 1.7871745824813843, "sampling/importance_sampling_ratio/mean": 1.2894593477249146, "sampling/importance_sampling_ratio/min": 0.9538618326187134, "sampling/sampling_logp_difference/max": 0.6120476722717285, "sampling/sampling_logp_difference/mean": 0.02231754921376705, "step": 957, "step_time": 27.337606568005867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.16835343837738037, "epoch": 0.958, "frac_reward_zero_std": 0.0, "grad_norm": 0.7133905291557312, "kl": 0.01163838617503643, "learning_rate": 2.4204780776702075e-08, "loss": -0.0173, "num_tokens": 2687014.0, "reward": 0.7074999809265137, "reward_std": 0.5717443823814392, "rewards/reward_func/mean": 0.7074999809265137, "rewards/reward_func/std": 0.5717443823814392, "sampling/importance_sampling_ratio/max": 1.1992768049240112, "sampling/importance_sampling_ratio/mean": 0.8198827505111694, "sampling/importance_sampling_ratio/min": 0.40020203590393066, "sampling/sampling_logp_difference/max": 0.7499260902404785, "sampling/sampling_logp_difference/mean": 0.02307146042585373, "step": 958, "step_time": 20.979067091015168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "entropy": 0.1515655368566513, "epoch": 0.959, "frac_reward_zero_std": 0.0, "grad_norm": 0.81925368309021, "kl": 0.01774483360350132, "learning_rate": 2.309378412786306e-08, "loss": -0.1286, "num_tokens": 2689736.0, "reward": 0.7224999666213989, "reward_std": 0.5550000071525574, "rewards/reward_func/mean": 0.7224999666213989, "rewards/reward_func/std": 0.5550000071525574, "sampling/importance_sampling_ratio/max": 1.5591963529586792, "sampling/importance_sampling_ratio/mean": 0.7992367744445801, "sampling/importance_sampling_ratio/min": 0.49540048837661743, "sampling/sampling_logp_difference/max": 0.3514981269836426, "sampling/sampling_logp_difference/mean": 0.020610205829143524, "step": 959, "step_time": 16.07370703999186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 55.75, "completions/mean_terminated_length": 55.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.1649346649646759, "epoch": 0.96, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625032186508179, "kl": 0.01289560366421938, "learning_rate": 2.200876905317645e-08, "loss": 0.3612, "num_tokens": 2692071.0, "reward": 0.9825000166893005, "reward_std": 0.014999985694885254, "rewards/reward_func/mean": 0.9825000166893005, "rewards/reward_func/std": 0.014999986626207829, "sampling/importance_sampling_ratio/max": 2.121095895767212, "sampling/importance_sampling_ratio/mean": 1.4198492765426636, "sampling/importance_sampling_ratio/min": 0.7905817627906799, "sampling/sampling_logp_difference/max": 0.9414011240005493, "sampling/sampling_logp_difference/mean": 0.020613722503185272, "step": 960, "step_time": 14.391221414960455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 57.0, "completions/mean_terminated_length": 57.0, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.22560444474220276, "epoch": 0.961, "frac_reward_zero_std": 0.0, "grad_norm": 1.5741814374923706, "kl": 0.015432960353791714, "learning_rate": 2.094974693393731e-08, "loss": -0.0193, "num_tokens": 2694478.0, "reward": 0.4325000047683716, "reward_std": 0.6157583594322205, "rewards/reward_func/mean": 0.4325000047683716, "rewards/reward_func/std": 0.6157583594322205, "sampling/importance_sampling_ratio/max": 1.6910324096679688, "sampling/importance_sampling_ratio/mean": 1.2771632671356201, "sampling/importance_sampling_ratio/min": 0.868405282497406, "sampling/sampling_logp_difference/max": 0.3514370918273926, "sampling/sampling_logp_difference/mean": 0.023615699261426926, "step": 961, "step_time": 33.248339073034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "entropy": 0.1946193277835846, "epoch": 0.962, "frac_reward_zero_std": 0.0, "grad_norm": 0.7108011841773987, "kl": 0.011525683104991913, "learning_rate": 1.9916728878786318e-08, "loss": -0.01, "num_tokens": 2697040.0, "reward": 0.4425000250339508, "reward_std": 0.5984633564949036, "rewards/reward_func/mean": 0.4425000250339508, "rewards/reward_func/std": 0.5984632968902588, "sampling/importance_sampling_ratio/max": 1.109789490699768, "sampling/importance_sampling_ratio/mean": 0.7553939819335938, "sampling/importance_sampling_ratio/min": 0.3142869770526886, "sampling/sampling_logp_difference/max": 1.0569963455200195, "sampling/sampling_logp_difference/mean": 0.0202386062592268, "step": 962, "step_time": 21.343699855031446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.1632411628961563, "epoch": 0.963, "frac_reward_zero_std": 0.0, "grad_norm": 0.8830838799476624, "kl": 0.009288910776376724, "learning_rate": 1.890972572359456e-08, "loss": -0.0005, "num_tokens": 2700133.0, "reward": 0.22500000894069672, "reward_std": 0.47682979702949524, "rewards/reward_func/mean": 0.22500000894069672, "rewards/reward_func/std": 0.4768298268318176, "sampling/importance_sampling_ratio/max": 1.4328840970993042, "sampling/importance_sampling_ratio/mean": 1.0381311178207397, "sampling/importance_sampling_ratio/min": 0.7431904077529907, "sampling/sampling_logp_difference/max": 0.3166060447692871, "sampling/sampling_logp_difference/mean": 0.015194358304142952, "step": 963, "step_time": 36.98526712099556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 47.75, "completions/mean_terminated_length": 47.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.174856498837471, "epoch": 0.964, "frac_reward_zero_std": 0.0, "grad_norm": 1.1935949325561523, "kl": 0.013059455901384354, "learning_rate": 1.792874803134781e-08, "loss": -0.2551, "num_tokens": 2702837.0, "reward": 0.4599999785423279, "reward_std": 0.6125357151031494, "rewards/reward_func/mean": 0.4599999785423279, "rewards/reward_func/std": 0.6125357151031494, "sampling/importance_sampling_ratio/max": 1.3818479776382446, "sampling/importance_sampling_ratio/mean": 1.129005789756775, "sampling/importance_sampling_ratio/min": 0.805878221988678, "sampling/sampling_logp_difference/max": 0.4818120002746582, "sampling/sampling_logp_difference/mean": 0.023406682536005974, "step": 964, "step_time": 34.57179752999218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.1712668538093567, "epoch": 0.965, "frac_reward_zero_std": 0.0, "grad_norm": 0.9191261529922485, "kl": 0.005768411327153444, "learning_rate": 1.6973806092038525e-08, "loss": -0.2117, "num_tokens": 2705467.0, "reward": 0.47749999165534973, "reward_std": 0.5980733036994934, "rewards/reward_func/mean": 0.47749999165534973, "rewards/reward_func/std": 0.5980733036994934, "sampling/importance_sampling_ratio/max": 1.2511200904846191, "sampling/importance_sampling_ratio/mean": 0.9451463222503662, "sampling/importance_sampling_ratio/min": 0.634990930557251, "sampling/sampling_logp_difference/max": 0.2973259687423706, "sampling/sampling_logp_difference/mean": 0.01768651232123375, "step": 965, "step_time": 21.234961152018514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 60.0, "completions/max_terminated_length": 60.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.16573940217494965, "epoch": 0.966, "frac_reward_zero_std": 0.0, "grad_norm": 1.5905110836029053, "kl": 0.023966524749994278, "learning_rate": 1.6044909922555973e-08, "loss": 0.089, "num_tokens": 2708468.0, "reward": 0.9975000023841858, "reward_std": 0.004999995231628418, "rewards/reward_func/mean": 0.9975000023841858, "rewards/reward_func/std": 0.004999995231628418, "sampling/importance_sampling_ratio/max": 0.755001962184906, "sampling/importance_sampling_ratio/mean": 0.583756685256958, "sampling/importance_sampling_ratio/min": 0.4375454783439636, "sampling/sampling_logp_difference/max": 0.5328750610351562, "sampling/sampling_logp_difference/mean": 0.01996489055454731, "step": 966, "step_time": 13.626948248012923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 34.0, "completions/min_terminated_length": 34.0, "entropy": 0.2142745405435562, "epoch": 0.967, "frac_reward_zero_std": 0.0, "grad_norm": 1.6645313501358032, "kl": 0.01665772870182991, "learning_rate": 1.5142069266580462e-08, "loss": -0.0956, "num_tokens": 2710902.0, "reward": 0.4699999988079071, "reward_std": 0.6078925132751465, "rewards/reward_func/mean": 0.4699999988079071, "rewards/reward_func/std": 0.6078925132751465, "sampling/importance_sampling_ratio/max": 1.9509241580963135, "sampling/importance_sampling_ratio/mean": 1.268627405166626, "sampling/importance_sampling_ratio/min": 0.7948831915855408, "sampling/sampling_logp_difference/max": 0.5140471458435059, "sampling/sampling_logp_difference/mean": 0.022272692993283272, "step": 967, "step_time": 20.418749056989327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.2059357911348343, "epoch": 0.968, "frac_reward_zero_std": 0.0, "grad_norm": 0.9379705190658569, "kl": 0.011039367876946926, "learning_rate": 1.4265293594484254e-08, "loss": 0.1339, "num_tokens": 2714014.0, "reward": 0.7350000143051147, "reward_std": 0.5034878253936768, "rewards/reward_func/mean": 0.7350000143051147, "rewards/reward_func/std": 0.5034878253936768, "sampling/importance_sampling_ratio/max": 0.9928443431854248, "sampling/importance_sampling_ratio/mean": 0.7518709301948547, "sampling/importance_sampling_ratio/min": 0.42684605717658997, "sampling/sampling_logp_difference/max": 0.659188985824585, "sampling/sampling_logp_difference/mean": 0.020387601107358932, "step": 968, "step_time": 33.919046851980966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 51.0, "completions/max_terminated_length": 51.0, "completions/mean_length": 42.5, "completions/mean_terminated_length": 42.5, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.1369735449552536, "epoch": 0.969, "frac_reward_zero_std": 0.0, "grad_norm": 1.2521474361419678, "kl": 0.015447555109858513, "learning_rate": 1.3414592103228597e-08, "loss": 0.0099, "num_tokens": 2717095.0, "reward": 0.48000001907348633, "reward_std": 0.5604165196418762, "rewards/reward_func/mean": 0.48000001907348633, "rewards/reward_func/std": 0.5604165196418762, "sampling/importance_sampling_ratio/max": 1.807211995124817, "sampling/importance_sampling_ratio/mean": 1.1800693273544312, "sampling/importance_sampling_ratio/min": 0.6780693531036377, "sampling/sampling_logp_difference/max": 0.43634700775146484, "sampling/sampling_logp_difference/mean": 0.013604771345853806, "step": 969, "step_time": 25.62107712496072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.0, "completions/mean_terminated_length": 51.0, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "entropy": 0.1752425581216812, "epoch": 0.97, "frac_reward_zero_std": 0.0, "grad_norm": 0.9627369046211243, "kl": 0.010626585222780704, "learning_rate": 1.2589973716270188e-08, "loss": 0.365, "num_tokens": 2720180.0, "reward": 0.4375000298023224, "reward_std": 0.6048898696899414, "rewards/reward_func/mean": 0.4375000298023224, "rewards/reward_func/std": 0.6048898100852966, "sampling/importance_sampling_ratio/max": 1.2391868829727173, "sampling/importance_sampling_ratio/mean": 0.7270630598068237, "sampling/importance_sampling_ratio/min": 0.21379150450229645, "sampling/sampling_logp_difference/max": 0.8087836503982544, "sampling/sampling_logp_difference/mean": 0.02495509572327137, "step": 970, "step_time": 41.38037249504123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.75, "completions/mean_terminated_length": 50.75, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.172551229596138, "epoch": 0.971, "frac_reward_zero_std": 0.0, "grad_norm": 1.0640658140182495, "kl": 0.0062208459712564945, "learning_rate": 1.1791447083465136e-08, "loss": 0.1039, "num_tokens": 2722789.0, "reward": 0.48250001668930054, "reward_std": 0.5979060530662537, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5979060530662537, "sampling/importance_sampling_ratio/max": 1.1176881790161133, "sampling/importance_sampling_ratio/mean": 0.7222541570663452, "sampling/importance_sampling_ratio/min": 0.36287549138069153, "sampling/sampling_logp_difference/max": 0.37291502952575684, "sampling/sampling_logp_difference/mean": 0.01938227377831936, "step": 971, "step_time": 19.27729390200693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.17939221858978271, "epoch": 0.972, "frac_reward_zero_std": 0.0, "grad_norm": 1.3238906860351562, "kl": 0.013505707494914532, "learning_rate": 1.1019020580980144e-08, "loss": 0.3457, "num_tokens": 2725203.0, "reward": 0.7200000286102295, "reward_std": 0.5400617122650146, "rewards/reward_func/mean": 0.7200000286102295, "rewards/reward_func/std": 0.5400617718696594, "sampling/importance_sampling_ratio/max": 1.7775375843048096, "sampling/importance_sampling_ratio/mean": 1.2363383769989014, "sampling/importance_sampling_ratio/min": 0.8628479242324829, "sampling/sampling_logp_difference/max": 0.34324216842651367, "sampling/sampling_logp_difference/mean": 0.013551241718232632, "step": 972, "step_time": 16.835451891995035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 48.5, "completions/mean_terminated_length": 48.5, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.19814029335975647, "epoch": 0.973, "frac_reward_zero_std": 0.0, "grad_norm": 2.085205078125, "kl": 0.010397888720035553, "learning_rate": 1.0272702311203698e-08, "loss": -0.2054, "num_tokens": 2727710.0, "reward": 0.18250000476837158, "reward_std": 0.5479887127876282, "rewards/reward_func/mean": 0.18250000476837158, "rewards/reward_func/std": 0.5479887127876282, "sampling/importance_sampling_ratio/max": 1.3691061735153198, "sampling/importance_sampling_ratio/mean": 1.0189892053604126, "sampling/importance_sampling_ratio/min": 0.7130147814750671, "sampling/sampling_logp_difference/max": 0.39141201972961426, "sampling/sampling_logp_difference/mean": 0.016478192061185837, "step": 973, "step_time": 27.768283343990333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.1971643567085266, "epoch": 0.974, "frac_reward_zero_std": 0.0, "grad_norm": 0.8487705588340759, "kl": 0.018847914412617683, "learning_rate": 9.5525001026614e-09, "loss": 0.0785, "num_tokens": 2731412.0, "reward": 0.4699999988079071, "reward_std": 0.6069046854972839, "rewards/reward_func/mean": 0.4699999988079071, "rewards/reward_func/std": 0.6069047451019287, "sampling/importance_sampling_ratio/max": 1.1688258647918701, "sampling/importance_sampling_ratio/mean": 0.7233484983444214, "sampling/importance_sampling_ratio/min": 0.5074413418769836, "sampling/sampling_logp_difference/max": 0.6254899501800537, "sampling/sampling_logp_difference/mean": 0.02716418355703354, "step": 974, "step_time": 36.64094884699443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.17901204526424408, "epoch": 0.975, "frac_reward_zero_std": 0.0, "grad_norm": 0.6190588474273682, "kl": 0.012253908440470695, "learning_rate": 8.858421509933823e-09, "loss": -0.0437, "num_tokens": 2734000.0, "reward": 0.1875, "reward_std": 0.5373003482818604, "rewards/reward_func/mean": 0.1875, "rewards/reward_func/std": 0.5373003482818604, "sampling/importance_sampling_ratio/max": 0.7467259168624878, "sampling/importance_sampling_ratio/mean": 0.652080774307251, "sampling/importance_sampling_ratio/min": 0.45938265323638916, "sampling/sampling_logp_difference/max": 0.5558450222015381, "sampling/sampling_logp_difference/mean": 0.02097991853952408, "step": 975, "step_time": 28.733662876009475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 46.0, "completions/mean_terminated_length": 46.0, "completions/min_length": 38.0, "completions/min_terminated_length": 38.0, "entropy": 0.1657002568244934, "epoch": 0.976, "frac_reward_zero_std": 0.0, "grad_norm": 1.6606913805007935, "kl": 0.016540061682462692, "learning_rate": 8.190473813576571e-09, "loss": -0.005, "num_tokens": 2737075.0, "reward": 0.7425000071525574, "reward_std": 0.5083552002906799, "rewards/reward_func/mean": 0.7425000071525574, "rewards/reward_func/std": 0.5083552002906799, "sampling/importance_sampling_ratio/max": 1.356823205947876, "sampling/importance_sampling_ratio/mean": 0.8902665376663208, "sampling/importance_sampling_ratio/min": 0.559358537197113, "sampling/sampling_logp_difference/max": 0.2555462718009949, "sampling/sampling_logp_difference/mean": 0.01978575624525547, "step": 976, "step_time": 33.19621087203268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1906890869140625, "epoch": 0.977, "frac_reward_zero_std": 0.0, "grad_norm": 1.6311613321304321, "kl": 0.015463457442820072, "learning_rate": 7.54866402004506e-09, "loss": 0.3538, "num_tokens": 2740248.0, "reward": 0.4749999940395355, "reward_std": 0.6008049845695496, "rewards/reward_func/mean": 0.4749999940395355, "rewards/reward_func/std": 0.6008049845695496, "sampling/importance_sampling_ratio/max": 1.7570791244506836, "sampling/importance_sampling_ratio/mean": 1.258058786392212, "sampling/importance_sampling_ratio/min": 0.7476190328598022, "sampling/sampling_logp_difference/max": 0.3486182689666748, "sampling/sampling_logp_difference/mean": 0.021484343335032463, "step": 977, "step_time": 23.85203403001651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 67.0, "completions/max_terminated_length": 67.0, "completions/mean_length": 60.0, "completions/mean_terminated_length": 60.0, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "entropy": 0.2029145509004593, "epoch": 0.978, "frac_reward_zero_std": 0.0, "grad_norm": 0.583314061164856, "kl": 0.005313239526003599, "learning_rate": 6.93299886162041e-09, "loss": -0.0558, "num_tokens": 2742803.0, "reward": 0.7275000214576721, "reward_std": 0.48685216903686523, "rewards/reward_func/mean": 0.7275000214576721, "rewards/reward_func/std": 0.48685213923454285, "sampling/importance_sampling_ratio/max": 0.8028652667999268, "sampling/importance_sampling_ratio/mean": 0.6591019034385681, "sampling/importance_sampling_ratio/min": 0.5473982095718384, "sampling/sampling_logp_difference/max": 0.5127360820770264, "sampling/sampling_logp_difference/mean": 0.019516227766871452, "step": 978, "step_time": 19.813421515980735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 75.0, "completions/max_terminated_length": 75.0, "completions/mean_length": 62.0, "completions/mean_terminated_length": 62.0, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "entropy": 0.2005559355020523, "epoch": 0.979, "frac_reward_zero_std": 0.0, "grad_norm": 0.9881650805473328, "kl": 0.006371054798364639, "learning_rate": 6.343484796338395e-09, "loss": -0.1284, "num_tokens": 2745617.0, "reward": 0.4599999785423279, "reward_std": 0.6015534996986389, "rewards/reward_func/mean": 0.4599999785423279, "rewards/reward_func/std": 0.6015535593032837, "sampling/importance_sampling_ratio/max": 1.4821754693984985, "sampling/importance_sampling_ratio/mean": 1.102525234222412, "sampling/importance_sampling_ratio/min": 0.40242305397987366, "sampling/sampling_logp_difference/max": 0.3934180736541748, "sampling/sampling_logp_difference/mean": 0.01886214129626751, "step": 979, "step_time": 23.97443983500125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "entropy": 0.1786588430404663, "epoch": 0.98, "frac_reward_zero_std": 0.0, "grad_norm": 1.5577703714370728, "kl": 0.013459675014019012, "learning_rate": 5.78012800792338e-09, "loss": 0.2337, "num_tokens": 2748167.0, "reward": 0.7024999856948853, "reward_std": 0.5422406792640686, "rewards/reward_func/mean": 0.7024999856948853, "rewards/reward_func/std": 0.5422407388687134, "sampling/importance_sampling_ratio/max": 1.1404391527175903, "sampling/importance_sampling_ratio/mean": 0.7370926141738892, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5177898406982422, "sampling/sampling_logp_difference/mean": 0.020062312483787537, "step": 980, "step_time": 30.58998090098612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 65.0, "completions/max_terminated_length": 65.0, "completions/mean_length": 55.5, "completions/mean_terminated_length": 55.5, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.15070614218711853, "epoch": 0.981, "frac_reward_zero_std": 0.0, "grad_norm": 1.9086700677871704, "kl": 0.01905169151723385, "learning_rate": 5.242934405720879e-09, "loss": -0.1475, "num_tokens": 2751469.0, "reward": 0.21000000834465027, "reward_std": 0.5270673632621765, "rewards/reward_func/mean": 0.21000000834465027, "rewards/reward_func/std": 0.5270673632621765, "sampling/importance_sampling_ratio/max": 1.9256672859191895, "sampling/importance_sampling_ratio/mean": 1.4372533559799194, "sampling/importance_sampling_ratio/min": 1.0259876251220703, "sampling/sampling_logp_difference/max": 0.3349342346191406, "sampling/sampling_logp_difference/mean": 0.018662840127944946, "step": 981, "step_time": 32.306661277019884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 53.5, "completions/mean_terminated_length": 53.5, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "entropy": 0.1915203332901001, "epoch": 0.982, "frac_reward_zero_std": 0.0, "grad_norm": 1.3772201538085938, "kl": 0.013318156823515892, "learning_rate": 4.7319096246378756e-09, "loss": -0.526, "num_tokens": 2753881.0, "reward": 0.7325000166893005, "reward_std": 0.5086829662322998, "rewards/reward_func/mean": 0.7325000166893005, "rewards/reward_func/std": 0.508682906627655, "sampling/importance_sampling_ratio/max": 2.626023054122925, "sampling/importance_sampling_ratio/mean": 1.3480172157287598, "sampling/importance_sampling_ratio/min": 0.3788986802101135, "sampling/sampling_logp_difference/max": 0.34895753860473633, "sampling/sampling_logp_difference/mean": 0.021359050646424294, "step": 982, "step_time": 15.102629915985744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 51.5, "completions/mean_terminated_length": 51.5, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "entropy": 0.19044700264930725, "epoch": 0.983, "frac_reward_zero_std": 0.0, "grad_norm": 1.0532186031341553, "kl": 0.01948482356965542, "learning_rate": 4.247059025082323e-09, "loss": -0.2375, "num_tokens": 2756768.0, "reward": 0.7475000023841858, "reward_std": 0.5049999952316284, "rewards/reward_func/mean": 0.7475000023841858, "rewards/reward_func/std": 0.5049999952316284, "sampling/importance_sampling_ratio/max": 1.2351608276367188, "sampling/importance_sampling_ratio/mean": 0.8008794188499451, "sampling/importance_sampling_ratio/min": 0.2944852113723755, "sampling/sampling_logp_difference/max": 1.236772060394287, "sampling/sampling_logp_difference/mean": 0.029120780527591705, "step": 983, "step_time": 27.1358503549709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 51.75, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.14379175007343292, "epoch": 0.984, "frac_reward_zero_std": 0.0, "grad_norm": 0.747440755367279, "kl": 0.010624740272760391, "learning_rate": 3.7883876929076245e-09, "loss": -0.0349, "num_tokens": 2759153.0, "reward": 0.7224999666213989, "reward_std": 0.5550000071525574, "rewards/reward_func/mean": 0.7224999666213989, "rewards/reward_func/std": 0.5550000071525574, "sampling/importance_sampling_ratio/max": 1.3225858211517334, "sampling/importance_sampling_ratio/mean": 0.8883492350578308, "sampling/importance_sampling_ratio/min": 0.44181227684020996, "sampling/sampling_logp_difference/max": 0.4240313768386841, "sampling/sampling_logp_difference/mean": 0.020186282694339752, "step": 984, "step_time": 13.409768217010424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 54.75, "completions/mean_terminated_length": 54.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.1746855080127716, "epoch": 0.985, "frac_reward_zero_std": 0.0, "grad_norm": 0.6429731845855713, "kl": 0.011662250384688377, "learning_rate": 3.355900439359072e-09, "loss": 0.0776, "num_tokens": 2762161.0, "reward": 0.20000001788139343, "reward_std": 0.5147167444229126, "rewards/reward_func/mean": 0.20000001788139343, "rewards/reward_func/std": 0.5147167444229126, "sampling/importance_sampling_ratio/max": 1.2169852256774902, "sampling/importance_sampling_ratio/mean": 0.7081165313720703, "sampling/importance_sampling_ratio/min": 0.45606473088264465, "sampling/sampling_logp_difference/max": 0.2990856468677521, "sampling/sampling_logp_difference/mean": 0.01794186793267727, "step": 985, "step_time": 33.394075602001976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 70.0, "completions/max_terminated_length": 70.0, "completions/mean_length": 60.75, "completions/mean_terminated_length": 60.75, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.1752932220697403, "epoch": 0.986, "frac_reward_zero_std": 0.0, "grad_norm": 1.3124949932098389, "kl": 0.01068700011819601, "learning_rate": 2.9496018010233275e-09, "loss": 0.0588, "num_tokens": 2764653.0, "reward": 0.4449999928474426, "reward_std": 0.6293647885322571, "rewards/reward_func/mean": 0.4449999928474426, "rewards/reward_func/std": 0.6293647885322571, "sampling/importance_sampling_ratio/max": 1.3014421463012695, "sampling/importance_sampling_ratio/mean": 0.8754969239234924, "sampling/importance_sampling_ratio/min": 0.5386622548103333, "sampling/sampling_logp_difference/max": 0.4300351142883301, "sampling/sampling_logp_difference/mean": 0.015150303021073341, "step": 986, "step_time": 23.306235301017296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 57.25, "completions/mean_terminated_length": 57.25, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.1935296505689621, "epoch": 0.987, "frac_reward_zero_std": 0.0, "grad_norm": 1.183745265007019, "kl": 0.02951136603951454, "learning_rate": 2.5694960397806834e-09, "loss": 0.184, "num_tokens": 2767762.0, "reward": 0.48250001668930054, "reward_std": 0.5975714921951294, "rewards/reward_func/mean": 0.48250001668930054, "rewards/reward_func/std": 0.5975714921951294, "sampling/importance_sampling_ratio/max": 1.2439830303192139, "sampling/importance_sampling_ratio/mean": 0.7772431373596191, "sampling/importance_sampling_ratio/min": 0.4808202087879181, "sampling/sampling_logp_difference/max": 0.8877735137939453, "sampling/sampling_logp_difference/mean": 0.02888231910765171, "step": 987, "step_time": 36.04153415100882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 55.0, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.18376417458057404, "epoch": 0.988, "frac_reward_zero_std": 0.0, "grad_norm": 1.6739362478256226, "kl": 0.008707821369171143, "learning_rate": 2.215587142760933e-09, "loss": 0.3471, "num_tokens": 2770433.0, "reward": 0.7124999761581421, "reward_std": 0.555060088634491, "rewards/reward_func/mean": 0.7124999761581421, "rewards/reward_func/std": 0.555060088634491, "sampling/importance_sampling_ratio/max": 1.6639033555984497, "sampling/importance_sampling_ratio/mean": 1.0539108514785767, "sampling/importance_sampling_ratio/min": 0.7027592062950134, "sampling/sampling_logp_difference/max": 0.41793978214263916, "sampling/sampling_logp_difference/mean": 0.018448807299137115, "step": 988, "step_time": 24.872454523982015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 55.0, "completions/max_terminated_length": 55.0, "completions/mean_length": 52.0, "completions/mean_terminated_length": 52.0, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "entropy": 0.16002431511878967, "epoch": 0.989, "frac_reward_zero_std": 0.0, "grad_norm": 0.8533026576042175, "kl": 0.008126652799546719, "learning_rate": 1.887878822300904e-09, "loss": 0.0246, "num_tokens": 2773240.0, "reward": 0.4725000262260437, "reward_std": 0.5811124444007874, "rewards/reward_func/mean": 0.4725000262260437, "rewards/reward_func/std": 0.5811124444007874, "sampling/importance_sampling_ratio/max": 1.2286267280578613, "sampling/importance_sampling_ratio/mean": 0.8720037341117859, "sampling/importance_sampling_ratio/min": 0.526388943195343, "sampling/sampling_logp_difference/max": 0.330339252948761, "sampling/sampling_logp_difference/mean": 0.015404274687170982, "step": 989, "step_time": 22.32189856597688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 54.25, "completions/mean_terminated_length": 54.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.18743836879730225, "epoch": 0.99, "frac_reward_zero_std": 0.0, "grad_norm": 0.6455768942832947, "kl": 0.0136005450040102, "learning_rate": 1.5863745159055977e-09, "loss": 0.1057, "num_tokens": 2775984.0, "reward": 0.47999998927116394, "reward_std": 0.6004442572593689, "rewards/reward_func/mean": 0.47999998927116394, "rewards/reward_func/std": 0.6004442572593689, "sampling/importance_sampling_ratio/max": 1.00224769115448, "sampling/importance_sampling_ratio/mean": 0.6041607856750488, "sampling/importance_sampling_ratio/min": 0.28689044713974, "sampling/sampling_logp_difference/max": 0.4001193046569824, "sampling/sampling_logp_difference/mean": 0.019597399979829788, "step": 990, "step_time": 22.190387665003072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 64.0, "completions/max_terminated_length": 64.0, "completions/mean_length": 57.75, "completions/mean_terminated_length": 57.75, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.1650892198085785, "epoch": 0.991, "frac_reward_zero_std": 0.0, "grad_norm": 0.997748076915741, "kl": 0.008677901700139046, "learning_rate": 1.3110773862126669e-09, "loss": -0.1222, "num_tokens": 2778261.0, "reward": 0.4399999976158142, "reward_std": 0.6468384861946106, "rewards/reward_func/mean": 0.4399999976158142, "rewards/reward_func/std": 0.6468384861946106, "sampling/importance_sampling_ratio/max": 1.2720673084259033, "sampling/importance_sampling_ratio/mean": 1.0427968502044678, "sampling/importance_sampling_ratio/min": 0.7019581198692322, "sampling/sampling_logp_difference/max": 0.27742695808410645, "sampling/sampling_logp_difference/mean": 0.014092200435698032, "step": 991, "step_time": 26.89563663600711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 52.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.19236090779304504, "epoch": 0.992, "frac_reward_zero_std": 0.0, "grad_norm": 0.9592631459236145, "kl": 0.006179228890687227, "learning_rate": 1.0619903209588278e-09, "loss": -0.0944, "num_tokens": 2780563.0, "reward": 0.4675000011920929, "reward_std": 0.6099931597709656, "rewards/reward_func/mean": 0.4675000011920929, "rewards/reward_func/std": 0.6099932193756104, "sampling/importance_sampling_ratio/max": 0.9822006821632385, "sampling/importance_sampling_ratio/mean": 0.8167760372161865, "sampling/importance_sampling_ratio/min": 0.7038593888282776, "sampling/sampling_logp_difference/max": 0.3056199550628662, "sampling/sampling_logp_difference/mean": 0.014848885126411915, "step": 992, "step_time": 22.396465819969308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 58.25, "completions/mean_terminated_length": 58.25, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.18650946021080017, "epoch": 0.993, "frac_reward_zero_std": 0.0, "grad_norm": 0.9059720635414124, "kl": 0.009277170524001122, "learning_rate": 8.391159329496079e-10, "loss": -0.2958, "num_tokens": 2783178.0, "reward": 0.7124999761581421, "reward_std": 0.574999988079071, "rewards/reward_func/mean": 0.7124999761581421, "rewards/reward_func/std": 0.574999988079071, "sampling/importance_sampling_ratio/max": 2.044867753982544, "sampling/importance_sampling_ratio/mean": 1.2892711162567139, "sampling/importance_sampling_ratio/min": 0.7110199332237244, "sampling/sampling_logp_difference/max": 0.39438486099243164, "sampling/sampling_logp_difference/mean": 0.01551961898803711, "step": 993, "step_time": 18.498828415002208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 50.0, "completions/mean_terminated_length": 50.0, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "entropy": 0.15531335771083832, "epoch": 0.994, "frac_reward_zero_std": 0.0, "grad_norm": 1.2234525680541992, "kl": 0.013993472792208195, "learning_rate": 6.424565600315902e-10, "loss": 0.067, "num_tokens": 2786158.0, "reward": 0.23000000417232513, "reward_std": 0.5138741731643677, "rewards/reward_func/mean": 0.23000000417232513, "rewards/reward_func/std": 0.5138741731643677, "sampling/importance_sampling_ratio/max": 1.0974217653274536, "sampling/importance_sampling_ratio/mean": 0.9015105962753296, "sampling/importance_sampling_ratio/min": 0.703021764755249, "sampling/sampling_logp_difference/max": 0.3827281594276428, "sampling/sampling_logp_difference/mean": 0.016133958473801613, "step": 994, "step_time": 28.984699615975842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 63.0, "completions/max_terminated_length": 63.0, "completions/mean_length": 50.25, "completions/mean_terminated_length": 50.25, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.17106302082538605, "epoch": 0.995, "frac_reward_zero_std": 0.0, "grad_norm": 1.1084437370300293, "kl": 0.008177208714187145, "learning_rate": 4.720142650685433e-10, "loss": 0.1281, "num_tokens": 2788732.0, "reward": 0.7200000286102295, "reward_std": 0.5600000023841858, "rewards/reward_func/mean": 0.7200000286102295, "rewards/reward_func/std": 0.5600000023841858, "sampling/importance_sampling_ratio/max": 1.1129553318023682, "sampling/importance_sampling_ratio/mean": 0.9277013540267944, "sampling/importance_sampling_ratio/min": 0.5756231546401978, "sampling/sampling_logp_difference/max": 0.3986068367958069, "sampling/sampling_logp_difference/mean": 0.01679602451622486, "step": 995, "step_time": 29.27007121697534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 56.0, "completions/max_terminated_length": 56.0, "completions/mean_length": 53.75, "completions/mean_terminated_length": 53.75, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.18767443299293518, "epoch": 0.996, "frac_reward_zero_std": 0.0, "grad_norm": 1.0597896575927734, "kl": 0.00817166268825531, "learning_rate": 3.277908359194948e-10, "loss": -0.2037, "num_tokens": 2791533.0, "reward": 0.4975000023841858, "reward_std": 0.5802513957023621, "rewards/reward_func/mean": 0.4975000023841858, "rewards/reward_func/std": 0.5802513957023621, "sampling/importance_sampling_ratio/max": 1.5600533485412598, "sampling/importance_sampling_ratio/mean": 0.9307154417037964, "sampling/importance_sampling_ratio/min": 0.23852017521858215, "sampling/sampling_logp_difference/max": 0.5274460315704346, "sampling/sampling_logp_difference/mean": 0.018347494304180145, "step": 996, "step_time": 27.13238911202643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 58.0, "completions/max_terminated_length": 58.0, "completions/mean_length": 49.5, "completions/mean_terminated_length": 49.5, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "entropy": 0.18578027188777924, "epoch": 0.997, "frac_reward_zero_std": 0.0, "grad_norm": 1.6172103881835938, "kl": 0.015706490725278854, "learning_rate": 2.0978778542041223e-10, "loss": 0.1704, "num_tokens": 2794532.0, "reward": 0.48500001430511475, "reward_std": 0.5947268009185791, "rewards/reward_func/mean": 0.48500001430511475, "rewards/reward_func/std": 0.5947268605232239, "sampling/importance_sampling_ratio/max": 1.6353111267089844, "sampling/importance_sampling_ratio/mean": 0.8870031833648682, "sampling/importance_sampling_ratio/min": 0.48238202929496765, "sampling/sampling_logp_difference/max": 0.4939916729927063, "sampling/sampling_logp_difference/mean": 0.018784884363412857, "step": 997, "step_time": 24.203443703008816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 54.0, "completions/max_terminated_length": 54.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 51.25, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "entropy": 0.13702654838562012, "epoch": 0.998, "frac_reward_zero_std": 0.0, "grad_norm": 0.8184160590171814, "kl": 0.01837071403861046, "learning_rate": 1.180063513669949e-10, "loss": -0.221, "num_tokens": 2797817.0, "reward": 0.48750001192092896, "reward_std": 0.5917980670928955, "rewards/reward_func/mean": 0.48750001192092896, "rewards/reward_func/std": 0.5917980670928955, "sampling/importance_sampling_ratio/max": 1.4429056644439697, "sampling/importance_sampling_ratio/mean": 0.8643471002578735, "sampling/importance_sampling_ratio/min": 0.5936262607574463, "sampling/sampling_logp_difference/max": 0.530827522277832, "sampling/sampling_logp_difference/mean": 0.019981710240244865, "step": 998, "step_time": 38.85073777497746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 62.0, "completions/max_terminated_length": 62.0, "completions/mean_length": 52.5, "completions/mean_terminated_length": 52.5, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "entropy": 0.13236872851848602, "epoch": 0.999, "frac_reward_zero_std": 0.0, "grad_norm": 1.264488697052002, "kl": 0.03493987396359444, "learning_rate": 5.2447496503016395e-11, "loss": 0.3232, "num_tokens": 2800780.0, "reward": 0.45500001311302185, "reward_std": 0.6258060932159424, "rewards/reward_func/mean": 0.45500001311302185, "rewards/reward_func/std": 0.6258061528205872, "sampling/importance_sampling_ratio/max": 1.127501130104065, "sampling/importance_sampling_ratio/mean": 0.8220004439353943, "sampling/importance_sampling_ratio/min": 0.3521798253059387, "sampling/sampling_logp_difference/max": 0.45055508613586426, "sampling/sampling_logp_difference/mean": 0.016251809895038605, "step": 999, "step_time": 23.084633713006042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 59.0, "completions/max_terminated_length": 59.0, "completions/mean_length": 53.0, "completions/mean_terminated_length": 53.0, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "entropy": 0.1995820701122284, "epoch": 1.0, "frac_reward_zero_std": 0.0, "grad_norm": 0.39909473061561584, "kl": 0.017866650596261024, "learning_rate": 1.3111908510332616e-11, "loss": 0.2663, "num_tokens": 2804346.0, "reward": 0.23749999701976776, "reward_std": 0.508486270904541, "rewards/reward_func/mean": 0.23749999701976776, "rewards/reward_func/std": 0.5084863305091858, "sampling/importance_sampling_ratio/max": 0.9968469142913818, "sampling/importance_sampling_ratio/mean": 0.5456133484840393, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.5999367237091064, "sampling/sampling_logp_difference/mean": 0.02517210878431797, "step": 1000, "step_time": 39.248020336963236 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 2804346, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }