{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00055, "eval_steps": 500, "global_step": 55, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.940144538879395, "epoch": 1e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.03423422574996948, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0013, "num_tokens": 35616.0, "reward": -0.7051675319671631, "reward_std": 0.7764065265655518, "rewards/rollout_reward_func/mean": -0.7051675319671631, "rewards/rollout_reward_func/std": 0.75037682056427, "sampling/importance_sampling_ratio/max": 0.06733503937721252, "sampling/importance_sampling_ratio/mean": 0.035891756415367126, "sampling/importance_sampling_ratio/min": 0.012922381982207298, "sampling/sampling_logp_difference/max": 2.4574475288391113, "sampling/sampling_logp_difference/mean": 1.7373101711273193, "step": 1, "step_time": 6.607899043003272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.940144538879395, "epoch": 2e-05, "grad_norm": 0.03577549755573273, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": -0.0013, "step": 2, "step_time": 2.9063545979988703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.965680599212646, "epoch": 3e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.017016781494021416, "kl": 0.0007822737097740173, "learning_rate": 5.714285714285715e-07, "loss": -0.0006, "num_tokens": 71095.0, "reward": -0.9110076427459717, "reward_std": 0.6931561231613159, "rewards/rollout_reward_func/mean": -0.9110076427459717, "rewards/rollout_reward_func/std": 0.6800154447555542, "sampling/importance_sampling_ratio/max": 0.06864165514707565, "sampling/importance_sampling_ratio/mean": 0.03215230628848076, "sampling/importance_sampling_ratio/min": 0.011430883780121803, "sampling/sampling_logp_difference/max": 2.474456548690796, "sampling/sampling_logp_difference/mean": 1.8041703701019287, "step": 3, "step_time": 5.5894952089984145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.965598821640015, "epoch": 4e-05, "grad_norm": 0.01733771711587906, "kl": 0.0007491949945688248, "learning_rate": 8.571428571428572e-07, "loss": -0.0006, "step": 4, "step_time": 3.4044442560007155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.53125, "completions/mean_terminated_length": 2.096774101257324, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.834780097007751, "epoch": 5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.02108524739742279, "kl": 0.0009654137102188542, "learning_rate": 1.142857142857143e-06, "loss": -0.0003, "num_tokens": 106490.0, "reward": -0.5540984869003296, "reward_std": 0.8771607279777527, "rewards/rollout_reward_func/mean": -0.5540984869003296, "rewards/rollout_reward_func/std": 0.8618184924125671, "sampling/importance_sampling_ratio/max": 0.07213470339775085, "sampling/importance_sampling_ratio/mean": 0.03297191113233566, "sampling/importance_sampling_ratio/min": 3.0050444771445584e-11, "sampling/sampling_logp_difference/max": 4.576776504516602, "sampling/sampling_logp_difference/mean": 1.773134469985962, "step": 5, "step_time": 6.008008040997083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.827986240386963, "epoch": 6e-05, "grad_norm": 0.021368548274040222, "kl": 0.0009469666983932257, "learning_rate": 1.4285714285714286e-06, "loss": -0.0004, "step": 6, "step_time": 2.88994878000085 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.875, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.796356916427612, "epoch": 7e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.013663483783602715, "kl": 0.0008234605193138123, "learning_rate": 1.7142857142857145e-06, "loss": 0.0, "num_tokens": 142069.0, "reward": -0.8088920712471008, "reward_std": 0.7424027323722839, "rewards/rollout_reward_func/mean": -0.8088920712471008, "rewards/rollout_reward_func/std": 0.7662962675094604, "sampling/importance_sampling_ratio/max": 0.057457707822322845, "sampling/importance_sampling_ratio/mean": 0.02730659209191799, "sampling/importance_sampling_ratio/min": 7.280681058041694e-10, "sampling/sampling_logp_difference/max": 4.222927093505859, "sampling/sampling_logp_difference/mean": 1.6366889476776123, "step": 7, "step_time": 5.921918200005166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.786743104457855, "epoch": 8e-05, "grad_norm": 0.013285573571920395, "kl": 0.0009508101793471724, "learning_rate": 2.0000000000000003e-06, "loss": -0.0, "step": 8, "step_time": 2.9387520060008683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.922944903373718, "epoch": 9e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.01735098287463188, "kl": 0.0008866805583238602, "learning_rate": 2.285714285714286e-06, "loss": -0.0002, "num_tokens": 176547.0, "reward": -0.618694543838501, "reward_std": 0.8990023136138916, "rewards/rollout_reward_func/mean": -0.618694543838501, "rewards/rollout_reward_func/std": 0.8754127621650696, "sampling/importance_sampling_ratio/max": 0.06334654986858368, "sampling/importance_sampling_ratio/mean": 0.03222377225756645, "sampling/importance_sampling_ratio/min": 0.011594683863222599, "sampling/sampling_logp_difference/max": 2.4042437076568604, "sampling/sampling_logp_difference/mean": 1.7828483581542969, "step": 9, "step_time": 5.69735375300661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.913637280464172, "epoch": 0.0001, "grad_norm": 0.017596419900655746, "kl": 0.000972965732216835, "learning_rate": 2.571428571428571e-06, "loss": -0.0002, "step": 10, "step_time": 3.580516988000454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.15625, "completions/mean_terminated_length": 2.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.979163527488708, "epoch": 0.00011, "frac_reward_zero_std": 0.0, "grad_norm": 0.01395466923713684, "kl": 0.0012971882097190246, "learning_rate": 2.8571428571428573e-06, "loss": -0.0002, "num_tokens": 210671.0, "reward": -0.6838527917861938, "reward_std": 0.7062864899635315, "rewards/rollout_reward_func/mean": -0.6838527917861938, "rewards/rollout_reward_func/std": 0.7574694752693176, "sampling/importance_sampling_ratio/max": 0.06857945024967194, "sampling/importance_sampling_ratio/mean": 0.03003668040037155, "sampling/importance_sampling_ratio/min": 7.147054475353798e-06, "sampling/sampling_logp_difference/max": 4.250937461853027, "sampling/sampling_logp_difference/mean": 1.8635720014572144, "step": 11, "step_time": 6.033825367005193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.966804146766663, "epoch": 0.00012, "grad_norm": 0.01391494832932949, "kl": 0.0018893439264502376, "learning_rate": 3.142857142857143e-06, "loss": -0.0002, "step": 12, "step_time": 2.8316939499891305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.28125, "completions/mean_terminated_length": 2.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.810548543930054, "epoch": 0.00013, "frac_reward_zero_std": 0.0, "grad_norm": 0.014992384240031242, "kl": 0.003853602087474428, "learning_rate": 3.428571428571429e-06, "loss": 0.0001, "num_tokens": 245676.0, "reward": -0.6364654302597046, "reward_std": 0.7521181106567383, "rewards/rollout_reward_func/mean": -0.6364654302597046, "rewards/rollout_reward_func/std": 0.7526334524154663, "sampling/importance_sampling_ratio/max": 0.06722358614206314, "sampling/importance_sampling_ratio/mean": 0.03307785466313362, "sampling/importance_sampling_ratio/min": 3.5045477488893084e-06, "sampling/sampling_logp_difference/max": 4.873165607452393, "sampling/sampling_logp_difference/mean": 1.8621257543563843, "step": 13, "step_time": 5.768471701994713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.802866578102112, "epoch": 0.00014, "grad_norm": 0.014974371530115604, "kl": 0.004468549799639732, "learning_rate": 3.7142857142857146e-06, "loss": 0.0001, "step": 14, "step_time": 2.8839251570025226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.717366337776184, "epoch": 0.00015, "frac_reward_zero_std": 0.0, "grad_norm": 0.020232753828167915, "kl": 0.004971407979610376, "learning_rate": 4.000000000000001e-06, "loss": -0.0008, "num_tokens": 282194.0, "reward": -0.7452265024185181, "reward_std": 0.7260236144065857, "rewards/rollout_reward_func/mean": -0.7452265024185181, "rewards/rollout_reward_func/std": 0.7854404449462891, "sampling/importance_sampling_ratio/max": 0.084043949842453, "sampling/importance_sampling_ratio/mean": 0.03686349838972092, "sampling/importance_sampling_ratio/min": 9.963324609785218e-10, "sampling/sampling_logp_difference/max": 3.4498603343963623, "sampling/sampling_logp_difference/mean": 1.676363468170166, "step": 15, "step_time": 5.806083219005814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.69223439693451, "epoch": 0.00016, "grad_norm": 0.020264672115445137, "kl": 0.005897294729948044, "learning_rate": 4.2857142857142855e-06, "loss": -0.0008, "step": 16, "step_time": 3.649606159000541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.1875, "completions/mean_terminated_length": 2.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.732311606407166, "epoch": 0.00017, "frac_reward_zero_std": 0.0, "grad_norm": 0.019782064482569695, "kl": 0.007078325026668608, "learning_rate": 4.571428571428572e-06, "loss": -0.0, "num_tokens": 317749.0, "reward": -0.5659611821174622, "reward_std": 0.7136144042015076, "rewards/rollout_reward_func/mean": -0.5659611821174622, "rewards/rollout_reward_func/std": 0.7692865133285522, "sampling/importance_sampling_ratio/max": 0.08927696198225021, "sampling/importance_sampling_ratio/mean": 0.034128978848457336, "sampling/importance_sampling_ratio/min": 6.115115684224293e-05, "sampling/sampling_logp_difference/max": 2.444645404815674, "sampling/sampling_logp_difference/mean": 1.729607105255127, "step": 17, "step_time": 6.096930697000062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 8.71469098329544, "epoch": 0.00018, "grad_norm": 0.0198439322412014, "kl": 0.00980698294006288, "learning_rate": 4.857142857142858e-06, "loss": -0.0001, "step": 18, "step_time": 2.8486052290027146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13.0, "completions/max_terminated_length": 13.0, "completions/mean_length": 2.34375, "completions/mean_terminated_length": 2.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.700989127159119, "epoch": 0.00019, "frac_reward_zero_std": 0.0, "grad_norm": 0.02092764899134636, "kl": 0.016879421891644597, "learning_rate": 5.142857142857142e-06, "loss": -0.0005, "num_tokens": 353593.0, "reward": -0.5766444802284241, "reward_std": 0.8734984397888184, "rewards/rollout_reward_func/mean": -0.5766444802284241, "rewards/rollout_reward_func/std": 0.8666929602622986, "sampling/importance_sampling_ratio/max": 0.10328938066959381, "sampling/importance_sampling_ratio/mean": 0.0412919819355011, "sampling/importance_sampling_ratio/min": 8.264829792770101e-11, "sampling/sampling_logp_difference/max": 3.909327507019043, "sampling/sampling_logp_difference/mean": 1.7047920227050781, "step": 19, "step_time": 5.767663798993453 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 8.635276675224304, "epoch": 0.0002, "grad_norm": 0.02117123454809189, "kl": 0.022729096352122724, "learning_rate": 5.428571428571429e-06, "loss": -0.0005, "step": 20, "step_time": 2.8989755920047173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 2.15625, "completions/mean_terminated_length": 2.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.578810691833496, "epoch": 0.00021, "frac_reward_zero_std": 0.0, "grad_norm": 0.044351667165756226, "kl": 0.03346684481948614, "learning_rate": 5.7142857142857145e-06, "loss": -0.0024, "num_tokens": 388691.0, "reward": -0.6427146196365356, "reward_std": 0.8122553825378418, "rewards/rollout_reward_func/mean": -0.6427146196365356, "rewards/rollout_reward_func/std": 0.7960423827171326, "sampling/importance_sampling_ratio/max": 0.10920954495668411, "sampling/importance_sampling_ratio/mean": 0.04724588990211487, "sampling/importance_sampling_ratio/min": 2.8349152216833318e-06, "sampling/sampling_logp_difference/max": 3.772367477416992, "sampling/sampling_logp_difference/mean": 1.6777459383010864, "step": 21, "step_time": 5.727157995002926 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 8.421014785766602, "epoch": 0.00022, "grad_norm": 0.044636089354753494, "kl": 0.047105960082262754, "learning_rate": 6e-06, "loss": -0.0026, "step": 22, "step_time": 4.04690631700214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.332530975341797, "epoch": 0.00023, "frac_reward_zero_std": 0.0, "grad_norm": 0.0429239459335804, "kl": 0.07239408232271671, "learning_rate": 6.285714285714286e-06, "loss": -0.0028, "num_tokens": 424016.0, "reward": -0.6825613975524902, "reward_std": 0.8769230246543884, "rewards/rollout_reward_func/mean": -0.6825613975524902, "rewards/rollout_reward_func/std": 0.852479875087738, "sampling/importance_sampling_ratio/max": 0.14365191757678986, "sampling/importance_sampling_ratio/mean": 0.05794315040111542, "sampling/importance_sampling_ratio/min": 0.008735693991184235, "sampling/sampling_logp_difference/max": 2.5880439281463623, "sampling/sampling_logp_difference/mean": 1.6414165496826172, "step": 23, "step_time": 5.61804673000006 }, { "clip_ratio/high_max": 0.21875, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 8.144118428230286, "epoch": 0.00024, "grad_norm": 0.0180932879447937, "kl": 0.0962864700704813, "learning_rate": 6.571428571428572e-06, "loss": -0.0031, "step": 24, "step_time": 2.8988405260024592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.1875, "completions/mean_terminated_length": 2.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 8.125683069229126, "epoch": 0.00025, "frac_reward_zero_std": 0.0, "grad_norm": 0.03623996675014496, "kl": 0.10392077919095755, "learning_rate": 6.857142857142858e-06, "loss": -0.0041, "num_tokens": 459589.0, "reward": -0.61258465051651, "reward_std": 0.871542751789093, "rewards/rollout_reward_func/mean": -0.61258465051651, "rewards/rollout_reward_func/std": 0.8524011969566345, "sampling/importance_sampling_ratio/max": 0.16312259435653687, "sampling/importance_sampling_ratio/mean": 0.06305442750453949, "sampling/importance_sampling_ratio/min": 1.7614916032471228e-06, "sampling/sampling_logp_difference/max": 4.772340774536133, "sampling/sampling_logp_difference/mean": 1.7246109247207642, "step": 25, "step_time": 5.539267299005587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.976094305515289, "epoch": 0.00026, "grad_norm": 0.030301710590720177, "kl": 0.13206800539046526, "learning_rate": 7.1428571428571436e-06, "loss": -0.0045, "step": 26, "step_time": 2.896310984997399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.4375, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.269331395626068, "epoch": 0.00027, "frac_reward_zero_std": 0.0, "grad_norm": 0.022369084879755974, "kl": 0.16119840927422047, "learning_rate": 7.428571428571429e-06, "loss": -0.0036, "num_tokens": 496650.0, "reward": -0.7243883013725281, "reward_std": 0.7688334584236145, "rewards/rollout_reward_func/mean": -0.7243883013725281, "rewards/rollout_reward_func/std": 0.7527879476547241, "sampling/importance_sampling_ratio/max": 0.18785437941551208, "sampling/importance_sampling_ratio/mean": 0.10117587447166443, "sampling/importance_sampling_ratio/min": 8.512477528421769e-11, "sampling/sampling_logp_difference/max": 4.909823417663574, "sampling/sampling_logp_difference/mean": 1.4340462684631348, "step": 27, "step_time": 6.4576256859945715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 7.120794892311096, "epoch": 0.00028, "grad_norm": 0.02468658983707428, "kl": 0.182576522231102, "learning_rate": 7.714285714285716e-06, "loss": -0.0038, "step": 28, "step_time": 3.5662226489985187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 7.656641006469727, "epoch": 0.00029, "frac_reward_zero_std": 0.0, "grad_norm": 0.016346270218491554, "kl": 0.1884312927722931, "learning_rate": 8.000000000000001e-06, "loss": -0.0025, "num_tokens": 531329.0, "reward": -0.6714671850204468, "reward_std": 0.8514942526817322, "rewards/rollout_reward_func/mean": -0.6714671850204468, "rewards/rollout_reward_func/std": 0.8725821375846863, "sampling/importance_sampling_ratio/max": 0.2034609168767929, "sampling/importance_sampling_ratio/mean": 0.0898696631193161, "sampling/importance_sampling_ratio/min": 0.008383152075111866, "sampling/sampling_logp_difference/max": 2.7939882278442383, "sampling/sampling_logp_difference/mean": 1.5407953262329102, "step": 29, "step_time": 5.637962408003659 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 7.5971901416778564, "epoch": 0.0003, "grad_norm": 0.013556623831391335, "kl": 0.20893656089901924, "learning_rate": 8.285714285714287e-06, "loss": -0.0026, "step": 30, "step_time": 2.9192343930008064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 9.0, "completions/max_terminated_length": 9.0, "completions/mean_length": 2.21875, "completions/mean_terminated_length": 2.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.758796453475952, "epoch": 0.00031, "frac_reward_zero_std": 0.0, "grad_norm": 0.019501112401485443, "kl": 0.3079346362501383, "learning_rate": 8.571428571428571e-06, "loss": -0.0031, "num_tokens": 567805.0, "reward": -0.568469762802124, "reward_std": 0.8567708730697632, "rewards/rollout_reward_func/mean": -0.568469762802124, "rewards/rollout_reward_func/std": 0.8660122752189636, "sampling/importance_sampling_ratio/max": 0.22180257737636566, "sampling/importance_sampling_ratio/mean": 0.12508273124694824, "sampling/importance_sampling_ratio/min": 3.750224089604792e-11, "sampling/sampling_logp_difference/max": 5.136954307556152, "sampling/sampling_logp_difference/mean": 1.4436562061309814, "step": 31, "step_time": 5.43906901200171 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 6.6603924036026, "epoch": 0.00032, "grad_norm": 0.01703478768467903, "kl": 0.3732527755200863, "learning_rate": 8.857142857142858e-06, "loss": -0.0032, "step": 32, "step_time": 2.930219074998604 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.725336015224457, "epoch": 0.00033, "frac_reward_zero_std": 0.0, "grad_norm": 0.016738811507821083, "kl": 0.27567504718899727, "learning_rate": 9.142857142857144e-06, "loss": -0.0019, "num_tokens": 603268.0, "reward": -0.30585941672325134, "reward_std": 0.6699719429016113, "rewards/rollout_reward_func/mean": -0.30585941672325134, "rewards/rollout_reward_func/std": 0.6897762417793274, "sampling/importance_sampling_ratio/max": 0.2360040694475174, "sampling/importance_sampling_ratio/mean": 0.1374823898077011, "sampling/importance_sampling_ratio/min": 0.006810983642935753, "sampling/sampling_logp_difference/max": 3.1095614433288574, "sampling/sampling_logp_difference/mean": 1.2982618808746338, "step": 33, "step_time": 6.234361916005582 }, { "clip_ratio/high_max": 0.09375, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 6.653369903564453, "epoch": 0.00034, "grad_norm": 0.01500980369746685, "kl": 0.29385758377611637, "learning_rate": 9.42857142857143e-06, "loss": -0.0019, "step": 34, "step_time": 2.948668930999702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.09375, "completions/mean_terminated_length": 2.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.599472224712372, "epoch": 0.00035, "frac_reward_zero_std": 0.0, "grad_norm": 0.018205825239419937, "kl": 0.35661908239126205, "learning_rate": 9.714285714285715e-06, "loss": -0.0049, "num_tokens": 639208.0, "reward": -0.5350777506828308, "reward_std": 0.7106601595878601, "rewards/rollout_reward_func/mean": -0.5350777506828308, "rewards/rollout_reward_func/std": 0.7991757392883301, "sampling/importance_sampling_ratio/max": 0.25684407353401184, "sampling/importance_sampling_ratio/mean": 0.1485980749130249, "sampling/importance_sampling_ratio/min": 3.692734389915131e-05, "sampling/sampling_logp_difference/max": 4.381838321685791, "sampling/sampling_logp_difference/mean": 1.253082275390625, "step": 35, "step_time": 5.500268681997113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 6.5274258852005005, "epoch": 0.00036, "grad_norm": 0.026637688279151917, "kl": 0.36279567517340183, "learning_rate": 1e-05, "loss": -0.0049, "step": 36, "step_time": 2.9163373929950467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.327381074428558, "epoch": 0.00037, "frac_reward_zero_std": 0.0, "grad_norm": 0.03791587054729462, "kl": 0.4374086819589138, "learning_rate": 9.999999999962232e-06, "loss": -0.0034, "num_tokens": 675224.0, "reward": -0.42839378118515015, "reward_std": 0.7165933847427368, "rewards/rollout_reward_func/mean": -0.42839378118515015, "rewards/rollout_reward_func/std": 0.6934623122215271, "sampling/importance_sampling_ratio/max": 0.2750149071216583, "sampling/importance_sampling_ratio/mean": 0.16812871396541595, "sampling/importance_sampling_ratio/min": 0.005278678145259619, "sampling/sampling_logp_difference/max": 3.2639646530151367, "sampling/sampling_logp_difference/mean": 1.122904896736145, "step": 37, "step_time": 5.68354233199716 }, { "clip_ratio/high_max": 0.09375, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 6.15696656703949, "epoch": 0.00038, "grad_norm": 0.01739896647632122, "kl": 0.46510135009884834, "learning_rate": 9.999999999848919e-06, "loss": -0.0035, "step": 38, "step_time": 2.9220271470039734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 8.0, "completions/mean_length": 2.625, "completions/mean_terminated_length": 2.1935482025146484, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.091022729873657, "epoch": 0.00039, "frac_reward_zero_std": 0.0, "grad_norm": 0.0197161715477705, "kl": 0.42657361552119255, "learning_rate": 9.99999999966007e-06, "loss": -0.0024, "num_tokens": 710988.0, "reward": -0.3023349940776825, "reward_std": 0.6465471386909485, "rewards/rollout_reward_func/mean": -0.3023349940776825, "rewards/rollout_reward_func/std": 0.6331813335418701, "sampling/importance_sampling_ratio/max": 0.2962448298931122, "sampling/importance_sampling_ratio/mean": 0.18444794416427612, "sampling/importance_sampling_ratio/min": 4.504835306867738e-12, "sampling/sampling_logp_difference/max": 4.963308334350586, "sampling/sampling_logp_difference/mean": 1.1856834888458252, "step": 39, "step_time": 7.029601453006762 }, { "clip_ratio/high_max": 0.1319444444961846, "clip_ratio/high_mean": 0.07847222150303423, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.07847222150303423, "entropy": 5.928341567516327, "epoch": 0.0004, "grad_norm": 0.028808562085032463, "kl": 0.44897962361574173, "learning_rate": 9.99999999939568e-06, "loss": -0.0025, "step": 40, "step_time": 2.9406937890053086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 6.25755649805069, "epoch": 0.00041, "frac_reward_zero_std": 0.0, "grad_norm": 0.0933663472533226, "kl": 0.48784746043384075, "learning_rate": 9.999999999055747e-06, "loss": 0.0029, "num_tokens": 745636.0, "reward": -0.19651329517364502, "reward_std": 0.5318358540534973, "rewards/rollout_reward_func/mean": -0.19651329517364502, "rewards/rollout_reward_func/std": 0.5945489406585693, "sampling/importance_sampling_ratio/max": 0.31440603733062744, "sampling/importance_sampling_ratio/mean": 0.18640094995498657, "sampling/importance_sampling_ratio/min": 0.011243580840528011, "sampling/sampling_logp_difference/max": 2.6481189727783203, "sampling/sampling_logp_difference/mean": 1.0152370929718018, "step": 41, "step_time": 5.63657486000011 }, { "clip_ratio/high_max": 0.1875, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 5.96447890996933, "epoch": 0.00042, "grad_norm": 0.02206423319876194, "kl": 0.5289704687893391, "learning_rate": 9.999999998640277e-06, "loss": 0.0027, "step": 42, "step_time": 2.8975234469944553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 5.439069867134094, "epoch": 0.00043, "frac_reward_zero_std": 0.0, "grad_norm": 0.11297624558210373, "kl": 0.4683471880853176, "learning_rate": 9.999999998149264e-06, "loss": 0.0006, "num_tokens": 781581.0, "reward": -0.44622302055358887, "reward_std": 0.68892902135849, "rewards/rollout_reward_func/mean": -0.44622302055358887, "rewards/rollout_reward_func/std": 0.7478122711181641, "sampling/importance_sampling_ratio/max": 0.3280465304851532, "sampling/importance_sampling_ratio/mean": 0.22684511542320251, "sampling/importance_sampling_ratio/min": 0.026074819266796112, "sampling/sampling_logp_difference/max": 2.0682249069213867, "sampling/sampling_logp_difference/mean": 0.8481977581977844, "step": 43, "step_time": 5.7263973810077005 }, { "clip_ratio/high_max": 0.15625, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 5.03247994184494, "epoch": 0.00044, "grad_norm": 0.06387817859649658, "kl": 0.5371211282908916, "learning_rate": 9.999999997582713e-06, "loss": 0.0004, "step": 44, "step_time": 3.3802060630041524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.09375, "completions/mean_terminated_length": 2.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.975203037261963, "epoch": 0.00045, "frac_reward_zero_std": 0.0, "grad_norm": 0.13102945685386658, "kl": 0.6513971909880638, "learning_rate": 9.999999996940621e-06, "loss": -0.0028, "num_tokens": 817273.0, "reward": -0.587563157081604, "reward_std": 0.7007678747177124, "rewards/rollout_reward_func/mean": -0.587563157081604, "rewards/rollout_reward_func/std": 0.7760494947433472, "sampling/importance_sampling_ratio/max": 0.3387902081012726, "sampling/importance_sampling_ratio/mean": 0.2464321404695511, "sampling/importance_sampling_ratio/min": 7.80636619310826e-05, "sampling/sampling_logp_difference/max": 4.357866287231445, "sampling/sampling_logp_difference/mean": 0.8462474346160889, "step": 45, "step_time": 6.2393272499975865 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.03125, "clip_ratio/region_mean": 0.09375, "entropy": 4.856449127197266, "epoch": 0.00046, "grad_norm": 0.0859452411532402, "kl": 0.6537227220833302, "learning_rate": 9.99999999622299e-06, "loss": -0.0031, "step": 46, "step_time": 2.901350881998951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.492773771286011, "epoch": 0.00047, "frac_reward_zero_std": 0.0, "grad_norm": 0.14956164360046387, "kl": 0.5590145848691463, "learning_rate": 9.999999995429816e-06, "loss": -0.0016, "num_tokens": 853016.0, "reward": -0.2771303355693817, "reward_std": 0.7537246942520142, "rewards/rollout_reward_func/mean": -0.2771303355693817, "rewards/rollout_reward_func/std": 0.7401061654090881, "sampling/importance_sampling_ratio/max": 0.34557926654815674, "sampling/importance_sampling_ratio/mean": 0.2762402594089508, "sampling/importance_sampling_ratio/min": 0.03973078727722168, "sampling/sampling_logp_difference/max": 1.9350109100341797, "sampling/sampling_logp_difference/mean": 0.7055625915527344, "step": 47, "step_time": 5.625663070004521 }, { "clip_ratio/high_max": 0.09375, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 4.3877677619457245, "epoch": 0.00048, "grad_norm": 0.06713134795427322, "kl": 0.578897014260292, "learning_rate": 9.999999994561102e-06, "loss": -0.0019, "step": 48, "step_time": 2.886007981996954 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0625, "completions/max_length": 16.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.96875, "completions/mean_terminated_length": 2.1000001430511475, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.886862337589264, "epoch": 0.00049, "frac_reward_zero_std": 0.0, "grad_norm": 0.3026506304740906, "kl": 0.6998865567147732, "learning_rate": 9.99999999361685e-06, "loss": -0.0043, "num_tokens": 888637.0, "reward": -0.3646780252456665, "reward_std": 0.7392382025718689, "rewards/rollout_reward_func/mean": -0.3646780252456665, "rewards/rollout_reward_func/std": 0.7437232136726379, "sampling/importance_sampling_ratio/max": 0.5906126499176025, "sampling/importance_sampling_ratio/mean": 0.2536194622516632, "sampling/importance_sampling_ratio/min": 5.002554794020231e-12, "sampling/sampling_logp_difference/max": 5.405303001403809, "sampling/sampling_logp_difference/mean": 1.139528751373291, "step": 49, "step_time": 5.81962778799425 }, { "clip_ratio/high_max": 0.045138888992369175, "clip_ratio/high_mean": 0.03819444449618459, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.03125, "clip_ratio/region_mean": 0.13194444426335394, "entropy": 4.753438889980316, "epoch": 0.0005, "grad_norm": 0.22047115862369537, "kl": 0.8953660875558853, "learning_rate": 9.999999992597058e-06, "loss": -0.0044, "step": 50, "step_time": 3.4302545330028806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 11.0, "completions/mean_length": 2.71875, "completions/mean_terminated_length": 2.2903225421905518, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.553148508071899, "epoch": 0.00051, "frac_reward_zero_std": 0.0, "grad_norm": 0.14803296327590942, "kl": 0.8594339191913605, "learning_rate": 9.999999991501723e-06, "loss": 0.0001, "num_tokens": 925783.0, "reward": -0.47938936948776245, "reward_std": 0.6224657893180847, "rewards/rollout_reward_func/mean": -0.47938936948776245, "rewards/rollout_reward_func/std": 0.6325410604476929, "sampling/importance_sampling_ratio/max": 0.8112522959709167, "sampling/importance_sampling_ratio/mean": 0.29810550808906555, "sampling/importance_sampling_ratio/min": 3.594766628464696e-13, "sampling/sampling_logp_difference/max": 5.109455108642578, "sampling/sampling_logp_difference/mean": 1.0870777368545532, "step": 51, "step_time": 6.738885986007517 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.02524038404226303, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02524038404226303, "entropy": 4.535911321640015, "epoch": 0.00052, "grad_norm": 0.04543463885784149, "kl": 0.7985228635370731, "learning_rate": 9.99999999033085e-06, "loss": -0.0004, "step": 52, "step_time": 3.152213782999752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2.0, "completions/max_terminated_length": 2.0, "completions/mean_length": 2.0, "completions/mean_terminated_length": 2.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.083370506763458, "epoch": 0.00053, "frac_reward_zero_std": 0.0, "grad_norm": 0.04525892809033394, "kl": 0.874318428337574, "learning_rate": 9.999999989084436e-06, "loss": -0.0025, "num_tokens": 961105.0, "reward": -0.21413108706474304, "reward_std": 0.5813945531845093, "rewards/rollout_reward_func/mean": -0.21413108706474304, "rewards/rollout_reward_func/std": 0.5861169099807739, "sampling/importance_sampling_ratio/max": 0.7165222764015198, "sampling/importance_sampling_ratio/mean": 0.3133776783943176, "sampling/importance_sampling_ratio/min": 0.013980884104967117, "sampling/sampling_logp_difference/max": 3.070335626602173, "sampling/sampling_logp_difference/mean": 0.7299262285232544, "step": 53, "step_time": 6.067928731994471 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 4.059947609901428, "epoch": 0.00054, "grad_norm": 0.07141973823308945, "kl": 0.9976279065012932, "learning_rate": 9.99999998776248e-06, "loss": -0.0025, "step": 54, "step_time": 3.118995607001125 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 5.0, "completions/max_terminated_length": 5.0, "completions/mean_length": 2.09375, "completions/mean_terminated_length": 2.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 4.261334180831909, "epoch": 0.00055, "frac_reward_zero_std": 0.0, "grad_norm": 0.06153355911374092, "kl": 0.8010737895965576, "learning_rate": 9.999999986364988e-06, "loss": 0.0033, "num_tokens": 996705.0, "reward": -0.33633241057395935, "reward_std": 0.4821242392063141, "rewards/rollout_reward_func/mean": -0.33633241057395935, "rewards/rollout_reward_func/std": 0.5220240354537964, "sampling/importance_sampling_ratio/max": 0.9735277891159058, "sampling/importance_sampling_ratio/mean": 0.2968878149986267, "sampling/importance_sampling_ratio/min": 0.0002361015067435801, "sampling/sampling_logp_difference/max": 4.416370868682861, "sampling/sampling_logp_difference/mean": 0.8390293717384338, "step": 55, "step_time": 5.6938710109971 } ], "logging_steps": 1.0, "max_steps": 700000, "num_input_tokens_seen": 996705, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }