{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00023, "eval_steps": 500, "global_step": 23, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 390.1875, "completions/mean_terminated_length": 390.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.605668731033802, "epoch": 1e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.8689773678779602, "kl": 0.0, "learning_rate": 0.0, "loss": -0.009, "num_tokens": 49602.0, "reward": 0.49261724948883057, "reward_std": 1.4282547235488892, "rewards/rollout_reward_func/mean": 0.49261724948883057, "rewards/rollout_reward_func/std": 1.4220702648162842, "sampling/importance_sampling_ratio/max": 1.4268709421157837, "sampling/importance_sampling_ratio/mean": 0.8554609417915344, "sampling/importance_sampling_ratio/min": 0.6006377339363098, "sampling/sampling_logp_difference/max": 0.6218547821044922, "sampling/sampling_logp_difference/mean": 0.05964243412017822, "step": 1, "step_time": 13.768371919000856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.605668731033802, "epoch": 2e-05, "grad_norm": 0.8696622252464294, "kl": 0.0, "learning_rate": 5.333333333333333e-07, "loss": -0.009, "step": 2, "step_time": 6.979965120997804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1373.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 256.3125, "completions/mean_terminated_length": 256.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5112766288220882, "epoch": 3e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.42978498339653015, "kl": 0.004170067805148392, "learning_rate": 1.0666666666666667e-06, "loss": -0.033, "num_tokens": 91865.0, "reward": 0.04000457376241684, "reward_std": 0.8832277655601501, "rewards/rollout_reward_func/mean": 0.04000457376241684, "rewards/rollout_reward_func/std": 1.1484198570251465, "sampling/importance_sampling_ratio/max": 1.1835894584655762, "sampling/importance_sampling_ratio/mean": 0.8482605218887329, "sampling/importance_sampling_ratio/min": 0.33848339319229126, "sampling/sampling_logp_difference/max": 1.006063461303711, "sampling/sampling_logp_difference/mean": 0.05736350640654564, "step": 3, "step_time": 13.413748369999666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.02083333395421505, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 0.5041642189025879, "epoch": 4e-05, "grad_norm": 0.42657995223999023, "kl": 0.005512098512326702, "learning_rate": 1.6e-06, "loss": -0.0326, "step": 4, "step_time": 7.850364476997129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1745.0, "completions/max_terminated_length": 1745.0, "completions/mean_length": 340.96875, "completions/mean_terminated_length": 340.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.59855717420578, "epoch": 5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.033186435699463, "kl": 0.002424745060807254, "learning_rate": 2.1333333333333334e-06, "loss": 0.007, "num_tokens": 138699.0, "reward": 0.3879123330116272, "reward_std": 1.4281163215637207, "rewards/rollout_reward_func/mean": 0.3879123330116272, "rewards/rollout_reward_func/std": 1.46486234664917, "sampling/importance_sampling_ratio/max": 1.9882254600524902, "sampling/importance_sampling_ratio/mean": 0.8772280812263489, "sampling/importance_sampling_ratio/min": 2.3855912800740953e-09, "sampling/sampling_logp_difference/max": 18.587005615234375, "sampling/sampling_logp_difference/mean": 0.14903730154037476, "step": 5, "step_time": 15.827661174997047 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.03258547093719244, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0388354710303247, "entropy": 0.5908495783805847, "epoch": 6e-05, "grad_norm": 0.27227118611335754, "kl": 0.009793178239533518, "learning_rate": 2.6666666666666664e-06, "loss": 0.0062, "step": 6, "step_time": 8.810438610000347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1027.0, "completions/max_terminated_length": 1027.0, "completions/mean_length": 231.6875, "completions/mean_terminated_length": 231.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5898670703172684, "epoch": 7e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.3972772359848022, "kl": 0.002214236554209492, "learning_rate": 3.2e-06, "loss": -0.0126, "num_tokens": 182583.0, "reward": 0.27162131667137146, "reward_std": 1.0868947505950928, "rewards/rollout_reward_func/mean": 0.27162131667137146, "rewards/rollout_reward_func/std": 1.5912116765975952, "sampling/importance_sampling_ratio/max": 1.6212953329086304, "sampling/importance_sampling_ratio/mean": 0.8596766591072083, "sampling/importance_sampling_ratio/min": 0.3463258147239685, "sampling/sampling_logp_difference/max": 0.8983482718467712, "sampling/sampling_logp_difference/mean": 0.06186839938163757, "step": 7, "step_time": 12.116631305001647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01458333432674408, "clip_ratio/low_min": 0.008333333767950535, "clip_ratio/region_mean": 0.01458333432674408, "entropy": 0.5949340760707855, "epoch": 8e-05, "grad_norm": 0.7176365852355957, "kl": 0.007167384720332848, "learning_rate": 3.7333333333333333e-06, "loss": -0.0125, "step": 8, "step_time": 7.219443969997883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 139.40625, "completions/mean_terminated_length": 139.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.560302022844553, "epoch": 9e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.7369521260261536, "kl": 0.004876748149399646, "learning_rate": 4.266666666666667e-06, "loss": -0.0042, "num_tokens": 222153.0, "reward": -0.008272513747215271, "reward_std": 0.5417632460594177, "rewards/rollout_reward_func/mean": -0.008272513747215271, "rewards/rollout_reward_func/std": 0.9568253755569458, "sampling/importance_sampling_ratio/max": 1.1025569438934326, "sampling/importance_sampling_ratio/mean": 0.843124270439148, "sampling/importance_sampling_ratio/min": 0.3177212178707123, "sampling/sampling_logp_difference/max": 0.4518265426158905, "sampling/sampling_logp_difference/mean": 0.06099293380975723, "step": 9, "step_time": 9.450052213000163 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.5663095861673355, "epoch": 0.0001, "grad_norm": 0.7801238298416138, "kl": 0.006132202317530755, "learning_rate": 4.8e-06, "loss": -0.0044, "step": 10, "step_time": 5.177882021996993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 456.15625, "completions/mean_terminated_length": 456.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7295192927122116, "epoch": 0.00011, "frac_reward_zero_std": 0.0, "grad_norm": 0.7713008522987366, "kl": 0.002301187181842579, "learning_rate": 5.333333333333333e-06, "loss": -0.0233, "num_tokens": 276525.0, "reward": -0.06042708456516266, "reward_std": 0.5548678636550903, "rewards/rollout_reward_func/mean": -0.06042708456516266, "rewards/rollout_reward_func/std": 0.6719298362731934, "sampling/importance_sampling_ratio/max": 1.2587913274765015, "sampling/importance_sampling_ratio/mean": 0.7997827529907227, "sampling/importance_sampling_ratio/min": 0.49780380725860596, "sampling/sampling_logp_difference/max": 0.3962627649307251, "sampling/sampling_logp_difference/mean": 0.06726472079753876, "step": 11, "step_time": 14.024354443001357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 0.724354475736618, "epoch": 0.00012, "grad_norm": 0.5204576849937439, "kl": 0.004727993551568943, "learning_rate": 5.866666666666666e-06, "loss": -0.0237, "step": 12, "step_time": 8.684970894000799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1063.0, "completions/max_terminated_length": 1063.0, "completions/mean_length": 458.78125, "completions/mean_terminated_length": 458.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6593378074467182, "epoch": 0.00013, "frac_reward_zero_std": 0.0, "grad_norm": 1.2321019172668457, "kl": 0.026883443380938843, "learning_rate": 6.4e-06, "loss": -0.0337, "num_tokens": 329014.0, "reward": 0.5792493224143982, "reward_std": 1.281329870223999, "rewards/rollout_reward_func/mean": 0.5792493224143982, "rewards/rollout_reward_func/std": 1.3439542055130005, "sampling/importance_sampling_ratio/max": 2.4977619647979736, "sampling/importance_sampling_ratio/mean": 0.8369683623313904, "sampling/importance_sampling_ratio/min": 0.16656683385372162, "sampling/sampling_logp_difference/max": 1.4865641593933105, "sampling/sampling_logp_difference/mean": 0.08205842226743698, "step": 13, "step_time": 12.781198183001834 }, { "clip_ratio/high_max": 0.01726190559566021, "clip_ratio/high_mean": 0.008630952797830105, "clip_ratio/low_mean": 0.02495265193283558, "clip_ratio/low_min": 0.010416666977107525, "clip_ratio/region_mean": 0.033583604730665684, "entropy": 0.6469080410897732, "epoch": 0.00014, "grad_norm": 1.151137351989746, "kl": 0.02960980085481424, "learning_rate": 6.933333333333334e-06, "loss": -0.0327, "step": 14, "step_time": 6.906088376998014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1202.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 325.0625, "completions/mean_terminated_length": 325.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6098557002842426, "epoch": 0.00015, "frac_reward_zero_std": 0.25, "grad_norm": 0.4717327952384949, "kl": 0.024435755418380722, "learning_rate": 7.466666666666667e-06, "loss": -0.0108, "num_tokens": 374591.0, "reward": 0.7930901050567627, "reward_std": 0.8609839081764221, "rewards/rollout_reward_func/mean": 0.7930901050567627, "rewards/rollout_reward_func/std": 1.4151973724365234, "sampling/importance_sampling_ratio/max": 1.0506209135055542, "sampling/importance_sampling_ratio/mean": 0.8180942535400391, "sampling/importance_sampling_ratio/min": 0.21363259851932526, "sampling/sampling_logp_difference/max": 0.7379248142242432, "sampling/sampling_logp_difference/mean": 0.058229509741067886, "step": 15, "step_time": 12.683557893002217 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.022727273404598236, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02753496589139104, "entropy": 0.5996548756957054, "epoch": 0.00016, "grad_norm": 0.4265834391117096, "kl": 0.04038397324620746, "learning_rate": 8e-06, "loss": -0.0112, "step": 16, "step_time": 6.956079359999421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 171.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.644575547426939, "epoch": 0.00017, "frac_reward_zero_std": 0.25, "grad_norm": 0.8146066665649414, "kl": 0.046186436113202944, "learning_rate": 7.999999999907465e-06, "loss": -0.007, "num_tokens": 414601.0, "reward": 1.4229528903961182, "reward_std": 0.9664409160614014, "rewards/rollout_reward_func/mean": 1.4229528903961182, "rewards/rollout_reward_func/std": 1.3361284732818604, "sampling/importance_sampling_ratio/max": 1.2239395380020142, "sampling/importance_sampling_ratio/mean": 0.8816792964935303, "sampling/importance_sampling_ratio/min": 0.2790951430797577, "sampling/sampling_logp_difference/max": 0.7080600261688232, "sampling/sampling_logp_difference/mean": 0.05067237466573715, "step": 17, "step_time": 10.178569630998027 }, { "clip_ratio/high_max": 0.029513888992369175, "clip_ratio/high_mean": 0.014756944496184587, "clip_ratio/low_mean": 0.041666666977107525, "clip_ratio/low_min": 0.02083333395421505, "clip_ratio/region_mean": 0.05642361147329211, "entropy": 0.6145340763032436, "epoch": 0.00018, "grad_norm": 0.23155571520328522, "kl": 0.23800076835323125, "learning_rate": 7.999999999629861e-06, "loss": -0.0083, "step": 18, "step_time": 5.286352907998662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 226.78125, "completions/mean_terminated_length": 226.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.475945807993412, "epoch": 0.00019, "frac_reward_zero_std": 0.25, "grad_norm": 0.6718065142631531, "kl": 0.1313640770076745, "learning_rate": 7.99999999916719e-06, "loss": -0.0172, "num_tokens": 456186.0, "reward": 0.717463493347168, "reward_std": 1.0493590831756592, "rewards/rollout_reward_func/mean": 0.717463493347168, "rewards/rollout_reward_func/std": 1.385160207748413, "sampling/importance_sampling_ratio/max": 1.1975980997085571, "sampling/importance_sampling_ratio/mean": 0.9053879380226135, "sampling/importance_sampling_ratio/min": 0.5968481302261353, "sampling/sampling_logp_difference/max": 0.49695074558258057, "sampling/sampling_logp_difference/mean": 0.04921717196702957, "step": 19, "step_time": 11.41534333400341 }, { "clip_ratio/high_max": 0.034722222946584225, "clip_ratio/high_mean": 0.017361111473292112, "clip_ratio/low_mean": 0.027777778450399637, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04513888992369175, "entropy": 0.45112011581659317, "epoch": 0.0002, "grad_norm": 0.26766785979270935, "kl": 0.20291895651462255, "learning_rate": 7.999999998519449e-06, "loss": -0.0184, "step": 20, "step_time": 6.5778307339987805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1094.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 417.625, "completions/mean_terminated_length": 417.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.750504732131958, "epoch": 0.00021, "frac_reward_zero_std": 0.0, "grad_norm": 1.421618103981018, "kl": 0.1448975705425255, "learning_rate": 7.999999997686637e-06, "loss": -0.053, "num_tokens": 508448.0, "reward": 0.7684807181358337, "reward_std": 1.7323917150497437, "rewards/rollout_reward_func/mean": 0.7684807181358337, "rewards/rollout_reward_func/std": 1.6796200275421143, "sampling/importance_sampling_ratio/max": 1.4029730558395386, "sampling/importance_sampling_ratio/mean": 0.7653356194496155, "sampling/importance_sampling_ratio/min": 0.23433545231819153, "sampling/sampling_logp_difference/max": 0.8456215858459473, "sampling/sampling_logp_difference/mean": 0.08127377182245255, "step": 21, "step_time": 12.92976628100223 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.057641143910586834, "clip_ratio/low_min": 0.025252525694668293, "clip_ratio/region_mean": 0.06389114400371909, "entropy": 0.7457476258277893, "epoch": 0.00022, "grad_norm": 2.050600528717041, "kl": 1.61455141013721, "learning_rate": 7.999999996668758e-06, "loss": -0.0522, "step": 22, "step_time": 6.766588177999438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 290.15625, "completions/mean_terminated_length": 290.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.4916882663965225, "epoch": 0.00023, "frac_reward_zero_std": 0.25, "grad_norm": 1.3506258726119995, "kl": 1.069765329360962, "learning_rate": 7.99999999546581e-06, "loss": -0.0083, "num_tokens": 555104.0, "reward": 1.4062399864196777, "reward_std": 1.1519603729248047, "rewards/rollout_reward_func/mean": 1.4062399864196777, "rewards/rollout_reward_func/std": 1.410425066947937, "sampling/importance_sampling_ratio/max": 1.3898682594299316, "sampling/importance_sampling_ratio/mean": 0.8012120723724365, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.8556509017944336, "sampling/sampling_logp_difference/mean": 0.09328283369541168, "step": 23, "step_time": 11.852552100002868 } ], "logging_steps": 1.0, "max_steps": 400000, "num_input_tokens_seen": 555104, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }