{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00026, "eval_steps": 500, "global_step": 13, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1779.0, "completions/max_terminated_length": 1779.0, "completions/mean_length": 1698.46875, "completions/mean_terminated_length": 1698.46875, "completions/min_length": 1279.0, "completions/min_terminated_length": 1279.0, "entropy": 0.48651931062340736, "epoch": 2e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.5409082174301147, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0112, "num_tokens": 75031.0, "reward": -9.59624195098877, "reward_std": 5.939093589782715, "rewards/rollout_reward_func/mean": -9.59624195098877, "rewards/rollout_reward_func/std": 10.368197441101074, "sampling/importance_sampling_ratio/max": 1.3440189361572266, "sampling/importance_sampling_ratio/mean": 0.9953499436378479, "sampling/importance_sampling_ratio/min": 0.564490556716919, "sampling/sampling_logp_difference/max": 0.45447802543640137, "sampling/sampling_logp_difference/mean": 0.016698362305760384, "step": 1, "step_time": 36.680761918001735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.48651931062340736, "epoch": 4e-05, "grad_norm": 1.5392467975616455, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": 0.0112, "step": 2, "step_time": 5.709443093002847 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1816.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 1625.21875, "completions/mean_terminated_length": 1625.21875, "completions/min_length": 1159.0, "completions/min_terminated_length": 1159.0, "entropy": 0.48103801161050797, "epoch": 6e-05, "frac_reward_zero_std": 0.0, "grad_norm": 2.117859125137329, "kl": 0.0010091230506077409, "learning_rate": 5.714285714285715e-07, "loss": -0.0237, "num_tokens": 147721.0, "reward": -7.404824733734131, "reward_std": 11.744457244873047, "rewards/rollout_reward_func/mean": -7.404824733734131, "rewards/rollout_reward_func/std": 15.456405639648438, "sampling/importance_sampling_ratio/max": 1.4090882539749146, "sampling/importance_sampling_ratio/mean": 1.0395634174346924, "sampling/importance_sampling_ratio/min": 0.7728875279426575, "sampling/sampling_logp_difference/max": 0.2340834140777588, "sampling/sampling_logp_difference/mean": 0.019678719341754913, "step": 3, "step_time": 35.33501763899767 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.005859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "entropy": 0.48065420985221863, "epoch": 8e-05, "grad_norm": 2.1440107822418213, "kl": 0.0009154866565950215, "learning_rate": 8.571428571428572e-07, "loss": -0.0232, "step": 4, "step_time": 5.808208025997374 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005859375, "completions/clipped_ratio": 0.0, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 1573.65625, "completions/mean_terminated_length": 1573.65625, "completions/min_length": 1107.0, "completions/min_terminated_length": 1107.0, "entropy": 0.43740712106227875, "epoch": 0.0001, "frac_reward_zero_std": 0.0, "grad_norm": 1.864342212677002, "kl": 0.0005766874128312338, "learning_rate": 1.142857142857143e-06, "loss": -0.0206, "num_tokens": 218674.0, "reward": -14.006583213806152, "reward_std": 12.985024452209473, "rewards/rollout_reward_func/mean": -14.006583213806152, "rewards/rollout_reward_func/std": 17.190784454345703, "sampling/importance_sampling_ratio/max": 1.3863141536712646, "sampling/importance_sampling_ratio/mean": 0.9954429864883423, "sampling/importance_sampling_ratio/min": 0.6810365915298462, "sampling/sampling_logp_difference/max": 0.2415471076965332, "sampling/sampling_logp_difference/mean": 0.016646649688482285, "step": 5, "step_time": 34.27298692500153 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 0.4379243813455105, "epoch": 0.00012, "grad_norm": 1.9039454460144043, "kl": 0.00071882207703311, "learning_rate": 1.4285714285714286e-06, "loss": -0.0202, "step": 6, "step_time": 5.641448482998385 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1748.0, "completions/max_terminated_length": 1748.0, "completions/mean_length": 1575.5, "completions/mean_terminated_length": 1575.5, "completions/min_length": 1186.0, "completions/min_terminated_length": 1186.0, "entropy": 0.4470406360924244, "epoch": 0.00014, "frac_reward_zero_std": 0.0, "grad_norm": 1.4503103494644165, "kl": 0.0008566801006963942, "learning_rate": 1.7142857142857145e-06, "loss": 0.0132, "num_tokens": 289160.0, "reward": -3.2668540477752686, "reward_std": 10.61334228515625, "rewards/rollout_reward_func/mean": -3.2668540477752686, "rewards/rollout_reward_func/std": 16.216392517089844, "sampling/importance_sampling_ratio/max": 1.3690364360809326, "sampling/importance_sampling_ratio/mean": 1.0221995115280151, "sampling/importance_sampling_ratio/min": 0.6548231840133667, "sampling/sampling_logp_difference/max": 0.392575740814209, "sampling/sampling_logp_difference/mean": 0.01853613555431366, "step": 7, "step_time": 34.67648999299854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.44634250551462173, "epoch": 0.00016, "grad_norm": 1.4721945524215698, "kl": 0.0007410887337755412, "learning_rate": 2.0000000000000003e-06, "loss": 0.014, "step": 8, "step_time": 5.566038421000485 }, { "clip_ratio/high_max": 0.01171875, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1773.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 1650.5, "completions/mean_terminated_length": 1650.5, "completions/min_length": 1169.0, "completions/min_terminated_length": 1169.0, "entropy": 0.5013628304004669, "epoch": 0.00018, "frac_reward_zero_std": 0.0, "grad_norm": 2.5520236492156982, "kl": 0.001372927552438341, "learning_rate": 2.285714285714286e-06, "loss": -0.0308, "num_tokens": 362601.0, "reward": -13.83917236328125, "reward_std": 12.006336212158203, "rewards/rollout_reward_func/mean": -13.83917236328125, "rewards/rollout_reward_func/std": 14.237728118896484, "sampling/importance_sampling_ratio/max": 1.3693691492080688, "sampling/importance_sampling_ratio/mean": 0.9588738679885864, "sampling/importance_sampling_ratio/min": 0.5098013281822205, "sampling/sampling_logp_difference/max": 0.735576868057251, "sampling/sampling_logp_difference/mean": 0.02071024850010872, "step": 9, "step_time": 34.420860370997616 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "entropy": 0.49781131744384766, "epoch": 0.0002, "grad_norm": 2.5958364009857178, "kl": 0.0012885355827165768, "learning_rate": 2.571428571428571e-06, "loss": -0.0278, "step": 10, "step_time": 5.687426060998405 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 1532.0, "completions/mean_terminated_length": 1532.0, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "entropy": 0.42485806345939636, "epoch": 0.00022, "frac_reward_zero_std": 0.0, "grad_norm": 1.8452433347702026, "kl": 0.000992896981188096, "learning_rate": 2.8571428571428573e-06, "loss": 0.0203, "num_tokens": 431550.0, "reward": -0.7765803337097168, "reward_std": 14.750946044921875, "rewards/rollout_reward_func/mean": -0.7765803337097168, "rewards/rollout_reward_func/std": 21.5161190032959, "sampling/importance_sampling_ratio/max": 1.3237504959106445, "sampling/importance_sampling_ratio/mean": 1.0001271963119507, "sampling/importance_sampling_ratio/min": 0.6408203840255737, "sampling/sampling_logp_difference/max": 0.33285045623779297, "sampling/sampling_logp_difference/mean": 0.01815984398126602, "step": 11, "step_time": 33.55919377600003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4245442971587181, "epoch": 0.00024, "grad_norm": 1.6485886573791504, "kl": 0.0013787990028504282, "learning_rate": 3.142857142857143e-06, "loss": 0.019, "step": 12, "step_time": 5.637909467997815 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001953125, "completions/clipped_ratio": 0.0, "completions/max_length": 1791.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 1633.25, "completions/mean_terminated_length": 1633.25, "completions/min_length": 1195.0, "completions/min_terminated_length": 1195.0, "entropy": 0.478180218487978, "epoch": 0.00026, "frac_reward_zero_std": 0.0, "grad_norm": 2.3034250736236572, "kl": 0.0017315489676548168, "learning_rate": 3.428571428571429e-06, "loss": -0.0346, "num_tokens": 504355.0, "reward": -17.26664924621582, "reward_std": 14.347229957580566, "rewards/rollout_reward_func/mean": -17.26664924621582, "rewards/rollout_reward_func/std": 18.007043838500977, "sampling/importance_sampling_ratio/max": 1.7957122325897217, "sampling/importance_sampling_ratio/mean": 1.0002973079681396, "sampling/importance_sampling_ratio/min": 0.5741486549377441, "sampling/sampling_logp_difference/max": 0.5055437088012695, "sampling/sampling_logp_difference/mean": 0.024692352861166, "step": 13, "step_time": 32.15085511000143 } ], "logging_steps": 1.0, "max_steps": 100000, "num_input_tokens_seen": 504355, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }