{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00011, "eval_steps": 500, "global_step": 11, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1752.0, "completions/max_terminated_length": 1752.0, "completions/mean_length": 1533.40625, "completions/mean_terminated_length": 1526.3333740234375, "completions/min_length": 1257.0, "completions/min_terminated_length": 1257.0, "entropy": 0.3670750502496958, "epoch": 1e-05, "frac_reward_zero_std": 0.0, "grad_norm": 5.134232521057129, "kl": 0.0, "learning_rate": 0.0, "loss": 0.5565, "num_tokens": 70741.0, "reward": -2.7539215087890625, "reward_std": 8.533812522888184, "rewards/rollout_reward_func/mean": -2.7539215087890625, "rewards/rollout_reward_func/std": 9.058825492858887, "sampling/importance_sampling_ratio/max": 2.21063232421875, "sampling/importance_sampling_ratio/mean": 0.9861273765563965, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.751568794250488, "sampling/sampling_logp_difference/mean": 0.06819069385528564, "step": 1, "step_time": 37.74923437399957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3670750502496958, "epoch": 2e-05, "grad_norm": 5.1552276611328125, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": 0.5565, "step": 2, "step_time": 6.120819243000369 }, { "clip_ratio/high_max": 0.017710780957713723, "clip_ratio/high_mean": 0.009723446099087596, "clip_ratio/low_mean": 0.0025572447921149433, "clip_ratio/low_min": 0.0016891892300918698, "clip_ratio/region_mean": 0.012280690658371896, "completions/clipped_ratio": 0.0, "completions/max_length": 1780.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 1557.40625, "completions/mean_terminated_length": 1557.40625, "completions/min_length": 1426.0, "completions/min_terminated_length": 1426.0, "entropy": 0.38403771445155144, "epoch": 3e-05, "frac_reward_zero_std": 0.0, "grad_norm": 4.518792152404785, "kl": 0.003740338608622551, "learning_rate": 5.714285714285715e-07, "loss": 0.1169, "num_tokens": 142373.0, "reward": -0.3596813678741455, "reward_std": 4.564228534698486, "rewards/rollout_reward_func/mean": -0.3596813678741455, "rewards/rollout_reward_func/std": 10.295997619628906, "sampling/importance_sampling_ratio/max": 2.081014394760132, "sampling/importance_sampling_ratio/mean": 0.9942376613616943, "sampling/importance_sampling_ratio/min": 5.060187137039498e-36, "sampling/sampling_logp_difference/max": 29.89274787902832, "sampling/sampling_logp_difference/mean": 0.13873185217380524, "step": 3, "step_time": 36.35921934200087 }, { "clip_ratio/high_max": 0.011207867413759232, "clip_ratio/high_mean": 0.00647198932711035, "clip_ratio/low_mean": 0.00706147204618901, "clip_ratio/low_min": 0.0017361111240461469, "clip_ratio/region_mean": 0.013533461140468717, "entropy": 0.38358503952622414, "epoch": 4e-05, "grad_norm": 4.347132205963135, "kl": 0.011755978513974696, "learning_rate": 8.571428571428572e-07, "loss": 0.119, "step": 4, "step_time": 6.707314684997982 }, { "clip_ratio/high_max": 0.012880883645266294, "clip_ratio/high_mean": 0.008077950216829777, "clip_ratio/low_mean": 0.005651012295857072, "clip_ratio/low_min": 0.0016891892300918698, "clip_ratio/region_mean": 0.013728962745517492, "completions/clipped_ratio": 0.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 1573.75, "completions/mean_terminated_length": 1573.75, "completions/min_length": 1443.0, "completions/min_terminated_length": 1443.0, "entropy": 0.3456532806158066, "epoch": 5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 4.78507661819458, "kl": 0.004491248982958496, "learning_rate": 1.142857142857143e-06, "loss": -0.0491, "num_tokens": 214405.0, "reward": -2.5846972465515137, "reward_std": 9.13107681274414, "rewards/rollout_reward_func/mean": -2.5846972465515137, "rewards/rollout_reward_func/std": 12.559075355529785, "sampling/importance_sampling_ratio/max": 2.6732370853424072, "sampling/importance_sampling_ratio/mean": 0.8290778398513794, "sampling/importance_sampling_ratio/min": 4.706499113173833e-40, "sampling/sampling_logp_difference/max": 28.09636878967285, "sampling/sampling_logp_difference/mean": 0.18767967820167542, "step": 5, "step_time": 34.56705875800071 }, { "clip_ratio/high_max": 0.015001210267655551, "clip_ratio/high_mean": 0.007500605133827776, "clip_ratio/low_mean": 0.010737838223576546, "clip_ratio/low_min": 0.004310344811528921, "clip_ratio/region_mean": 0.018238443473819643, "entropy": 0.345603134483099, "epoch": 6e-05, "grad_norm": 4.702767372131348, "kl": 0.00609777684439905, "learning_rate": 1.4285714285714286e-06, "loss": -0.0462, "step": 6, "step_time": 6.165537762000895 }, { "clip_ratio/high_max": 0.026439692708663642, "clip_ratio/high_mean": 0.01679553568828851, "clip_ratio/low_mean": 0.00500575453042984, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021801290102303028, "completions/clipped_ratio": 0.0625, "completions/max_length": 1775.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 1541.125, "completions/mean_terminated_length": 1525.7333984375, "completions/min_length": 1423.0, "completions/min_terminated_length": 1423.0, "entropy": 0.39175233244895935, "epoch": 7e-05, "frac_reward_zero_std": 0.0, "grad_norm": 12.207725524902344, "kl": 0.007055700887576677, "learning_rate": 1.7142857142857145e-06, "loss": 0.1615, "num_tokens": 284511.0, "reward": -8.317660331726074, "reward_std": 4.3451619148254395, "rewards/rollout_reward_func/mean": -8.317660331726074, "rewards/rollout_reward_func/std": 4.4500837326049805, "sampling/importance_sampling_ratio/max": 1.7571941614151, "sampling/importance_sampling_ratio/mean": 0.8103891611099243, "sampling/importance_sampling_ratio/min": 1.6684203226478317e-26, "sampling/sampling_logp_difference/max": 24.94557762145996, "sampling/sampling_logp_difference/mean": 0.07305071502923965, "step": 7, "step_time": 35.98801361099686 }, { "clip_ratio/high_max": 0.02302091906312853, "clip_ratio/high_mean": 0.012378515035379678, "clip_ratio/low_mean": 0.006711377413012087, "clip_ratio/low_min": 0.0031250000465661287, "clip_ratio/region_mean": 0.019089892564807087, "entropy": 0.39244432002305984, "epoch": 8e-05, "grad_norm": 5.381319522857666, "kl": 0.01739203451143112, "learning_rate": 2.0000000000000003e-06, "loss": 0.1543, "step": 8, "step_time": 6.732931664002535 }, { "clip_ratio/high_max": 0.012895698891952634, "clip_ratio/high_mean": 0.008981633465737104, "clip_ratio/low_mean": 0.008420342986937612, "clip_ratio/low_min": 0.0014534883666783571, "clip_ratio/region_mean": 0.01740197604522109, "completions/clipped_ratio": 0.0625, "completions/max_length": 1966.0, "completions/max_terminated_length": 1855.0, "completions/mean_length": 1668.375, "completions/mean_terminated_length": 1649.5667724609375, "completions/min_length": 1461.0, "completions/min_terminated_length": 1461.0, "entropy": 0.3468770608305931, "epoch": 9e-05, "frac_reward_zero_std": 0.0, "grad_norm": 2.7011311054229736, "kl": 0.004557034379104152, "learning_rate": 2.285714285714286e-06, "loss": 0.2606, "num_tokens": 359632.0, "reward": -4.431756019592285, "reward_std": 7.569368362426758, "rewards/rollout_reward_func/mean": -4.431756019592285, "rewards/rollout_reward_func/std": 7.730715274810791, "sampling/importance_sampling_ratio/max": 2.27473521232605, "sampling/importance_sampling_ratio/mean": 1.0028448104858398, "sampling/importance_sampling_ratio/min": 1.6143619129999664e-21, "sampling/sampling_logp_difference/max": 28.98799705505371, "sampling/sampling_logp_difference/mean": 0.07533948123455048, "step": 9, "step_time": 38.395322346001194 }, { "clip_ratio/high_max": 0.0125588474329561, "clip_ratio/high_mean": 0.009703430347144604, "clip_ratio/low_mean": 0.0093731161323376, "clip_ratio/low_min": 0.0016891892300918698, "clip_ratio/region_mean": 0.019076546072028577, "entropy": 0.34513476490974426, "epoch": 0.0001, "grad_norm": 2.967703104019165, "kl": 0.00513960600073915, "learning_rate": 2.571428571428571e-06, "loss": 0.2603, "step": 10, "step_time": 6.693931524998334 }, { "clip_ratio/high_max": 0.02302549034357071, "clip_ratio/high_mean": 0.01321350410580635, "clip_ratio/low_mean": 0.010665920155588537, "clip_ratio/low_min": 0.0035211266949772835, "clip_ratio/region_mean": 0.023879424086771905, "completions/clipped_ratio": 0.0, "completions/max_length": 1776.0, "completions/max_terminated_length": 1776.0, "completions/mean_length": 1566.75, "completions/mean_terminated_length": 1566.75, "completions/min_length": 1419.0, "completions/min_terminated_length": 1419.0, "entropy": 0.40919550508260727, "epoch": 0.00011, "frac_reward_zero_std": 0.0, "grad_norm": 4.4168381690979, "kl": 0.008918058272683993, "learning_rate": 2.8571428571428573e-06, "loss": -0.3278, "num_tokens": 430181.0, "reward": -0.3421344757080078, "reward_std": 7.553745269775391, "rewards/rollout_reward_func/mean": -0.3421344757080078, "rewards/rollout_reward_func/std": 11.30397891998291, "sampling/importance_sampling_ratio/max": 2.6663432121276855, "sampling/importance_sampling_ratio/mean": 0.8930033445358276, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 29.21600341796875, "sampling/sampling_logp_difference/mean": 0.22408267855644226, "step": 11, "step_time": 35.09687606099942 } ], "logging_steps": 1.0, "max_steps": 500000, "num_input_tokens_seen": 430181, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }