{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.30175015087507545, "eval_steps": 250, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12125, "completions/max_length": 255.34, "completions/max_terminated_length": 252.14, "completions/mean_length": 221.534375, "completions/mean_terminated_length": 216.93697082519532, "completions/min_length": 173.54, "completions/min_terminated_length": 173.54, "entropy": 0.10048629969358444, "epoch": 0.030175015087507542, "frac_reward_zero_std": 0.3225, "grad_norm": 0.46380576491355896, "learning_rate": 5e-05, "loss": 0.004, "num_tokens": 8142396.0, "reward": 7.30375, "reward_std": 1.5006456315517425, "rewards/event_reward_fn/mean": 7.30375, "rewards/event_reward_fn/std": 6.278198585510254, "step": 50, "step_time": 40.824848868116966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068125, "completions/max_length": 251.74, "completions/max_terminated_length": 248.06, "completions/mean_length": 215.08625, "completions/mean_terminated_length": 212.25316284179686, "completions/min_length": 171.76, "completions/min_terminated_length": 171.76, "entropy": 0.10318506792187691, "epoch": 0.060350030175015085, "frac_reward_zero_std": 0.325, "grad_norm": 0.21978232264518738, "learning_rate": 5e-05, "loss": -0.0025, "num_tokens": 16421719.0, "reward": 7.36875, "reward_std": 1.3263894939422607, "rewards/event_reward_fn/mean": 7.36875, "rewards/event_reward_fn/std": 6.119045643806458, "step": 100, "step_time": 38.99798643006128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4825, "completions/max_length": 256.0, "completions/max_terminated_length": 251.32, "completions/mean_length": 238.104375, "completions/mean_terminated_length": 221.8957485961914, "completions/min_length": 191.34, "completions/min_terminated_length": 191.34, "entropy": 0.10444845259189606, "epoch": 0.09052504526252263, "frac_reward_zero_std": 0.2925, "grad_norm": 0.5579063892364502, "learning_rate": 5e-05, "loss": -0.0006, "num_tokens": 24885844.0, "reward": 7.74625, "reward_std": 1.5345598912239076, "rewards/event_reward_fn/mean": 7.74625, "rewards/event_reward_fn/std": 6.464660973548889, "step": 150, "step_time": 41.26081488572061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.7925, "completions/max_length": 256.0, "completions/max_terminated_length": 202.92, "completions/mean_length": 245.916875, "completions/mean_terminated_length": 184.6587713623047, "completions/min_length": 199.94, "completions/min_terminated_length": 169.22, "entropy": 0.10581055819988251, "epoch": 0.12070006035003017, "frac_reward_zero_std": 0.33, "grad_norm": 0.31808722019195557, "learning_rate": 5e-05, "loss": 0.0003, "num_tokens": 33226966.0, "reward": 7.19125, "reward_std": 1.4298825466632843, "rewards/event_reward_fn/mean": 7.19125, "rewards/event_reward_fn/std": 5.8599746036529545, "step": 200, "step_time": 41.91275953448203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.825, "completions/max_length": 256.0, "completions/max_terminated_length": 181.12, "completions/mean_length": 245.851875, "completions/mean_terminated_length": 163.72261688232422, "completions/min_length": 198.46, "completions/min_terminated_length": 152.38, "entropy": 0.10499135926365852, "epoch": 0.15087507543753773, "frac_reward_zero_std": 0.2875, "grad_norm": 0.2646925449371338, "learning_rate": 5e-05, "loss": 0.0005, "num_tokens": 41523308.0, "reward": 7.9475, "reward_std": 1.5300491595268249, "rewards/event_reward_fn/mean": 7.9475, "rewards/event_reward_fn/std": 6.3965685844421385, "step": 250, "step_time": 41.663273623897695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.898125, "completions/max_length": 256.0, "completions/max_terminated_length": 149.62, "completions/mean_length": 250.625625, "completions/mean_terminated_length": 144.78653198242188, "completions/min_length": 215.68, "completions/min_terminated_length": 138.88, "entropy": 0.10884671121835708, "epoch": 0.18105009052504525, "frac_reward_zero_std": 0.3325, "grad_norm": 0.5418329834938049, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 49889481.0, "reward": 7.489375, "reward_std": 1.5504147619009019, "rewards/event_reward_fn/mean": 7.489375, "rewards/event_reward_fn/std": 6.099679977893829, "step": 300, "step_time": 40.817094522019616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9275, "completions/max_length": 256.0, "completions/max_terminated_length": 131.48, "completions/mean_length": 253.1625, "completions/mean_terminated_length": 125.53590209960937, "completions/min_length": 228.04, "completions/min_terminated_length": 120.52, "entropy": 0.10796756476163864, "epoch": 0.2112251056125528, "frac_reward_zero_std": 0.3175, "grad_norm": 0.4433981776237488, "learning_rate": 5e-05, "loss": 0.0019, "num_tokens": 58206892.0, "reward": 7.89625, "reward_std": 1.573977051973343, "rewards/event_reward_fn/mean": 7.89625, "rewards/event_reward_fn/std": 6.586006484031677, "step": 350, "step_time": 42.12015992245928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.945625, "completions/max_length": 256.0, "completions/max_terminated_length": 151.42, "completions/mean_length": 254.76625, "completions/mean_terminated_length": 146.34000091552736, "completions/min_length": 238.42, "completions/min_terminated_length": 141.14, "entropy": 0.11530103281140328, "epoch": 0.24140012070006034, "frac_reward_zero_std": 0.29, "grad_norm": 0.3932775855064392, "learning_rate": 5e-05, "loss": 0.0001, "num_tokens": 66513664.0, "reward": 7.304375, "reward_std": 1.552179645895958, "rewards/event_reward_fn/mean": 7.304375, "rewards/event_reward_fn/std": 5.687906408309937, "step": 400, "step_time": 40.78123372233997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.92375, "completions/max_length": 256.0, "completions/max_terminated_length": 185.1, "completions/mean_length": 254.35875, "completions/mean_terminated_length": 178.61883544921875, "completions/min_length": 232.56, "completions/min_terminated_length": 171.12, "entropy": 0.13443249970674515, "epoch": 0.27157513578756787, "frac_reward_zero_std": 0.315, "grad_norm": 0.2284364551305771, "learning_rate": 5e-05, "loss": -0.0013, "num_tokens": 74493599.0, "reward": 7.766875, "reward_std": 1.5890911322832109, "rewards/event_reward_fn/mean": 7.766875, "rewards/event_reward_fn/std": 6.074563751220703, "step": 450, "step_time": 40.8964025861409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.983125, "completions/max_length": 256.0, "completions/max_terminated_length": 73.22, "completions/mean_length": 255.728125, "completions/mean_terminated_length": 72.48666687011719, "completions/min_length": 250.14, "completions/min_terminated_length": 70.94, "entropy": 0.1348781806230545, "epoch": 0.30175015087507545, "frac_reward_zero_std": 0.32, "grad_norm": 0.44683775305747986, "learning_rate": 5e-05, "loss": 0.0006, "num_tokens": 82766712.0, "reward": 7.835625, "reward_std": 1.6530324041843414, "rewards/event_reward_fn/mean": 7.835625, "rewards/event_reward_fn/std": 6.139980282783508, "step": 500, "step_time": 41.13054014526191 } ], "logging_steps": 50, "max_steps": 16570, "num_input_tokens_seen": 82766712, "num_train_epochs": 10, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }