{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 7e-05, "eval_steps": 500, "global_step": 7, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4548.0, "completions/max_terminated_length": 4548.0, "completions/mean_length": 4017.53125, "completions/mean_terminated_length": 4017.53125, "completions/min_length": 1087.0, "completions/min_terminated_length": 1087.0, "entropy": 0.2045444082468748, "epoch": 1e-05, "frac_reward_zero_std": 0.0, "grad_norm": 2.182149887084961, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0504, "num_tokens": 152249.0, "reward": -1.9340598583221436, "reward_std": 0.786571741104126, "rewards/rollout_reward_func/mean": -1.9340598583221436, "rewards/rollout_reward_func/std": 0.8494329452514648, "sampling/importance_sampling_ratio/max": 1.7909198999404907, "sampling/importance_sampling_ratio/mean": 0.9390337467193604, "sampling/importance_sampling_ratio/min": 0.2846600413322449, "sampling/sampling_logp_difference/max": 0.8398352861404419, "sampling/sampling_logp_difference/mean": 0.019423075020313263, "step": 1, "step_time": 62.780601161008235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2045444082468748, "epoch": 2e-05, "grad_norm": 2.1998178958892822, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": 0.0504, "step": 2, "step_time": 13.312115765969793 }, { "clip_ratio/high_max": 0.010176970972679555, "clip_ratio/high_mean": 0.005981342634186149, "clip_ratio/low_mean": 0.0027048319461755455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008686174580361694, "completions/clipped_ratio": 0.0, "completions/max_length": 4517.0, "completions/max_terminated_length": 4517.0, "completions/mean_length": 4186.15625, "completions/mean_terminated_length": 4186.15625, "completions/min_length": 1764.0, "completions/min_terminated_length": 1764.0, "entropy": 0.17139518074691296, "epoch": 3e-05, "frac_reward_zero_std": 0.0, "grad_norm": 4.413552761077881, "kl": 0.001802493366994895, "learning_rate": 5.714285714285715e-07, "loss": -0.0455, "num_tokens": 310339.0, "reward": -2.418001174926758, "reward_std": 0.5239017009735107, "rewards/rollout_reward_func/mean": -2.418001174926758, "rewards/rollout_reward_func/std": 0.5344496965408325, "sampling/importance_sampling_ratio/max": 2.12880802154541, "sampling/importance_sampling_ratio/mean": 1.0546352863311768, "sampling/importance_sampling_ratio/min": 0.30370277166366577, "sampling/sampling_logp_difference/max": 1.1611073017120361, "sampling/sampling_logp_difference/mean": 0.019413897767663002, "step": 3, "step_time": 61.638283942011185 }, { "clip_ratio/high_max": 0.010176970972679555, "clip_ratio/high_mean": 0.005088485486339778, "clip_ratio/low_mean": 0.0017857142956927419, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0068741998402401805, "entropy": 0.17303790897130966, "epoch": 4e-05, "grad_norm": 4.2835798263549805, "kl": 0.0016472467759740539, "learning_rate": 8.571428571428572e-07, "loss": -0.0444, "step": 4, "step_time": 14.028172414000437 }, { "clip_ratio/high_max": 0.007093983003869653, "clip_ratio/high_mean": 0.004427273175679147, "clip_ratio/low_mean": 0.003508900583256036, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007936173758935183, "completions/clipped_ratio": 0.0, "completions/max_length": 4595.0, "completions/max_terminated_length": 4595.0, "completions/mean_length": 4118.15625, "completions/mean_terminated_length": 4118.15625, "completions/min_length": 2390.0, "completions/min_terminated_length": 2390.0, "entropy": 0.17528061009943485, "epoch": 5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 5.063594818115234, "kl": 0.0027295032050460577, "learning_rate": 1.142857142857143e-06, "loss": -0.229, "num_tokens": 465900.0, "reward": -1.7591285705566406, "reward_std": 1.178617238998413, "rewards/rollout_reward_func/mean": -1.7591285705566406, "rewards/rollout_reward_func/std": 1.5458139181137085, "sampling/importance_sampling_ratio/max": 2.2674672603607178, "sampling/importance_sampling_ratio/mean": 0.9966017007827759, "sampling/importance_sampling_ratio/min": 0.47512274980545044, "sampling/sampling_logp_difference/max": 0.6794887781143188, "sampling/sampling_logp_difference/mean": 0.018155813217163086, "step": 5, "step_time": 62.168050993997895 }, { "clip_ratio/high_max": 0.010666901711374521, "clip_ratio/high_mean": 0.005333450855687261, "clip_ratio/low_mean": 0.0043769561452791095, "clip_ratio/low_min": 0.0017605633474886417, "clip_ratio/region_mean": 0.009710407059174031, "entropy": 0.17503651790320873, "epoch": 6e-05, "grad_norm": 3.878859281539917, "kl": 0.0019046790257561952, "learning_rate": 1.4285714285714286e-06, "loss": -0.2351, "step": 6, "step_time": 13.381077748003008 }, { "clip_ratio/high_max": 0.017248393152840436, "clip_ratio/high_mean": 0.00949225208023563, "clip_ratio/low_mean": 0.006492404674645513, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015984656638465822, "completions/clipped_ratio": 0.0, "completions/max_length": 4658.0, "completions/max_terminated_length": 4658.0, "completions/mean_length": 4001.125, "completions/mean_terminated_length": 4001.125, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "entropy": 0.21643172018229961, "epoch": 7e-05, "frac_reward_zero_std": 0.0, "grad_norm": 2.980289936065674, "kl": 0.001957591186510399, "learning_rate": 1.7142857142857145e-06, "loss": 0.1748, "num_tokens": 618440.0, "reward": -1.9611331224441528, "reward_std": 0.7152342796325684, "rewards/rollout_reward_func/mean": -1.9611331224441528, "rewards/rollout_reward_func/std": 0.7630742192268372, "sampling/importance_sampling_ratio/max": 1.5719738006591797, "sampling/importance_sampling_ratio/mean": 0.9174970388412476, "sampling/importance_sampling_ratio/min": 0.30265289545059204, "sampling/sampling_logp_difference/max": 0.5645420551300049, "sampling/sampling_logp_difference/mean": 0.019142862409353256, "step": 7, "step_time": 61.62411752099433 } ], "logging_steps": 1.0, "max_steps": 400000, "num_input_tokens_seen": 618440, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }