{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.8, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 794.7712326049805, "epoch": 0.2, "grad_norm": 0.1648624688386917, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0129, "reward": 0.1897321529686451, "reward_std": 0.19452152773737907, "rewards/accuracy_reward": 0.16183036379516125, "rewards/format_reward": 0.027901787078008056, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 788.7571487426758, "epoch": 1.2, "grad_norm": 0.2475416511297226, "kl": 0.0007533133029937744, "learning_rate": 2.9392394604217463e-06, "loss": 0.024, "reward": 0.16434152494184673, "reward_std": 0.19889171607792377, "rewards/accuracy_reward": 0.13504464935977012, "rewards/format_reward": 0.029296876542503014, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 780.9437911987304, "epoch": 2.4, "grad_norm": 0.29467591643333435, "kl": 0.030749130249023437, "learning_rate": 2.3109612261833968e-06, "loss": 0.0394, "reward": 0.23705358430743217, "reward_std": 0.29603362139314415, "rewards/accuracy_reward": 0.13928571976721288, "rewards/format_reward": 0.09776786174625159, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 736.5603805541992, "epoch": 3.6, "grad_norm": 0.3751678168773651, "kl": 0.06479034423828126, "learning_rate": 1.2865277425900725e-06, "loss": 0.0791, "reward": 0.4093750208616257, "reward_std": 0.43266602158546447, "rewards/accuracy_reward": 0.12991072016302496, "rewards/format_reward": 0.2794642996042967, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 690.4044540405273, "epoch": 4.8, "grad_norm": 0.3264864385128021, "kl": 0.0665924072265625, "learning_rate": 3.6637563846861275e-07, "loss": 0.0782, "reward": 0.5441964529454708, "reward_std": 0.4915049530565739, "rewards/accuracy_reward": 0.118750005424954, "rewards/format_reward": 0.4254464492201805, "step": 20 }, { "epoch": 4.8, "step": 20, "total_flos": 0.0, "train_loss": 0.054626915324479344, "train_runtime": 6323.8598, "train_samples_per_second": 0.499, "train_steps_per_second": 0.004 } ], "logging_steps": 5, "max_steps": 25, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }