{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9893390191897654, "eval_steps": 500, "global_step": 58, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.017057569296375266, "grad_norm": 0.09009335935115814, "kl": 0.0, "learning_rate": 5e-07, "loss": 0.0, "reward": 0.2321428693830967, "reward_std": 0.3139922395348549, "rewards/accuracy_reward": 0.2321428693830967, "rewards/format_reward": 0.0, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.08528784648187633, "grad_norm": 0.07332542538642883, "kl": 9.924173355102539e-05, "learning_rate": 2.5e-06, "loss": 0.0, "reward": 0.22935269074514508, "reward_std": 0.30691308877430856, "rewards/accuracy_reward": 0.22935269074514508, "rewards/format_reward": 0.0, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.17057569296375266, "grad_norm": 0.0727960467338562, "kl": 0.0027547836303710937, "learning_rate": 2.956412726139078e-06, "loss": 0.0001, "reward": 0.4187500193715096, "reward_std": 0.33326763547956945, "rewards/accuracy_reward": 0.4187500193715096, "rewards/format_reward": 0.0, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.255863539445629, "grad_norm": 0.06532283872365952, "kl": 0.17901992797851562, "learning_rate": 2.7836719084521715e-06, "loss": 0.0072, "reward": 0.7390625298023223, "reward_std": 0.17226867079734803, "rewards/accuracy_reward": 0.7390625298023223, "rewards/format_reward": 0.0, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.3411513859275053, "grad_norm": 0.08123523741960526, "kl": 0.020281982421875, "learning_rate": 2.4946839873611927e-06, "loss": 0.0008, "reward": 0.7357143208384513, "reward_std": 0.16036716918461025, "rewards/accuracy_reward": 0.7357143208384513, "rewards/format_reward": 0.0, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.42643923240938164, "grad_norm": 98653831168.0, "kl": 209715200.01299286, "learning_rate": 2.1156192081791355e-06, "loss": 8388608.0, "reward": 0.731919676065445, "reward_std": 0.15736169517040252, "rewards/accuracy_reward": 0.731919676065445, "rewards/format_reward": 0.0, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.511727078891258, "grad_norm": 13029.787109375, "kl": 67.636962890625, "learning_rate": 1.6808050203829845e-06, "loss": 2.7023, "reward": 0.7205357491970062, "reward_std": 0.1578733256086707, "rewards/accuracy_reward": 0.7205357491970062, "rewards/format_reward": 0.0, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.5970149253731343, "grad_norm": 0.39150699973106384, "kl": 2.8228424072265623, "learning_rate": 1.2296174432791415e-06, "loss": 0.1129, "reward": 0.7111607402563095, "reward_std": 0.16545333191752434, "rewards/accuracy_reward": 0.7111607402563095, "rewards/format_reward": 0.0, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.6823027718550106, "grad_norm": 91.16580963134766, "kl": 28502426.75253143, "learning_rate": 8.029152419343472e-07, "loss": 1146119.4, "reward": 0.7203125312924386, "reward_std": 0.16900291871279477, "rewards/accuracy_reward": 0.7203125312924386, "rewards/format_reward": 0.0, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.767590618336887, "grad_norm": 552.0439453125, "kl": 116332.38398895264, "learning_rate": 4.3933982822017883e-07, "loss": 4652.1254, "reward": 0.7176339611411094, "reward_std": 0.1728838672861457, "rewards/accuracy_reward": 0.7176339611411094, "rewards/format_reward": 0.0, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.8528784648187633, "grad_norm": 0.14540241658687592, "kl": 801.6095245361328, "learning_rate": 1.718159615201853e-07, "loss": 32.0004, "reward": 0.7058035969734192, "reward_std": 0.1750278390944004, "rewards/accuracy_reward": 0.7058035969734192, "rewards/format_reward": 0.0, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.9381663113006397, "grad_norm": 4.781561851501465, "kl": 22835.21565475464, "learning_rate": 2.4570139579284723e-08, "loss": 914.4983, "reward": 0.7439732491970062, "reward_std": 0.16849460527300836, "rewards/accuracy_reward": 0.7439732491970062, "rewards/format_reward": 0.0, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 1024.0, "epoch": 0.9893390191897654, "kl": 1372.5967407226562, "reward": 0.7388393208384514, "reward_std": 0.16757233006258807, "rewards/accuracy_reward": 0.7388393208384514, "rewards/format_reward": 0.0, "step": 58, "total_flos": 0.0, "train_loss": 822442.3794969221, "train_runtime": 20401.2733, "train_samples_per_second": 0.368, "train_steps_per_second": 0.003 } ], "logging_steps": 5, "max_steps": 58, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }