{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9893390191897654, "eval_steps": 500, "global_step": 58, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 1023.3348236083984, "epoch": 0.017057569296375266, "grad_norm": 295.6246337890625, "kl": 0.0, "learning_rate": 5e-07, "loss": 0.0007, "reward": 1.3604911267757416, "reward_std": 0.43993261456489563, "rewards/accuracy_reward": 0.4196428768336773, "rewards/format_reward": 0.9408482611179352, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 1023.4154644012451, "epoch": 0.08528784648187633, "grad_norm": 0.2173105925321579, "kl": 0.0015696585178375244, "learning_rate": 2.5e-06, "loss": 0.0003, "reward": 1.36104916036129, "reward_std": 0.4599178032949567, "rewards/accuracy_reward": 0.4065290354192257, "rewards/format_reward": 0.9545201268047094, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 1023.4203186035156, "epoch": 0.17057569296375266, "grad_norm": 0.3614032566547394, "kl": 80.7047048330307, "learning_rate": 2.956412726139078e-06, "loss": 3.2442, "reward": 1.4700893461704254, "reward_std": 0.42430560290813446, "rewards/accuracy_reward": 0.5145089522004127, "rewards/format_reward": 0.9555803969502449, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 1023.5118347167969, "epoch": 0.255863539445629, "grad_norm": 5230.3447265625, "kl": 0.45456657409667967, "learning_rate": 2.7836719084521715e-06, "loss": 0.0182, "reward": 1.5975447058677674, "reward_std": 0.3535365674644709, "rewards/accuracy_reward": 0.6444196701049805, "rewards/format_reward": 0.9531250447034836, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 1023.7593780517578, "epoch": 0.3411513859275053, "grad_norm": 0.2595553398132324, "kl": 3.4332809448242188, "learning_rate": 2.4946839873611927e-06, "loss": 0.1369, "reward": 1.663169714808464, "reward_std": 0.30231964513659476, "rewards/accuracy_reward": 0.7095982447266579, "rewards/format_reward": 0.9535714656114578, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 1023.8439743041993, "epoch": 0.42643923240938164, "grad_norm": 0.1941014677286148, "kl": 0.010175323486328125, "learning_rate": 2.1156192081791355e-06, "loss": 0.0004, "reward": 1.632812574505806, "reward_std": 0.3012897618114948, "rewards/accuracy_reward": 0.6939732454717159, "rewards/format_reward": 0.9388393223285675, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 1023.8616088867187, "epoch": 0.511727078891258, "grad_norm": 0.18191584944725037, "kl": 0.021715545654296876, "learning_rate": 1.6808050203829845e-06, "loss": 0.0008, "reward": 1.6042411476373672, "reward_std": 0.3133077774196863, "rewards/accuracy_reward": 0.6660714574158192, "rewards/format_reward": 0.9381696850061416, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 1023.3654098510742, "epoch": 0.5970149253731343, "grad_norm": 0.32974478602409363, "kl": 0.02928009033203125, "learning_rate": 1.2296174432791415e-06, "loss": 0.001, "reward": 1.5727679282426834, "reward_std": 0.3634680099785328, "rewards/accuracy_reward": 0.6535714581608772, "rewards/format_reward": 0.9191964700818062, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 1023.0618392944336, "epoch": 0.6823027718550106, "grad_norm": 0.38136547803878784, "kl": 0.0487884521484375, "learning_rate": 8.029152419343472e-07, "loss": 0.0015, "reward": 1.5879464954137803, "reward_std": 0.3696904189884663, "rewards/accuracy_reward": 0.6725446730852127, "rewards/format_reward": 0.9154018253087998, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 1021.6725723266602, "epoch": 0.767590618336887, "grad_norm": 0.5596392750740051, "kl": 0.0637298583984375, "learning_rate": 4.3933982822017883e-07, "loss": 0.0009, "reward": 1.5805804252624511, "reward_std": 0.3978343676775694, "rewards/accuracy_reward": 0.6763393178582191, "rewards/format_reward": 0.9042411163449288, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 1020.4614181518555, "epoch": 0.8528784648187633, "grad_norm": 0.46670985221862793, "kl": 0.07415771484375, "learning_rate": 1.718159615201853e-07, "loss": 0.0019, "reward": 1.5774554371833802, "reward_std": 0.40023380927741525, "rewards/accuracy_reward": 0.6667411029338837, "rewards/format_reward": 0.9107143297791481, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 1020.0493682861328, "epoch": 0.9381663113006397, "grad_norm": 0.28767386078834534, "kl": 0.143035888671875, "learning_rate": 2.4570139579284723e-08, "loss": 0.0027, "reward": 1.6111607879400254, "reward_std": 0.3957536414265633, "rewards/accuracy_reward": 0.6975446730852127, "rewards/format_reward": 0.9136161133646965, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 1021.3299789428711, "epoch": 0.9893390191897654, "kl": 0.07426961263020833, "reward": 1.6130953133106232, "reward_std": 0.3950556789835294, "rewards/accuracy_reward": 0.691592293481032, "rewards/format_reward": 0.9215030198295912, "step": 58, "total_flos": 0.0, "train_loss": 0.294493970670938, "train_runtime": 20554.7381, "train_samples_per_second": 0.365, "train_steps_per_second": 0.003 } ], "logging_steps": 5, "max_steps": 58, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }