{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0012675024260788625, "eval_steps": 500, "global_step": 8, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001584378032598578, "grad_norm": 7.086519326549023e-05, "learning_rate": 2.9998415465061005e-05, "loss": 0.0, "loss/policy_avg": 9.534414857625961e-08, "objective/entropy": 66.7407455444336, "objective/kl": 0.0, "objective/rlhf_reward": 1.9394512176513672, "objective/scores": 1.9393310546875, "policy/approxkl_avg": 0.0, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.501089334487915, "step": 1, "timer/calc_advantages": 1.9189459085464478, "timer/calc_loss": 0.7234929800033569, "timer/get_reward": 0.5044295787811279, "timer/training_step": 5.130897045135498, "val/num_eos_tokens": 0.0, "val/ratio": 1.0, "val/ratio_var": NaN }, { "epoch": 0.0003168756065197156, "grad_norm": 9.20624828338623, "learning_rate": 2.999683093012201e-05, "loss": 0.0082, "loss/policy_avg": 0.008180337026715279, "objective/entropy": 58.23992919921875, "objective/kl": 0.04112038388848305, "objective/rlhf_reward": 2.586242914199829, "objective/scores": 2.59033203125, "policy/approxkl_avg": 0.13751985132694244, "policy/clipfrac_avg": 0.353515625, "policy/entropy_avg": 0.4371030628681183, "step": 2, "timer/calc_advantages": 1.77765953540802, "timer/calc_loss": 0.620478630065918, "timer/get_reward": 0.43380415439605713, "timer/training_step": 4.58284854888916, "val/num_eos_tokens": 0.0, "val/ratio": 1.0005027055740356, "val/ratio_var": NaN }, { "epoch": 0.0004753134097795734, "grad_norm": 11.192896842956543, "learning_rate": 2.9995246395183014e-05, "loss": 0.0083, "loss/policy_avg": 0.008318130858242512, "objective/entropy": 60.747886657714844, "objective/kl": 0.1742033064365387, "objective/rlhf_reward": 2.7990851402282715, "objective/scores": 2.8162841796875, "policy/approxkl_avg": 0.1877279430627823, "policy/clipfrac_avg": 0.345703125, "policy/entropy_avg": 0.47007349133491516, "step": 3, "timer/calc_advantages": 2.159975528717041, "timer/calc_loss": 0.8658402562141418, "timer/get_reward": 0.5989301204681396, "timer/training_step": 5.865015029907227, "val/num_eos_tokens": 0.0, "val/ratio": 0.9998154640197754, "val/ratio_var": NaN }, { "epoch": 0.0006337512130394313, "grad_norm": 10.267037391662598, "learning_rate": 2.999366186024402e-05, "loss": 0.0119, "loss/policy_avg": 0.011887951754033566, "objective/entropy": 59.366294860839844, "objective/kl": 0.12045621126890182, "objective/rlhf_reward": 2.8014278411865234, "objective/scores": 2.8134765625, "policy/approxkl_avg": 0.2340196967124939, "policy/clipfrac_avg": 0.369140625, "policy/entropy_avg": 0.4660375118255615, "step": 4, "timer/calc_advantages": 1.9160572290420532, "timer/calc_loss": 0.8047055602073669, "timer/get_reward": 0.5554770231246948, "timer/training_step": 5.4731550216674805, "val/num_eos_tokens": 0.0, "val/ratio": 1.000678539276123, "val/ratio_var": NaN }, { "epoch": 0.000792189016299289, "grad_norm": 5.658022200805135e-05, "learning_rate": 2.9992077325305024e-05, "loss": 0.0, "loss/policy_avg": 9.505311027169228e-08, "objective/entropy": 63.056705474853516, "objective/kl": 0.0, "objective/rlhf_reward": 1.961860179901123, "objective/scores": 1.961883544921875, "policy/approxkl_avg": 0.0, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.49685803055763245, "step": 5, "timer/calc_advantages": 1.9551451206207275, "timer/calc_loss": 0.7570784091949463, "timer/get_reward": 0.5253068208694458, "timer/training_step": 5.274328231811523, "val/num_eos_tokens": 0.0, "val/ratio": 1.0, "val/ratio_var": NaN }, { "epoch": 0.0009506268195591468, "grad_norm": 5.630626201629639, "learning_rate": 2.9990492790366028e-05, "loss": 0.0036, "loss/policy_avg": 0.0035882075317204, "objective/entropy": 64.0039291381836, "objective/kl": -0.06272067129611969, "objective/rlhf_reward": 2.5381531715393066, "objective/scores": 2.53204345703125, "policy/approxkl_avg": 0.07097644358873367, "policy/clipfrac_avg": 0.244140625, "policy/entropy_avg": 0.49359244108200073, "step": 6, "timer/calc_advantages": 1.9771534204483032, "timer/calc_loss": 0.7805941700935364, "timer/get_reward": 0.5281075835227966, "timer/training_step": 5.320864677429199, "val/num_eos_tokens": 0.0, "val/ratio": 1.000971794128418, "val/ratio_var": NaN }, { "epoch": 0.0011090646228190046, "grad_norm": 7.572779178619385, "learning_rate": 2.9988908255427033e-05, "loss": 0.0058, "loss/policy_avg": 0.005815813317894936, "objective/entropy": 69.00190734863281, "objective/kl": 0.07886971533298492, "objective/rlhf_reward": 3.3839988708496094, "objective/scores": 3.39208984375, "policy/approxkl_avg": 0.1876513808965683, "policy/clipfrac_avg": 0.34765625, "policy/entropy_avg": 0.520235538482666, "step": 7, "timer/calc_advantages": 2.296631336212158, "timer/calc_loss": 0.9676017761230469, "timer/get_reward": 0.6649996042251587, "timer/training_step": 6.399840831756592, "val/num_eos_tokens": 0.0, "val/ratio": 1.000232458114624, "val/ratio_var": NaN }, { "epoch": 0.0012675024260788625, "grad_norm": 11.984278678894043, "learning_rate": 2.9987323720488037e-05, "loss": 0.0093, "loss/policy_avg": 0.009322328492999077, "objective/entropy": 66.09063720703125, "objective/kl": 0.015054229646921158, "objective/rlhf_reward": 2.489471673965454, "objective/scores": 2.491485595703125, "policy/approxkl_avg": 0.18275578320026398, "policy/clipfrac_avg": 0.3046875, "policy/entropy_avg": 0.5070106387138367, "step": 8, "timer/calc_advantages": 2.2399752140045166, "timer/calc_loss": 1.0477569103240967, "timer/get_reward": 0.7243468761444092, "timer/training_step": 6.686029434204102, "val/num_eos_tokens": 0.0, "val/ratio": 1.0009795427322388, "val/ratio_var": NaN } ], "logging_steps": 1, "max_steps": 18933, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 8, "total_flos": 4025485044940800.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }