{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0012675024260788625, "eval_steps": 500, "global_step": 8, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001584378032598578, "grad_norm": 6.941703031770885e-05, "learning_rate": 2.9998415465061005e-05, "loss": 0.0, "loss/policy_avg": 9.534414857625961e-08, "objective/entropy": 66.7407455444336, "objective/kl": 0.0, "objective/rlhf_reward": 1.9394512176513672, "objective/scores": 1.9393310546875, "policy/approxkl_avg": 0.0, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.501089334487915, "step": 1, "timer/calc_advantages": 1.838552713394165, "timer/calc_loss": 0.718826174736023, "timer/get_reward": 0.5057681202888489, "timer/training_step": 5.023805141448975, "val/num_eos_tokens": 0.0, "val/ratio": 1.0, "val/ratio_var": NaN }, { "epoch": 0.0003168756065197156, "grad_norm": 8.962108612060547, "learning_rate": 2.999683093012201e-05, "loss": 0.0079, "loss/policy_avg": 0.007912077941000462, "objective/entropy": 58.23992919921875, "objective/kl": 0.006753697991371155, "objective/rlhf_reward": 2.58967924118042, "objective/scores": 2.59033203125, "policy/approxkl_avg": 0.143972247838974, "policy/clipfrac_avg": 0.376953125, "policy/entropy_avg": 0.437211811542511, "step": 2, "timer/calc_advantages": 1.693800687789917, "timer/calc_loss": 0.6173452734947205, "timer/get_reward": 0.431838721036911, "timer/training_step": 4.488475322723389, "val/num_eos_tokens": 0.0, "val/ratio": 1.0006933212280273, "val/ratio_var": NaN }, { "epoch": 0.0004753134097795734, "grad_norm": 8.80933666229248, "learning_rate": 2.9995246395183014e-05, "loss": 0.0087, "loss/policy_avg": 0.008671796880662441, "objective/entropy": 60.747886657714844, "objective/kl": 0.19268979132175446, "objective/rlhf_reward": 2.797236442565918, "objective/scores": 2.8162841796875, "policy/approxkl_avg": 0.22207684814929962, "policy/clipfrac_avg": 0.369140625, "policy/entropy_avg": 0.47127196192741394, "step": 3, "timer/calc_advantages": 2.117703437805176, "timer/calc_loss": 0.8592336773872375, "timer/get_reward": 0.5989187955856323, "timer/training_step": 5.804616928100586, "val/num_eos_tokens": 0.0, "val/ratio": 0.9997344613075256, "val/ratio_var": NaN }, { "epoch": 0.0006337512130394313, "grad_norm": 15.397004127502441, "learning_rate": 2.999366186024402e-05, "loss": 0.0181, "loss/policy_avg": 0.018090050667524338, "objective/entropy": 59.366294860839844, "objective/kl": 0.16402865946292877, "objective/rlhf_reward": 2.7970707416534424, "objective/scores": 2.8134765625, "policy/approxkl_avg": 0.30915290117263794, "policy/clipfrac_avg": 0.41796875, "policy/entropy_avg": 0.4661800265312195, "step": 4, "timer/calc_advantages": 2.018901824951172, "timer/calc_loss": 0.8049512505531311, "timer/get_reward": 0.5547392964363098, "timer/training_step": 5.506021022796631, "val/num_eos_tokens": 0.0, "val/ratio": 1.0003275871276855, "val/ratio_var": NaN }, { "epoch": 0.000792189016299289, "grad_norm": 6.137847231002524e-05, "learning_rate": 2.9992077325305024e-05, "loss": 0.0, "loss/policy_avg": 1.0040821507573128e-07, "objective/entropy": 61.850738525390625, "objective/kl": 0.0, "objective/rlhf_reward": 2.3658785820007324, "objective/scores": 2.36578369140625, "policy/approxkl_avg": 0.0, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.48403242230415344, "step": 5, "timer/calc_advantages": 1.889434576034546, "timer/calc_loss": 0.7489800453186035, "timer/get_reward": 0.5218558311462402, "timer/training_step": 5.19569206237793, "val/num_eos_tokens": 0.0, "val/ratio": 1.0, "val/ratio_var": NaN }, { "epoch": 0.0009506268195591468, "grad_norm": 3.129560947418213, "learning_rate": 2.9990492790366028e-05, "loss": 0.002, "loss/policy_avg": 0.002019597217440605, "objective/entropy": 64.70206451416016, "objective/kl": -0.011846143752336502, "objective/rlhf_reward": 2.4429714679718018, "objective/scores": 2.44146728515625, "policy/approxkl_avg": 0.05796036496758461, "policy/clipfrac_avg": 0.224609375, "policy/entropy_avg": 0.4975966215133667, "step": 6, "timer/calc_advantages": 1.939255714416504, "timer/calc_loss": 0.7629643082618713, "timer/get_reward": 0.5286913514137268, "timer/training_step": 5.261943340301514, "val/num_eos_tokens": 0.0, "val/ratio": 1.0005271434783936, "val/ratio_var": NaN }, { "epoch": 0.0011090646228190046, "grad_norm": 4.28013277053833, "learning_rate": 2.9988908255427033e-05, "loss": 0.0037, "loss/policy_avg": 0.0036908092442899942, "objective/entropy": 68.02294158935547, "objective/kl": 0.15080219507217407, "objective/rlhf_reward": 3.149240016937256, "objective/scores": 3.1646728515625, "policy/approxkl_avg": 0.11497651040554047, "policy/clipfrac_avg": 0.25, "policy/entropy_avg": 0.5161693692207336, "step": 7, "timer/calc_advantages": 2.2946014404296875, "timer/calc_loss": 0.96665358543396, "timer/get_reward": 0.6638086438179016, "timer/training_step": 6.369439125061035, "val/num_eos_tokens": 0.0, "val/ratio": 0.99953293800354, "val/ratio_var": NaN }, { "epoch": 0.0012675024260788625, "grad_norm": 8.370040893554688, "learning_rate": 2.9987323720488037e-05, "loss": 0.0078, "loss/policy_avg": 0.00783943198621273, "objective/entropy": 66.24089813232422, "objective/kl": 0.0030039921402931213, "objective/rlhf_reward": 3.1140072345733643, "objective/scores": 3.1142501831054688, "policy/approxkl_avg": 0.13499276340007782, "policy/clipfrac_avg": 0.345703125, "policy/entropy_avg": 0.4966353476047516, "step": 8, "timer/calc_advantages": 2.4299309253692627, "timer/calc_loss": 1.046256422996521, "timer/get_reward": 0.7162653207778931, "timer/training_step": 6.773098945617676, "val/num_eos_tokens": 0.0, "val/ratio": 1.0011930465698242, "val/ratio_var": NaN } ], "logging_steps": 1, "max_steps": 18933, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 8, "total_flos": 4025485044940800.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }