{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.002535004852157725, "eval_steps": 500, "global_step": 16, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001584378032598578, "grad_norm": 7.086519326549023e-05, "learning_rate": 2.9998415465061005e-05, "loss": 0.0, "loss/policy_avg": 9.534414857625961e-08, "objective/entropy": 66.7407455444336, "objective/kl": 0.0, "objective/rlhf_reward": 1.9394512176513672, "objective/scores": 1.9393310546875, "policy/approxkl_avg": 0.0, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.501089334487915, "step": 1, "timer/calc_advantages": 1.9189459085464478, "timer/calc_loss": 0.7234929800033569, "timer/get_reward": 0.5044295787811279, "timer/training_step": 5.130897045135498, "val/num_eos_tokens": 0.0, "val/ratio": 1.0, "val/ratio_var": NaN }, { "epoch": 0.0003168756065197156, "grad_norm": 9.20624828338623, "learning_rate": 2.999683093012201e-05, "loss": 0.0082, "loss/policy_avg": 0.008180337026715279, "objective/entropy": 58.23992919921875, "objective/kl": 0.04112038388848305, "objective/rlhf_reward": 2.586242914199829, "objective/scores": 2.59033203125, "policy/approxkl_avg": 0.13751985132694244, "policy/clipfrac_avg": 0.353515625, "policy/entropy_avg": 0.4371030628681183, "step": 2, "timer/calc_advantages": 1.77765953540802, "timer/calc_loss": 0.620478630065918, "timer/get_reward": 0.43380415439605713, "timer/training_step": 4.58284854888916, "val/num_eos_tokens": 0.0, "val/ratio": 1.0005027055740356, "val/ratio_var": NaN }, { "epoch": 0.0004753134097795734, "grad_norm": 11.192896842956543, "learning_rate": 2.9995246395183014e-05, "loss": 0.0083, "loss/policy_avg": 0.008318130858242512, "objective/entropy": 60.747886657714844, "objective/kl": 0.1742033064365387, "objective/rlhf_reward": 2.7990851402282715, "objective/scores": 2.8162841796875, "policy/approxkl_avg": 0.1877279430627823, "policy/clipfrac_avg": 0.345703125, "policy/entropy_avg": 0.47007349133491516, "step": 3, "timer/calc_advantages": 2.159975528717041, "timer/calc_loss": 0.8658402562141418, "timer/get_reward": 0.5989301204681396, "timer/training_step": 5.865015029907227, "val/num_eos_tokens": 0.0, "val/ratio": 0.9998154640197754, "val/ratio_var": NaN }, { "epoch": 0.0006337512130394313, "grad_norm": 10.267037391662598, "learning_rate": 2.999366186024402e-05, "loss": 0.0119, "loss/policy_avg": 0.011887951754033566, "objective/entropy": 59.366294860839844, "objective/kl": 0.12045621126890182, "objective/rlhf_reward": 2.8014278411865234, "objective/scores": 2.8134765625, "policy/approxkl_avg": 0.2340196967124939, "policy/clipfrac_avg": 0.369140625, "policy/entropy_avg": 0.4660375118255615, "step": 4, "timer/calc_advantages": 1.9160572290420532, "timer/calc_loss": 0.8047055602073669, "timer/get_reward": 0.5554770231246948, "timer/training_step": 5.4731550216674805, "val/num_eos_tokens": 0.0, "val/ratio": 1.000678539276123, "val/ratio_var": NaN }, { "epoch": 0.000792189016299289, "grad_norm": 5.658022200805135e-05, "learning_rate": 2.9992077325305024e-05, "loss": 0.0, "loss/policy_avg": 9.505311027169228e-08, "objective/entropy": 63.056705474853516, "objective/kl": 0.0, "objective/rlhf_reward": 1.961860179901123, "objective/scores": 1.961883544921875, "policy/approxkl_avg": 0.0, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.49685803055763245, "step": 5, "timer/calc_advantages": 1.9551451206207275, "timer/calc_loss": 0.7570784091949463, "timer/get_reward": 0.5253068208694458, "timer/training_step": 5.274328231811523, "val/num_eos_tokens": 0.0, "val/ratio": 1.0, "val/ratio_var": NaN }, { "epoch": 0.0009506268195591468, "grad_norm": 5.630626201629639, "learning_rate": 2.9990492790366028e-05, "loss": 0.0036, "loss/policy_avg": 0.0035882075317204, "objective/entropy": 64.0039291381836, "objective/kl": -0.06272067129611969, "objective/rlhf_reward": 2.5381531715393066, "objective/scores": 2.53204345703125, "policy/approxkl_avg": 0.07097644358873367, "policy/clipfrac_avg": 0.244140625, "policy/entropy_avg": 0.49359244108200073, "step": 6, "timer/calc_advantages": 1.9771534204483032, "timer/calc_loss": 0.7805941700935364, "timer/get_reward": 0.5281075835227966, "timer/training_step": 5.320864677429199, "val/num_eos_tokens": 0.0, "val/ratio": 1.000971794128418, "val/ratio_var": NaN }, { "epoch": 0.0011090646228190046, "grad_norm": 7.572779178619385, "learning_rate": 2.9988908255427033e-05, "loss": 0.0058, "loss/policy_avg": 0.005815813317894936, "objective/entropy": 69.00190734863281, "objective/kl": 0.07886971533298492, "objective/rlhf_reward": 3.3839988708496094, "objective/scores": 3.39208984375, "policy/approxkl_avg": 0.1876513808965683, "policy/clipfrac_avg": 0.34765625, "policy/entropy_avg": 0.520235538482666, "step": 7, "timer/calc_advantages": 2.296631336212158, "timer/calc_loss": 0.9676017761230469, "timer/get_reward": 0.6649996042251587, "timer/training_step": 6.399840831756592, "val/num_eos_tokens": 0.0, "val/ratio": 1.000232458114624, "val/ratio_var": NaN }, { "epoch": 0.0012675024260788625, "grad_norm": 11.984278678894043, "learning_rate": 2.9987323720488037e-05, "loss": 0.0093, "loss/policy_avg": 0.009322328492999077, "objective/entropy": 66.09063720703125, "objective/kl": 0.015054229646921158, "objective/rlhf_reward": 2.489471673965454, "objective/scores": 2.491485595703125, "policy/approxkl_avg": 0.18275578320026398, "policy/clipfrac_avg": 0.3046875, "policy/entropy_avg": 0.5070106387138367, "step": 8, "timer/calc_advantages": 2.2399752140045166, "timer/calc_loss": 1.0477569103240967, "timer/get_reward": 0.7243468761444092, "timer/training_step": 6.686029434204102, "val/num_eos_tokens": 0.0, "val/ratio": 1.0009795427322388, "val/ratio_var": NaN }, { "epoch": 0.0014259402293387202, "grad_norm": 7.540817750850692e-05, "learning_rate": 2.9985739185549042e-05, "loss": 0.0, "loss/policy_avg": 1.367880031466484e-07, "objective/entropy": 58.587005615234375, "objective/kl": 0.0, "objective/rlhf_reward": 2.611638069152832, "objective/scores": 2.61175537109375, "policy/approxkl_avg": 0.0, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.42249488830566406, "step": 9, "timer/calc_advantages": 2.2680113315582275, "timer/calc_loss": 0.9395630359649658, "timer/get_reward": 0.6489749550819397, "timer/training_step": 6.220600128173828, "val/num_eos_tokens": 0.0, "val/ratio": 1.0, "val/ratio_var": NaN }, { "epoch": 0.001584378032598578, "grad_norm": 1.935127854347229, "learning_rate": 2.9984154650610046e-05, "loss": 0.0007, "loss/policy_avg": 0.0006855675601400435, "objective/entropy": 55.21089172363281, "objective/kl": 0.0106657724827528, "objective/rlhf_reward": 2.033531427383423, "objective/scores": 2.0347900390625, "policy/approxkl_avg": 0.02970672771334648, "policy/clipfrac_avg": 0.197265625, "policy/entropy_avg": 0.4399160146713257, "step": 10, "timer/calc_advantages": 1.9682772159576416, "timer/calc_loss": 0.7604256272315979, "timer/get_reward": 0.5244199633598328, "timer/training_step": 5.279101371765137, "val/num_eos_tokens": 0.0, "val/ratio": 1.0002206563949585, "val/ratio_var": NaN }, { "epoch": 0.0017428158358584358, "grad_norm": 2.560812473297119, "learning_rate": 2.998257011567105e-05, "loss": 0.0011, "loss/policy_avg": 0.0011356198228895664, "objective/entropy": 64.66464233398438, "objective/kl": 0.009169694036245346, "objective/rlhf_reward": 2.824911594390869, "objective/scores": 2.82574462890625, "policy/approxkl_avg": 0.03749970346689224, "policy/clipfrac_avg": 0.21484375, "policy/entropy_avg": 0.5036777257919312, "step": 11, "timer/calc_advantages": 2.0788302421569824, "timer/calc_loss": 0.8143972754478455, "timer/get_reward": 0.5581433176994324, "timer/training_step": 5.590597629547119, "val/num_eos_tokens": 0.0, "val/ratio": 1.000451683998108, "val/ratio_var": NaN }, { "epoch": 0.0019012536391182935, "grad_norm": 4.297569751739502, "learning_rate": 2.9980985580732055e-05, "loss": 0.0034, "loss/policy_avg": 0.003396428655833006, "objective/entropy": 69.60541534423828, "objective/kl": 0.10166570544242859, "objective/rlhf_reward": 1.655887246131897, "objective/scores": 1.6659393310546875, "policy/approxkl_avg": 0.09505846351385117, "policy/clipfrac_avg": 0.318359375, "policy/entropy_avg": 0.534148633480072, "step": 12, "timer/calc_advantages": 1.8084050416946411, "timer/calc_loss": 0.6441062092781067, "timer/get_reward": 0.44651317596435547, "timer/training_step": 4.796171188354492, "val/num_eos_tokens": 0.0, "val/ratio": 0.9999833106994629, "val/ratio_var": NaN }, { "epoch": 0.0020596914423781513, "grad_norm": 7.657416426809505e-05, "learning_rate": 2.997940104579306e-05, "loss": 0.0, "loss/policy_avg": 1.4971010386943817e-07, "objective/entropy": 58.742462158203125, "objective/kl": 0.0, "objective/rlhf_reward": 3.092315673828125, "objective/scores": 3.0924072265625, "policy/approxkl_avg": 0.0, "policy/clipfrac_avg": 0.0, "policy/entropy_avg": 0.4520529806613922, "step": 13, "timer/calc_advantages": 2.120788335800171, "timer/calc_loss": 0.8547518849372864, "timer/get_reward": 0.5879230499267578, "timer/training_step": 5.762821197509766, "val/num_eos_tokens": 0.0, "val/ratio": 1.0, "val/ratio_var": NaN }, { "epoch": 0.002218129245638009, "grad_norm": 2.661646604537964, "learning_rate": 2.9977816510854064e-05, "loss": 0.001, "loss/policy_avg": 0.001029975712299347, "objective/entropy": 68.30726623535156, "objective/kl": 0.03951136767864227, "objective/rlhf_reward": 2.7953004837036133, "objective/scores": 2.7994384765625, "policy/approxkl_avg": 0.040699079632759094, "policy/clipfrac_avg": 0.2265625, "policy/entropy_avg": 0.5269634127616882, "step": 14, "timer/calc_advantages": 1.9552991390228271, "timer/calc_loss": 0.7335948944091797, "timer/get_reward": 0.5061210989952087, "timer/training_step": 5.143916130065918, "val/num_eos_tokens": 0.0, "val/ratio": 1.000035047531128, "val/ratio_var": NaN }, { "epoch": 0.002376567048897867, "grad_norm": 3.1387367248535156, "learning_rate": 2.997623197591507e-05, "loss": 0.0025, "loss/policy_avg": 0.00247659208253026, "objective/entropy": 56.77816390991211, "objective/kl": 0.006408168934285641, "objective/rlhf_reward": 2.673807144165039, "objective/scores": 2.6744384765625, "policy/approxkl_avg": 0.06710580736398697, "policy/clipfrac_avg": 0.283203125, "policy/entropy_avg": 0.44521695375442505, "step": 15, "timer/calc_advantages": 2.5635008811950684, "timer/calc_loss": 1.1095852851867676, "timer/get_reward": 0.7531031966209412, "timer/training_step": 7.173662185668945, "val/num_eos_tokens": 0.0, "val/ratio": 1.0003986358642578, "val/ratio_var": NaN }, { "epoch": 0.002535004852157725, "grad_norm": 3.675922155380249, "learning_rate": 2.9974647440976074e-05, "loss": 0.0035, "loss/policy_avg": 0.0034786476753652096, "objective/entropy": 55.65203094482422, "objective/kl": 0.0999540314078331, "objective/rlhf_reward": 2.479569435119629, "objective/scores": 2.489501953125, "policy/approxkl_avg": 0.10638778656721115, "policy/clipfrac_avg": 0.359375, "policy/entropy_avg": 0.4279758334159851, "step": 16, "timer/calc_advantages": 2.1651806831359863, "timer/calc_loss": 0.8663696646690369, "timer/get_reward": 0.5983204245567322, "timer/training_step": 5.955389022827148, "val/num_eos_tokens": 0.0, "val/ratio": 0.9999206066131592, "val/ratio_var": NaN } ], "logging_steps": 1, "max_steps": 18933, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 8, "total_flos": 8202719718014976.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }