{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 11.971223021582734,
  "eval_steps": 25,
  "global_step": 468,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.64,
      "grad_norm": 1.127359124604765,
      "learning_rate": 9.999874838141888e-05,
      "loss": 0.2903,
      "step": 25
    },
    {
      "epoch": 0.64,
      "eval_loss": 0.17698872089385986,
      "eval_runtime": 109.7117,
      "eval_samples_per_second": 91.157,
      "eval_steps_per_second": 1.431,
      "step": 25
    },
    {
      "epoch": 1.28,
      "grad_norm": 0.27062951121760287,
      "learning_rate": 9.915628588978522e-05,
      "loss": 0.1566,
      "step": 50
    },
    {
      "epoch": 1.28,
      "eval_loss": 0.1319342851638794,
      "eval_runtime": 109.0269,
      "eval_samples_per_second": 91.73,
      "eval_steps_per_second": 1.44,
      "step": 50
    },
    {
      "epoch": 1.92,
      "grad_norm": 0.24204285961516211,
      "learning_rate": 9.67797005288181e-05,
      "loss": 0.1379,
      "step": 75
    },
    {
      "epoch": 1.92,
      "eval_loss": 0.12531398236751556,
      "eval_runtime": 108.9721,
      "eval_samples_per_second": 91.776,
      "eval_steps_per_second": 1.441,
      "step": 75
    },
    {
      "epoch": 2.56,
      "grad_norm": 0.17190878979127733,
      "learning_rate": 9.294316336102132e-05,
      "loss": 0.1246,
      "step": 100
    },
    {
      "epoch": 2.56,
      "eval_loss": 0.11651688814163208,
      "eval_runtime": 109.0854,
      "eval_samples_per_second": 91.68,
      "eval_steps_per_second": 1.439,
      "step": 100
    },
    {
      "epoch": 3.2,
      "grad_norm": 0.19157295828856058,
      "learning_rate": 8.776640921382584e-05,
      "loss": 0.1159,
      "step": 125
    },
    {
      "epoch": 3.2,
      "eval_loss": 0.10494253039360046,
      "eval_runtime": 109.434,
      "eval_samples_per_second": 91.388,
      "eval_steps_per_second": 1.435,
      "step": 125
    },
    {
      "epoch": 3.84,
      "grad_norm": 0.13204284591576954,
      "learning_rate": 8.141099986478212e-05,
      "loss": 0.1048,
      "step": 150
    },
    {
      "epoch": 3.84,
      "eval_loss": 0.09983079880475998,
      "eval_runtime": 109.7612,
      "eval_samples_per_second": 91.116,
      "eval_steps_per_second": 1.43,
      "step": 150
    },
    {
      "epoch": 4.48,
      "grad_norm": 0.17932878424857554,
      "learning_rate": 7.407528184577019e-05,
      "loss": 0.0947,
      "step": 175
    },
    {
      "epoch": 4.48,
      "eval_loss": 0.09489566832780838,
      "eval_runtime": 108.9729,
      "eval_samples_per_second": 91.775,
      "eval_steps_per_second": 1.441,
      "step": 175
    },
    {
      "epoch": 5.12,
      "grad_norm": 0.10314936548781714,
      "learning_rate": 6.598819622856227e-05,
      "loss": 0.0872,
      "step": 200
    },
    {
      "epoch": 5.12,
      "eval_loss": 0.09059683978557587,
      "eval_runtime": 109.4369,
      "eval_samples_per_second": 91.386,
      "eval_steps_per_second": 1.435,
      "step": 200
    },
    {
      "epoch": 5.76,
      "grad_norm": 0.09896949138120692,
      "learning_rate": 5.7402133582686576e-05,
      "loss": 0.0836,
      "step": 225
    },
    {
      "epoch": 5.76,
      "eval_loss": 0.08898729830980301,
      "eval_runtime": 109.6712,
      "eval_samples_per_second": 91.191,
      "eval_steps_per_second": 1.432,
      "step": 225
    },
    {
      "epoch": 6.39,
      "grad_norm": 0.09411679948573648,
      "learning_rate": 4.85850570958441e-05,
      "loss": 0.0774,
      "step": 250
    },
    {
      "epoch": 6.39,
      "eval_loss": 0.08496005833148956,
      "eval_runtime": 108.989,
      "eval_samples_per_second": 91.762,
      "eval_steps_per_second": 1.441,
      "step": 250
    },
    {
      "epoch": 7.03,
      "grad_norm": 0.0741392458596649,
      "learning_rate": 3.9812139687108815e-05,
      "loss": 0.0717,
      "step": 275
    },
    {
      "epoch": 7.03,
      "eval_loss": 0.08271028101444244,
      "eval_runtime": 108.611,
      "eval_samples_per_second": 92.081,
      "eval_steps_per_second": 1.446,
      "step": 275
    },
    {
      "epoch": 7.67,
      "grad_norm": 0.08461241295692067,
      "learning_rate": 3.135717611098458e-05,
      "loss": 0.0639,
      "step": 300
    },
    {
      "epoch": 7.67,
      "eval_loss": 0.08073248714208603,
      "eval_runtime": 109.2638,
      "eval_samples_per_second": 91.531,
      "eval_steps_per_second": 1.437,
      "step": 300
    },
    {
      "epoch": 8.31,
      "grad_norm": 0.10663906934510009,
      "learning_rate": 2.3484038072721758e-05,
      "loss": 0.0596,
      "step": 325
    },
    {
      "epoch": 8.31,
      "eval_loss": 0.07886555045843124,
      "eval_runtime": 109.6916,
      "eval_samples_per_second": 91.174,
      "eval_steps_per_second": 1.431,
      "step": 325
    },
    {
      "epoch": 8.95,
      "grad_norm": 0.09636182626655444,
      "learning_rate": 1.6438439032954855e-05,
      "loss": 0.0555,
      "step": 350
    },
    {
      "epoch": 8.95,
      "eval_loss": 0.0773012638092041,
      "eval_runtime": 109.6624,
      "eval_samples_per_second": 91.198,
      "eval_steps_per_second": 1.432,
      "step": 350
    },
    {
      "epoch": 9.59,
      "grad_norm": 0.05049345737531785,
      "learning_rate": 1.0440265714600572e-05,
      "loss": 0.0498,
      "step": 375
    },
    {
      "epoch": 9.59,
      "eval_loss": 0.07774835079908371,
      "eval_runtime": 109.5603,
      "eval_samples_per_second": 91.283,
      "eval_steps_per_second": 1.433,
      "step": 375
    },
    {
      "epoch": 10.23,
      "grad_norm": 0.0430056992457794,
      "learning_rate": 5.676715638695063e-06,
      "loss": 0.0491,
      "step": 400
    },
    {
      "epoch": 10.23,
      "eval_loss": 0.07812906801700592,
      "eval_runtime": 109.8294,
      "eval_samples_per_second": 91.059,
      "eval_steps_per_second": 1.429,
      "step": 400
    },
    {
      "epoch": 10.87,
      "grad_norm": 0.040355667582564335,
      "learning_rate": 2.2964548604209213e-06,
      "loss": 0.0467,
      "step": 425
    },
    {
      "epoch": 10.87,
      "eval_loss": 0.07796623557806015,
      "eval_runtime": 109.5541,
      "eval_samples_per_second": 91.288,
      "eval_steps_per_second": 1.433,
      "step": 425
    },
    {
      "epoch": 11.51,
      "grad_norm": 0.041530575942009804,
      "learning_rate": 4.049782370561583e-07,
      "loss": 0.0459,
      "step": 450
    },
    {
      "epoch": 11.51,
      "eval_loss": 0.07810305058956146,
      "eval_runtime": 108.746,
      "eval_samples_per_second": 91.967,
      "eval_steps_per_second": 1.444,
      "step": 450
    },
    {
      "epoch": 11.97,
      "step": 468,
      "total_flos": 1.0672624631808e+16,
      "train_loss": 0.0933543870336989,
      "train_runtime": 39555.7956,
      "train_samples_per_second": 24.271,
      "train_steps_per_second": 0.012
    }
  ],
  "logging_steps": 25,
  "max_steps": 468,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 12,
  "save_steps": 2000,
  "total_flos": 1.0672624631808e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}