{ "best_global_step": 2500, "best_metric": 0.7841161489486694, "best_model_checkpoint": "./llama2-m2/checkpoint-2500", "epoch": 2.997022036926742, "eval_steps": 100, "global_step": 2517, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05955926146515783, "grad_norm": 11.590325355529785, "learning_rate": 4.9000000000000005e-06, "loss": 3.0703, "step": 50 }, { "epoch": 0.11911852293031566, "grad_norm": 6.117842197418213, "learning_rate": 9.9e-06, "loss": 2.3868, "step": 100 }, { "epoch": 0.11911852293031566, "eval_loss": 1.5943528413772583, "eval_runtime": 86.9995, "eval_samples_per_second": 8.081, "eval_steps_per_second": 2.023, "step": 100 }, { "epoch": 0.1786777843954735, "grad_norm": 0.4276258051395416, "learning_rate": 9.797269342159703e-06, "loss": 1.1152, "step": 150 }, { "epoch": 0.23823704586063132, "grad_norm": 0.34813204407691956, "learning_rate": 9.590401323955318e-06, "loss": 0.9458, "step": 200 }, { "epoch": 0.23823704586063132, "eval_loss": 0.9201429486274719, "eval_runtime": 86.8761, "eval_samples_per_second": 8.092, "eval_steps_per_second": 2.026, "step": 200 }, { "epoch": 0.29779630732578916, "grad_norm": 0.31664666533470154, "learning_rate": 9.383533305750931e-06, "loss": 0.8754, "step": 250 }, { "epoch": 0.357355568790947, "grad_norm": 0.3039833903312683, "learning_rate": 9.176665287546546e-06, "loss": 0.8236, "step": 300 }, { "epoch": 0.357355568790947, "eval_loss": 0.8294563293457031, "eval_runtime": 86.6916, "eval_samples_per_second": 8.109, "eval_steps_per_second": 2.03, "step": 300 }, { "epoch": 0.4169148302561048, "grad_norm": 0.3569670021533966, "learning_rate": 8.969797269342161e-06, "loss": 0.7643, "step": 350 }, { "epoch": 0.47647409172126265, "grad_norm": 0.4587797224521637, "learning_rate": 8.762929251137776e-06, "loss": 0.7638, "step": 400 }, { "epoch": 0.47647409172126265, "eval_loss": 0.8007138967514038, "eval_runtime": 86.7325, "eval_samples_per_second": 8.105, "eval_steps_per_second": 2.029, "step": 400 }, { "epoch": 0.5360333531864205, "grad_norm": 0.2903870940208435, "learning_rate": 8.556061232933389e-06, "loss": 0.7505, "step": 450 }, { "epoch": 0.5955926146515783, "grad_norm": 0.39271315932273865, "learning_rate": 8.349193214729004e-06, "loss": 0.7773, "step": 500 }, { "epoch": 0.5955926146515783, "eval_loss": 0.7964405417442322, "eval_runtime": 86.7496, "eval_samples_per_second": 8.104, "eval_steps_per_second": 2.029, "step": 500 }, { "epoch": 0.6551518761167362, "grad_norm": 0.2611350119113922, "learning_rate": 8.142325196524617e-06, "loss": 0.7339, "step": 550 }, { "epoch": 0.714711137581894, "grad_norm": 0.3096601665019989, "learning_rate": 7.935457178320233e-06, "loss": 0.7867, "step": 600 }, { "epoch": 0.714711137581894, "eval_loss": 0.7935438752174377, "eval_runtime": 86.8192, "eval_samples_per_second": 8.097, "eval_steps_per_second": 2.027, "step": 600 }, { "epoch": 0.7742703990470519, "grad_norm": 0.28062084317207336, "learning_rate": 7.728589160115847e-06, "loss": 0.7642, "step": 650 }, { "epoch": 0.8338296605122096, "grad_norm": 0.2916211783885956, "learning_rate": 7.521721141911461e-06, "loss": 0.7436, "step": 700 }, { "epoch": 0.8338296605122096, "eval_loss": 0.7918882369995117, "eval_runtime": 86.8706, "eval_samples_per_second": 8.092, "eval_steps_per_second": 2.026, "step": 700 }, { "epoch": 0.8933889219773675, "grad_norm": 0.4260661005973816, "learning_rate": 7.3148531237070755e-06, "loss": 0.7944, "step": 750 }, { "epoch": 0.9529481834425253, "grad_norm": 0.3311309218406677, "learning_rate": 7.1079851055026895e-06, "loss": 0.7618, "step": 800 }, { "epoch": 0.9529481834425253, "eval_loss": 0.7905948758125305, "eval_runtime": 86.7293, "eval_samples_per_second": 8.106, "eval_steps_per_second": 2.029, "step": 800 }, { "epoch": 1.0119118522930315, "grad_norm": 0.33902204036712646, "learning_rate": 6.901117087298304e-06, "loss": 0.7565, "step": 850 }, { "epoch": 1.0714711137581894, "grad_norm": 0.3156481981277466, "learning_rate": 6.694249069093918e-06, "loss": 0.7834, "step": 900 }, { "epoch": 1.0714711137581894, "eval_loss": 0.789471447467804, "eval_runtime": 86.7639, "eval_samples_per_second": 8.102, "eval_steps_per_second": 2.028, "step": 900 }, { "epoch": 1.1310303752233473, "grad_norm": 0.29626569151878357, "learning_rate": 6.487381050889533e-06, "loss": 0.7636, "step": 950 }, { "epoch": 1.1905896366885051, "grad_norm": 0.32058003544807434, "learning_rate": 6.280513032685147e-06, "loss": 0.7588, "step": 1000 }, { "epoch": 1.1905896366885051, "eval_loss": 0.7887451648712158, "eval_runtime": 86.8029, "eval_samples_per_second": 8.099, "eval_steps_per_second": 2.028, "step": 1000 }, { "epoch": 1.2501488981536628, "grad_norm": 0.3029298484325409, "learning_rate": 6.073645014480761e-06, "loss": 0.7651, "step": 1050 }, { "epoch": 1.3097081596188207, "grad_norm": 0.30075645446777344, "learning_rate": 5.866776996276376e-06, "loss": 0.747, "step": 1100 }, { "epoch": 1.3097081596188207, "eval_loss": 0.7880399227142334, "eval_runtime": 86.7707, "eval_samples_per_second": 8.102, "eval_steps_per_second": 2.028, "step": 1100 }, { "epoch": 1.3692674210839786, "grad_norm": 0.30230703949928284, "learning_rate": 5.659908978071991e-06, "loss": 0.7694, "step": 1150 }, { "epoch": 1.4288266825491365, "grad_norm": 0.2981889545917511, "learning_rate": 5.453040959867605e-06, "loss": 0.7546, "step": 1200 }, { "epoch": 1.4288266825491365, "eval_loss": 0.7873143553733826, "eval_runtime": 86.9249, "eval_samples_per_second": 8.087, "eval_steps_per_second": 2.025, "step": 1200 }, { "epoch": 1.4883859440142944, "grad_norm": 0.33295580744743347, "learning_rate": 5.246172941663219e-06, "loss": 0.7356, "step": 1250 }, { "epoch": 1.547945205479452, "grad_norm": 0.2881334125995636, "learning_rate": 5.039304923458833e-06, "loss": 0.7616, "step": 1300 }, { "epoch": 1.547945205479452, "eval_loss": 0.7868330478668213, "eval_runtime": 86.9371, "eval_samples_per_second": 8.086, "eval_steps_per_second": 2.024, "step": 1300 }, { "epoch": 1.60750446694461, "grad_norm": 0.42549142241477966, "learning_rate": 4.832436905254448e-06, "loss": 0.7613, "step": 1350 }, { "epoch": 1.6670637284097678, "grad_norm": 0.32537880539894104, "learning_rate": 4.625568887050063e-06, "loss": 0.777, "step": 1400 }, { "epoch": 1.6670637284097678, "eval_loss": 0.7863583564758301, "eval_runtime": 86.9105, "eval_samples_per_second": 8.089, "eval_steps_per_second": 2.025, "step": 1400 }, { "epoch": 1.7266229898749255, "grad_norm": 0.31612130999565125, "learning_rate": 4.418700868845677e-06, "loss": 0.7123, "step": 1450 }, { "epoch": 1.7861822513400833, "grad_norm": 0.39497706294059753, "learning_rate": 4.211832850641292e-06, "loss": 0.7999, "step": 1500 }, { "epoch": 1.7861822513400833, "eval_loss": 0.7859570980072021, "eval_runtime": 86.7739, "eval_samples_per_second": 8.102, "eval_steps_per_second": 2.028, "step": 1500 }, { "epoch": 1.8457415128052412, "grad_norm": 0.3905975818634033, "learning_rate": 4.004964832436906e-06, "loss": 0.7105, "step": 1550 }, { "epoch": 1.905300774270399, "grad_norm": 0.3420596718788147, "learning_rate": 3.7980968142325196e-06, "loss": 0.7735, "step": 1600 }, { "epoch": 1.905300774270399, "eval_loss": 0.7855594754219055, "eval_runtime": 86.9977, "eval_samples_per_second": 8.081, "eval_steps_per_second": 2.023, "step": 1600 }, { "epoch": 1.964860035735557, "grad_norm": 0.2925880551338196, "learning_rate": 3.5912287960281345e-06, "loss": 0.7675, "step": 1650 }, { "epoch": 2.023823704586063, "grad_norm": 0.42387983202934265, "learning_rate": 3.3843607778237485e-06, "loss": 0.7679, "step": 1700 }, { "epoch": 2.023823704586063, "eval_loss": 0.7852116227149963, "eval_runtime": 86.7932, "eval_samples_per_second": 8.1, "eval_steps_per_second": 2.028, "step": 1700 }, { "epoch": 2.083382966051221, "grad_norm": 0.3012678325176239, "learning_rate": 3.1774927596193634e-06, "loss": 0.7529, "step": 1750 }, { "epoch": 2.1429422275163788, "grad_norm": 0.3647378385066986, "learning_rate": 2.9706247414149774e-06, "loss": 0.7772, "step": 1800 }, { "epoch": 2.1429422275163788, "eval_loss": 0.7850247025489807, "eval_runtime": 86.7181, "eval_samples_per_second": 8.107, "eval_steps_per_second": 2.03, "step": 1800 }, { "epoch": 2.202501488981537, "grad_norm": 0.30863115191459656, "learning_rate": 2.763756723210592e-06, "loss": 0.7485, "step": 1850 }, { "epoch": 2.2620607504466945, "grad_norm": 0.3829723298549652, "learning_rate": 2.5568887050062062e-06, "loss": 0.7449, "step": 1900 }, { "epoch": 2.2620607504466945, "eval_loss": 0.7847884893417358, "eval_runtime": 86.725, "eval_samples_per_second": 8.106, "eval_steps_per_second": 2.029, "step": 1900 }, { "epoch": 2.321620011911852, "grad_norm": 0.3733135759830475, "learning_rate": 2.3500206868018207e-06, "loss": 0.7508, "step": 1950 }, { "epoch": 2.3811792733770103, "grad_norm": 0.37344199419021606, "learning_rate": 2.143152668597435e-06, "loss": 0.7509, "step": 2000 }, { "epoch": 2.3811792733770103, "eval_loss": 0.7846249938011169, "eval_runtime": 86.7361, "eval_samples_per_second": 8.105, "eval_steps_per_second": 2.029, "step": 2000 }, { "epoch": 2.440738534842168, "grad_norm": 0.46035104990005493, "learning_rate": 1.9362846503930496e-06, "loss": 0.7901, "step": 2050 }, { "epoch": 2.5002977963073256, "grad_norm": 0.31786802411079407, "learning_rate": 1.7294166321886638e-06, "loss": 0.7654, "step": 2100 }, { "epoch": 2.5002977963073256, "eval_loss": 0.7844468951225281, "eval_runtime": 86.7628, "eval_samples_per_second": 8.103, "eval_steps_per_second": 2.029, "step": 2100 }, { "epoch": 2.5598570577724837, "grad_norm": 0.337811678647995, "learning_rate": 1.5225486139842782e-06, "loss": 0.7524, "step": 2150 }, { "epoch": 2.6194163192376414, "grad_norm": 0.29232126474380493, "learning_rate": 1.3156805957798926e-06, "loss": 0.7279, "step": 2200 }, { "epoch": 2.6194163192376414, "eval_loss": 0.7843312621116638, "eval_runtime": 86.7533, "eval_samples_per_second": 8.103, "eval_steps_per_second": 2.029, "step": 2200 }, { "epoch": 2.678975580702799, "grad_norm": 0.4377705454826355, "learning_rate": 1.1088125775755069e-06, "loss": 0.7593, "step": 2250 }, { "epoch": 2.738534842167957, "grad_norm": 0.36447674036026, "learning_rate": 9.019445593711212e-07, "loss": 0.7523, "step": 2300 }, { "epoch": 2.738534842167957, "eval_loss": 0.7842342257499695, "eval_runtime": 86.8097, "eval_samples_per_second": 8.098, "eval_steps_per_second": 2.027, "step": 2300 }, { "epoch": 2.798094103633115, "grad_norm": 0.38712531328201294, "learning_rate": 6.950765411667356e-07, "loss": 0.7347, "step": 2350 }, { "epoch": 2.857653365098273, "grad_norm": 0.34733325242996216, "learning_rate": 4.882085229623501e-07, "loss": 0.7605, "step": 2400 }, { "epoch": 2.857653365098273, "eval_loss": 0.7841441035270691, "eval_runtime": 86.8339, "eval_samples_per_second": 8.096, "eval_steps_per_second": 2.027, "step": 2400 }, { "epoch": 2.9172126265634306, "grad_norm": 0.3819723129272461, "learning_rate": 2.8134050475796445e-07, "loss": 0.7412, "step": 2450 }, { "epoch": 2.9767718880285887, "grad_norm": 0.3409363329410553, "learning_rate": 7.447248655357883e-08, "loss": 0.7425, "step": 2500 }, { "epoch": 2.9767718880285887, "eval_loss": 0.7841161489486694, "eval_runtime": 87.2598, "eval_samples_per_second": 8.056, "eval_steps_per_second": 2.017, "step": 2500 } ], "logging_steps": 50, "max_steps": 2517, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.17584683310121e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }