{
  "best_global_step": 2500,
  "best_metric": 0.7841161489486694,
  "best_model_checkpoint": "./llama2-m2/checkpoint-2500",
  "epoch": 2.9767718880285887,
  "eval_steps": 100,
  "global_step": 2500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05955926146515783,
      "grad_norm": 11.590325355529785,
      "learning_rate": 4.9000000000000005e-06,
      "loss": 3.0703,
      "step": 50
    },
    {
      "epoch": 0.11911852293031566,
      "grad_norm": 6.117842197418213,
      "learning_rate": 9.9e-06,
      "loss": 2.3868,
      "step": 100
    },
    {
      "epoch": 0.11911852293031566,
      "eval_loss": 1.5943528413772583,
      "eval_runtime": 86.9995,
      "eval_samples_per_second": 8.081,
      "eval_steps_per_second": 2.023,
      "step": 100
    },
    {
      "epoch": 0.1786777843954735,
      "grad_norm": 0.4276258051395416,
      "learning_rate": 9.797269342159703e-06,
      "loss": 1.1152,
      "step": 150
    },
    {
      "epoch": 0.23823704586063132,
      "grad_norm": 0.34813204407691956,
      "learning_rate": 9.590401323955318e-06,
      "loss": 0.9458,
      "step": 200
    },
    {
      "epoch": 0.23823704586063132,
      "eval_loss": 0.9201429486274719,
      "eval_runtime": 86.8761,
      "eval_samples_per_second": 8.092,
      "eval_steps_per_second": 2.026,
      "step": 200
    },
    {
      "epoch": 0.29779630732578916,
      "grad_norm": 0.31664666533470154,
      "learning_rate": 9.383533305750931e-06,
      "loss": 0.8754,
      "step": 250
    },
    {
      "epoch": 0.357355568790947,
      "grad_norm": 0.3039833903312683,
      "learning_rate": 9.176665287546546e-06,
      "loss": 0.8236,
      "step": 300
    },
    {
      "epoch": 0.357355568790947,
      "eval_loss": 0.8294563293457031,
      "eval_runtime": 86.6916,
      "eval_samples_per_second": 8.109,
      "eval_steps_per_second": 2.03,
      "step": 300
    },
    {
      "epoch": 0.4169148302561048,
      "grad_norm": 0.3569670021533966,
      "learning_rate": 8.969797269342161e-06,
      "loss": 0.7643,
      "step": 350
    },
    {
      "epoch": 0.47647409172126265,
      "grad_norm": 0.4587797224521637,
      "learning_rate": 8.762929251137776e-06,
      "loss": 0.7638,
      "step": 400
    },
    {
      "epoch": 0.47647409172126265,
      "eval_loss": 0.8007138967514038,
      "eval_runtime": 86.7325,
      "eval_samples_per_second": 8.105,
      "eval_steps_per_second": 2.029,
      "step": 400
    },
    {
      "epoch": 0.5360333531864205,
      "grad_norm": 0.2903870940208435,
      "learning_rate": 8.556061232933389e-06,
      "loss": 0.7505,
      "step": 450
    },
    {
      "epoch": 0.5955926146515783,
      "grad_norm": 0.39271315932273865,
      "learning_rate": 8.349193214729004e-06,
      "loss": 0.7773,
      "step": 500
    },
    {
      "epoch": 0.5955926146515783,
      "eval_loss": 0.7964405417442322,
      "eval_runtime": 86.7496,
      "eval_samples_per_second": 8.104,
      "eval_steps_per_second": 2.029,
      "step": 500
    },
    {
      "epoch": 0.6551518761167362,
      "grad_norm": 0.2611350119113922,
      "learning_rate": 8.142325196524617e-06,
      "loss": 0.7339,
      "step": 550
    },
    {
      "epoch": 0.714711137581894,
      "grad_norm": 0.3096601665019989,
      "learning_rate": 7.935457178320233e-06,
      "loss": 0.7867,
      "step": 600
    },
    {
      "epoch": 0.714711137581894,
      "eval_loss": 0.7935438752174377,
      "eval_runtime": 86.8192,
      "eval_samples_per_second": 8.097,
      "eval_steps_per_second": 2.027,
      "step": 600
    },
    {
      "epoch": 0.7742703990470519,
      "grad_norm": 0.28062084317207336,
      "learning_rate": 7.728589160115847e-06,
      "loss": 0.7642,
      "step": 650
    },
    {
      "epoch": 0.8338296605122096,
      "grad_norm": 0.2916211783885956,
      "learning_rate": 7.521721141911461e-06,
      "loss": 0.7436,
      "step": 700
    },
    {
      "epoch": 0.8338296605122096,
      "eval_loss": 0.7918882369995117,
      "eval_runtime": 86.8706,
      "eval_samples_per_second": 8.092,
      "eval_steps_per_second": 2.026,
      "step": 700
    },
    {
      "epoch": 0.8933889219773675,
      "grad_norm": 0.4260661005973816,
      "learning_rate": 7.3148531237070755e-06,
      "loss": 0.7944,
      "step": 750
    },
    {
      "epoch": 0.9529481834425253,
      "grad_norm": 0.3311309218406677,
      "learning_rate": 7.1079851055026895e-06,
      "loss": 0.7618,
      "step": 800
    },
    {
      "epoch": 0.9529481834425253,
      "eval_loss": 0.7905948758125305,
      "eval_runtime": 86.7293,
      "eval_samples_per_second": 8.106,
      "eval_steps_per_second": 2.029,
      "step": 800
    },
    {
      "epoch": 1.0119118522930315,
      "grad_norm": 0.33902204036712646,
      "learning_rate": 6.901117087298304e-06,
      "loss": 0.7565,
      "step": 850
    },
    {
      "epoch": 1.0714711137581894,
      "grad_norm": 0.3156481981277466,
      "learning_rate": 6.694249069093918e-06,
      "loss": 0.7834,
      "step": 900
    },
    {
      "epoch": 1.0714711137581894,
      "eval_loss": 0.789471447467804,
      "eval_runtime": 86.7639,
      "eval_samples_per_second": 8.102,
      "eval_steps_per_second": 2.028,
      "step": 900
    },
    {
      "epoch": 1.1310303752233473,
      "grad_norm": 0.29626569151878357,
      "learning_rate": 6.487381050889533e-06,
      "loss": 0.7636,
      "step": 950
    },
    {
      "epoch": 1.1905896366885051,
      "grad_norm": 0.32058003544807434,
      "learning_rate": 6.280513032685147e-06,
      "loss": 0.7588,
      "step": 1000
    },
    {
      "epoch": 1.1905896366885051,
      "eval_loss": 0.7887451648712158,
      "eval_runtime": 86.8029,
      "eval_samples_per_second": 8.099,
      "eval_steps_per_second": 2.028,
      "step": 1000
    },
    {
      "epoch": 1.2501488981536628,
      "grad_norm": 0.3029298484325409,
      "learning_rate": 6.073645014480761e-06,
      "loss": 0.7651,
      "step": 1050
    },
    {
      "epoch": 1.3097081596188207,
      "grad_norm": 0.30075645446777344,
      "learning_rate": 5.866776996276376e-06,
      "loss": 0.747,
      "step": 1100
    },
    {
      "epoch": 1.3097081596188207,
      "eval_loss": 0.7880399227142334,
      "eval_runtime": 86.7707,
      "eval_samples_per_second": 8.102,
      "eval_steps_per_second": 2.028,
      "step": 1100
    },
    {
      "epoch": 1.3692674210839786,
      "grad_norm": 0.30230703949928284,
      "learning_rate": 5.659908978071991e-06,
      "loss": 0.7694,
      "step": 1150
    },
    {
      "epoch": 1.4288266825491365,
      "grad_norm": 0.2981889545917511,
      "learning_rate": 5.453040959867605e-06,
      "loss": 0.7546,
      "step": 1200
    },
    {
      "epoch": 1.4288266825491365,
      "eval_loss": 0.7873143553733826,
      "eval_runtime": 86.9249,
      "eval_samples_per_second": 8.087,
      "eval_steps_per_second": 2.025,
      "step": 1200
    },
    {
      "epoch": 1.4883859440142944,
      "grad_norm": 0.33295580744743347,
      "learning_rate": 5.246172941663219e-06,
      "loss": 0.7356,
      "step": 1250
    },
    {
      "epoch": 1.547945205479452,
      "grad_norm": 0.2881334125995636,
      "learning_rate": 5.039304923458833e-06,
      "loss": 0.7616,
      "step": 1300
    },
    {
      "epoch": 1.547945205479452,
      "eval_loss": 0.7868330478668213,
      "eval_runtime": 86.9371,
      "eval_samples_per_second": 8.086,
      "eval_steps_per_second": 2.024,
      "step": 1300
    },
    {
      "epoch": 1.60750446694461,
      "grad_norm": 0.42549142241477966,
      "learning_rate": 4.832436905254448e-06,
      "loss": 0.7613,
      "step": 1350
    },
    {
      "epoch": 1.6670637284097678,
      "grad_norm": 0.32537880539894104,
      "learning_rate": 4.625568887050063e-06,
      "loss": 0.777,
      "step": 1400
    },
    {
      "epoch": 1.6670637284097678,
      "eval_loss": 0.7863583564758301,
      "eval_runtime": 86.9105,
      "eval_samples_per_second": 8.089,
      "eval_steps_per_second": 2.025,
      "step": 1400
    },
    {
      "epoch": 1.7266229898749255,
      "grad_norm": 0.31612130999565125,
      "learning_rate": 4.418700868845677e-06,
      "loss": 0.7123,
      "step": 1450
    },
    {
      "epoch": 1.7861822513400833,
      "grad_norm": 0.39497706294059753,
      "learning_rate": 4.211832850641292e-06,
      "loss": 0.7999,
      "step": 1500
    },
    {
      "epoch": 1.7861822513400833,
      "eval_loss": 0.7859570980072021,
      "eval_runtime": 86.7739,
      "eval_samples_per_second": 8.102,
      "eval_steps_per_second": 2.028,
      "step": 1500
    },
    {
      "epoch": 1.8457415128052412,
      "grad_norm": 0.3905975818634033,
      "learning_rate": 4.004964832436906e-06,
      "loss": 0.7105,
      "step": 1550
    },
    {
      "epoch": 1.905300774270399,
      "grad_norm": 0.3420596718788147,
      "learning_rate": 3.7980968142325196e-06,
      "loss": 0.7735,
      "step": 1600
    },
    {
      "epoch": 1.905300774270399,
      "eval_loss": 0.7855594754219055,
      "eval_runtime": 86.9977,
      "eval_samples_per_second": 8.081,
      "eval_steps_per_second": 2.023,
      "step": 1600
    },
    {
      "epoch": 1.964860035735557,
      "grad_norm": 0.2925880551338196,
      "learning_rate": 3.5912287960281345e-06,
      "loss": 0.7675,
      "step": 1650
    },
    {
      "epoch": 2.023823704586063,
      "grad_norm": 0.42387983202934265,
      "learning_rate": 3.3843607778237485e-06,
      "loss": 0.7679,
      "step": 1700
    },
    {
      "epoch": 2.023823704586063,
      "eval_loss": 0.7852116227149963,
      "eval_runtime": 86.7932,
      "eval_samples_per_second": 8.1,
      "eval_steps_per_second": 2.028,
      "step": 1700
    },
    {
      "epoch": 2.083382966051221,
      "grad_norm": 0.3012678325176239,
      "learning_rate": 3.1774927596193634e-06,
      "loss": 0.7529,
      "step": 1750
    },
    {
      "epoch": 2.1429422275163788,
      "grad_norm": 0.3647378385066986,
      "learning_rate": 2.9706247414149774e-06,
      "loss": 0.7772,
      "step": 1800
    },
    {
      "epoch": 2.1429422275163788,
      "eval_loss": 0.7850247025489807,
      "eval_runtime": 86.7181,
      "eval_samples_per_second": 8.107,
      "eval_steps_per_second": 2.03,
      "step": 1800
    },
    {
      "epoch": 2.202501488981537,
      "grad_norm": 0.30863115191459656,
      "learning_rate": 2.763756723210592e-06,
      "loss": 0.7485,
      "step": 1850
    },
    {
      "epoch": 2.2620607504466945,
      "grad_norm": 0.3829723298549652,
      "learning_rate": 2.5568887050062062e-06,
      "loss": 0.7449,
      "step": 1900
    },
    {
      "epoch": 2.2620607504466945,
      "eval_loss": 0.7847884893417358,
      "eval_runtime": 86.725,
      "eval_samples_per_second": 8.106,
      "eval_steps_per_second": 2.029,
      "step": 1900
    },
    {
      "epoch": 2.321620011911852,
      "grad_norm": 0.3733135759830475,
      "learning_rate": 2.3500206868018207e-06,
      "loss": 0.7508,
      "step": 1950
    },
    {
      "epoch": 2.3811792733770103,
      "grad_norm": 0.37344199419021606,
      "learning_rate": 2.143152668597435e-06,
      "loss": 0.7509,
      "step": 2000
    },
    {
      "epoch": 2.3811792733770103,
      "eval_loss": 0.7846249938011169,
      "eval_runtime": 86.7361,
      "eval_samples_per_second": 8.105,
      "eval_steps_per_second": 2.029,
      "step": 2000
    },
    {
      "epoch": 2.440738534842168,
      "grad_norm": 0.46035104990005493,
      "learning_rate": 1.9362846503930496e-06,
      "loss": 0.7901,
      "step": 2050
    },
    {
      "epoch": 2.5002977963073256,
      "grad_norm": 0.31786802411079407,
      "learning_rate": 1.7294166321886638e-06,
      "loss": 0.7654,
      "step": 2100
    },
    {
      "epoch": 2.5002977963073256,
      "eval_loss": 0.7844468951225281,
      "eval_runtime": 86.7628,
      "eval_samples_per_second": 8.103,
      "eval_steps_per_second": 2.029,
      "step": 2100
    },
    {
      "epoch": 2.5598570577724837,
      "grad_norm": 0.337811678647995,
      "learning_rate": 1.5225486139842782e-06,
      "loss": 0.7524,
      "step": 2150
    },
    {
      "epoch": 2.6194163192376414,
      "grad_norm": 0.29232126474380493,
      "learning_rate": 1.3156805957798926e-06,
      "loss": 0.7279,
      "step": 2200
    },
    {
      "epoch": 2.6194163192376414,
      "eval_loss": 0.7843312621116638,
      "eval_runtime": 86.7533,
      "eval_samples_per_second": 8.103,
      "eval_steps_per_second": 2.029,
      "step": 2200
    },
    {
      "epoch": 2.678975580702799,
      "grad_norm": 0.4377705454826355,
      "learning_rate": 1.1088125775755069e-06,
      "loss": 0.7593,
      "step": 2250
    },
    {
      "epoch": 2.738534842167957,
      "grad_norm": 0.36447674036026,
      "learning_rate": 9.019445593711212e-07,
      "loss": 0.7523,
      "step": 2300
    },
    {
      "epoch": 2.738534842167957,
      "eval_loss": 0.7842342257499695,
      "eval_runtime": 86.8097,
      "eval_samples_per_second": 8.098,
      "eval_steps_per_second": 2.027,
      "step": 2300
    },
    {
      "epoch": 2.798094103633115,
      "grad_norm": 0.38712531328201294,
      "learning_rate": 6.950765411667356e-07,
      "loss": 0.7347,
      "step": 2350
    },
    {
      "epoch": 2.857653365098273,
      "grad_norm": 0.34733325242996216,
      "learning_rate": 4.882085229623501e-07,
      "loss": 0.7605,
      "step": 2400
    },
    {
      "epoch": 2.857653365098273,
      "eval_loss": 0.7841441035270691,
      "eval_runtime": 86.8339,
      "eval_samples_per_second": 8.096,
      "eval_steps_per_second": 2.027,
      "step": 2400
    },
    {
      "epoch": 2.9172126265634306,
      "grad_norm": 0.3819723129272461,
      "learning_rate": 2.8134050475796445e-07,
      "loss": 0.7412,
      "step": 2450
    },
    {
      "epoch": 2.9767718880285887,
      "grad_norm": 0.3409363329410553,
      "learning_rate": 7.447248655357883e-08,
      "loss": 0.7425,
      "step": 2500
    },
    {
      "epoch": 2.9767718880285887,
      "eval_loss": 0.7841161489486694,
      "eval_runtime": 87.2598,
      "eval_samples_per_second": 8.056,
      "eval_steps_per_second": 2.017,
      "step": 2500
    }
  ],
  "logging_steps": 50,
  "max_steps": 2517,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 8.120601880087757e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}