{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.20393596410727033,
  "eval_steps": 50,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0,
      "eval_loss": 4.004721641540527,
      "eval_runtime": 173.7131,
      "eval_samples_per_second": 7.052,
      "eval_steps_per_second": 7.052,
      "step": 0
    },
    {
      "epoch": 0.004078719282145407,
      "grad_norm": 0.7785049676895142,
      "learning_rate": 3.6e-05,
      "loss": 3.8826,
      "step": 10
    },
    {
      "epoch": 0.008157438564290813,
      "grad_norm": 0.9169650077819824,
      "learning_rate": 7.6e-05,
      "loss": 3.8197,
      "step": 20
    },
    {
      "epoch": 0.012236157846436219,
      "grad_norm": 1.1420930624008179,
      "learning_rate": 0.000116,
      "loss": 3.1268,
      "step": 30
    },
    {
      "epoch": 0.016314877128581626,
      "grad_norm": 1.2675455808639526,
      "learning_rate": 0.00015600000000000002,
      "loss": 2.1309,
      "step": 40
    },
    {
      "epoch": 0.020393596410727032,
      "grad_norm": 1.0368549823760986,
      "learning_rate": 0.000196,
      "loss": 1.5617,
      "step": 50
    },
    {
      "epoch": 0.020393596410727032,
      "eval_loss": 1.423304557800293,
      "eval_runtime": 173.1158,
      "eval_samples_per_second": 7.076,
      "eval_steps_per_second": 7.076,
      "step": 50
    },
    {
      "epoch": 0.024472315692872438,
      "grad_norm": 0.8746965527534485,
      "learning_rate": 0.00019980267284282717,
      "loss": 1.3048,
      "step": 60
    },
    {
      "epoch": 0.028551034975017844,
      "grad_norm": 4.214962959289551,
      "learning_rate": 0.00019912155402515417,
      "loss": 1.0715,
      "step": 70
    },
    {
      "epoch": 0.03262975425716325,
      "grad_norm": 0.665006697177887,
      "learning_rate": 0.0001979575249599344,
      "loss": 1.0188,
      "step": 80
    },
    {
      "epoch": 0.03670847353930866,
      "grad_norm": 0.5204000473022461,
      "learning_rate": 0.00019631625667976583,
      "loss": 0.9787,
      "step": 90
    },
    {
      "epoch": 0.040787192821454064,
      "grad_norm": 0.5942240357398987,
      "learning_rate": 0.00019420574527872968,
      "loss": 0.9041,
      "step": 100
    },
    {
      "epoch": 0.040787192821454064,
      "eval_loss": 0.9287890791893005,
      "eval_runtime": 173.4553,
      "eval_samples_per_second": 7.062,
      "eval_steps_per_second": 7.062,
      "step": 100
    },
    {
      "epoch": 0.04486591210359947,
      "grad_norm": 0.6731751561164856,
      "learning_rate": 0.00019163627295622397,
      "loss": 0.9193,
      "step": 110
    },
    {
      "epoch": 0.048944631385744876,
      "grad_norm": 0.7434952855110168,
      "learning_rate": 0.00018862035792312147,
      "loss": 0.9247,
      "step": 120
    },
    {
      "epoch": 0.05302335066789028,
      "grad_norm": 0.6485812664031982,
      "learning_rate": 0.00018517269341430476,
      "loss": 1.0027,
      "step": 130
    },
    {
      "epoch": 0.05710206995003569,
      "grad_norm": 0.6013619303703308,
      "learning_rate": 0.00018131007610470276,
      "loss": 0.9267,
      "step": 140
    },
    {
      "epoch": 0.06118078923218109,
      "grad_norm": 0.6937788724899292,
      "learning_rate": 0.00017705132427757895,
      "loss": 0.9152,
      "step": 150
    },
    {
      "epoch": 0.06118078923218109,
      "eval_loss": 0.9047658443450928,
      "eval_runtime": 173.377,
      "eval_samples_per_second": 7.066,
      "eval_steps_per_second": 7.066,
      "step": 150
    },
    {
      "epoch": 0.0652595085143265,
      "grad_norm": 0.6242877244949341,
      "learning_rate": 0.00017241718614374678,
      "loss": 0.9777,
      "step": 160
    },
    {
      "epoch": 0.0693382277964719,
      "grad_norm": 0.6055032014846802,
      "learning_rate": 0.00016743023875837233,
      "loss": 0.9655,
      "step": 170
    },
    {
      "epoch": 0.07341694707861732,
      "grad_norm": 0.6116403937339783,
      "learning_rate": 0.00016211477802783103,
      "loss": 1.0206,
      "step": 180
    },
    {
      "epoch": 0.07749566636076272,
      "grad_norm": 0.8313778042793274,
      "learning_rate": 0.0001564967003424938,
      "loss": 0.9532,
      "step": 190
    },
    {
      "epoch": 0.08157438564290813,
      "grad_norm": 0.6586077213287354,
      "learning_rate": 0.00015060337641211637,
      "loss": 0.9392,
      "step": 200
    },
    {
      "epoch": 0.08157438564290813,
      "eval_loss": 0.8928409218788147,
      "eval_runtime": 173.3344,
      "eval_samples_per_second": 7.067,
      "eval_steps_per_second": 7.067,
      "step": 200
    },
    {
      "epoch": 0.08565310492505353,
      "grad_norm": 0.6433130502700806,
      "learning_rate": 0.00014446351791849276,
      "loss": 0.9729,
      "step": 210
    },
    {
      "epoch": 0.08973182420719894,
      "grad_norm": 0.7685410976409912,
      "learning_rate": 0.00013810703763502744,
      "loss": 0.9407,
      "step": 220
    },
    {
      "epoch": 0.09381054348934434,
      "grad_norm": 0.7548204064369202,
      "learning_rate": 0.00013156490369471027,
      "loss": 0.8866,
      "step": 230
    },
    {
      "epoch": 0.09788926277148975,
      "grad_norm": 0.6521336436271667,
      "learning_rate": 0.0001248689887164855,
      "loss": 0.8709,
      "step": 240
    },
    {
      "epoch": 0.10196798205363516,
      "grad_norm": 0.7266818284988403,
      "learning_rate": 0.00011805191452505602,
      "loss": 0.977,
      "step": 250
    },
    {
      "epoch": 0.10196798205363516,
      "eval_loss": 0.882000744342804,
      "eval_runtime": 173.4293,
      "eval_samples_per_second": 7.063,
      "eval_steps_per_second": 7.063,
      "step": 250
    },
    {
      "epoch": 0.10604670133578056,
      "grad_norm": 0.7059846520423889,
      "learning_rate": 0.00011114689322063255,
      "loss": 0.9515,
      "step": 260
    },
    {
      "epoch": 0.11012542061792598,
      "grad_norm": 0.7184727787971497,
      "learning_rate": 0.00010418756537291996,
      "loss": 0.9049,
      "step": 270
    },
    {
      "epoch": 0.11420413990007137,
      "grad_norm": 0.8101040124893188,
      "learning_rate": 9.720783612764314e-05,
      "loss": 0.9918,
      "step": 280
    },
    {
      "epoch": 0.11828285918221679,
      "grad_norm": 0.5306759476661682,
      "learning_rate": 9.024171002408506e-05,
      "loss": 0.9756,
      "step": 290
    },
    {
      "epoch": 0.12236157846436219,
      "grad_norm": 0.8896522521972656,
      "learning_rate": 8.332312532838978e-05,
      "loss": 0.8556,
      "step": 300
    },
    {
      "epoch": 0.12236157846436219,
      "eval_loss": 0.873528003692627,
      "eval_runtime": 173.4611,
      "eval_samples_per_second": 7.062,
      "eval_steps_per_second": 7.062,
      "step": 300
    },
    {
      "epoch": 0.12644029774650759,
      "grad_norm": 0.715740978717804,
      "learning_rate": 7.6485788689741e-05,
      "loss": 0.8381,
      "step": 310
    },
    {
      "epoch": 0.130519017028653,
      "grad_norm": 0.7031327486038208,
      "learning_rate": 6.976301092495556e-05,
      "loss": 0.8517,
      "step": 320
    },
    {
      "epoch": 0.1345977363107984,
      "grad_norm": 0.6190944910049438,
      "learning_rate": 6.318754473153221e-05,
      "loss": 0.8552,
      "step": 330
    },
    {
      "epoch": 0.1386764555929438,
      "grad_norm": 0.6262460947036743,
      "learning_rate": 5.679142511980175e-05,
      "loss": 0.9806,
      "step": 340
    },
    {
      "epoch": 0.14275517487508924,
      "grad_norm": 0.5318371653556824,
      "learning_rate": 5.0605813341576924e-05,
      "loss": 0.8857,
      "step": 350
    },
    {
      "epoch": 0.14275517487508924,
      "eval_loss": 0.8685455322265625,
      "eval_runtime": 173.2693,
      "eval_samples_per_second": 7.07,
      "eval_steps_per_second": 7.07,
      "step": 350
    },
    {
      "epoch": 0.14683389415723463,
      "grad_norm": 0.8289533853530884,
      "learning_rate": 4.46608450756656e-05,
      "loss": 0.9057,
      "step": 360
    },
    {
      "epoch": 0.15091261343938003,
      "grad_norm": 0.7940821051597595,
      "learning_rate": 3.8985483609873244e-05,
      "loss": 0.8912,
      "step": 370
    },
    {
      "epoch": 0.15499133272152543,
      "grad_norm": 0.6407303810119629,
      "learning_rate": 3.360737873477584e-05,
      "loss": 0.944,
      "step": 380
    },
    {
      "epoch": 0.15907005200367086,
      "grad_norm": 0.741663932800293,
      "learning_rate": 2.8552732036719687e-05,
      "loss": 0.8752,
      "step": 390
    },
    {
      "epoch": 0.16314877128581626,
      "grad_norm": 0.5255782604217529,
      "learning_rate": 2.3846169246326343e-05,
      "loss": 0.8946,
      "step": 400
    },
    {
      "epoch": 0.16314877128581626,
      "eval_loss": 0.8629406690597534,
      "eval_runtime": 173.2005,
      "eval_samples_per_second": 7.073,
      "eval_steps_per_second": 7.073,
      "step": 400
    },
    {
      "epoch": 0.16722749056796166,
      "grad_norm": 0.6469287872314453,
      "learning_rate": 1.9510620264408596e-05,
      "loss": 0.9102,
      "step": 410
    },
    {
      "epoch": 0.17130620985010706,
      "grad_norm": 0.7001602649688721,
      "learning_rate": 1.5567207449798515e-05,
      "loss": 0.8821,
      "step": 420
    },
    {
      "epoch": 0.17538492913225248,
      "grad_norm": 0.9216477870941162,
      "learning_rate": 1.2035142713338366e-05,
      "loss": 0.931,
      "step": 430
    },
    {
      "epoch": 0.17946364841439788,
      "grad_norm": 0.6395500898361206,
      "learning_rate": 8.931633919382298e-06,
      "loss": 0.8528,
      "step": 440
    },
    {
      "epoch": 0.18354236769654328,
      "grad_norm": 0.7768850922584534,
      "learning_rate": 6.2718010508108545e-06,
      "loss": 0.8807,
      "step": 450
    },
    {
      "epoch": 0.18354236769654328,
      "eval_loss": 0.8612557649612427,
      "eval_runtime": 173.4106,
      "eval_samples_per_second": 7.064,
      "eval_steps_per_second": 7.064,
      "step": 450
    },
    {
      "epoch": 0.18762108697868868,
      "grad_norm": 0.7101565003395081,
      "learning_rate": 4.068602545994249e-06,
      "loss": 0.9011,
      "step": 460
    },
    {
      "epoch": 0.1916998062608341,
      "grad_norm": 0.7115808725357056,
      "learning_rate": 2.332772166583208e-06,
      "loss": 0.9117,
      "step": 470
    },
    {
      "epoch": 0.1957785255429795,
      "grad_norm": 0.5874737501144409,
      "learning_rate": 1.0727667037011668e-06,
      "loss": 0.8661,
      "step": 480
    },
    {
      "epoch": 0.1998572448251249,
      "grad_norm": 0.6979043483734131,
      "learning_rate": 2.947247773079753e-07,
      "loss": 0.9236,
      "step": 490
    },
    {
      "epoch": 0.20393596410727033,
      "grad_norm": 0.7868551015853882,
      "learning_rate": 2.4369294605253166e-09,
      "loss": 0.813,
      "step": 500
    },
    {
      "epoch": 0.20393596410727033,
      "eval_loss": 0.8608318567276001,
      "eval_runtime": 173.2348,
      "eval_samples_per_second": 7.071,
      "eval_steps_per_second": 7.071,
      "step": 500
    }
  ],
  "logging_steps": 10,
  "max_steps": 500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.3472888266752e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}