{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.20393596410727033, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 4.004721641540527, "eval_runtime": 173.7131, "eval_samples_per_second": 7.052, "eval_steps_per_second": 7.052, "step": 0 }, { "epoch": 0.004078719282145407, "grad_norm": 0.7785049676895142, "learning_rate": 3.6e-05, "loss": 3.8826, "step": 10 }, { "epoch": 0.008157438564290813, "grad_norm": 0.9169650077819824, "learning_rate": 7.6e-05, "loss": 3.8197, "step": 20 }, { "epoch": 0.012236157846436219, "grad_norm": 1.1420930624008179, "learning_rate": 0.000116, "loss": 3.1268, "step": 30 }, { "epoch": 0.016314877128581626, "grad_norm": 1.2675455808639526, "learning_rate": 0.00015600000000000002, "loss": 2.1309, "step": 40 }, { "epoch": 0.020393596410727032, "grad_norm": 1.0368549823760986, "learning_rate": 0.000196, "loss": 1.5617, "step": 50 }, { "epoch": 0.020393596410727032, "eval_loss": 1.423304557800293, "eval_runtime": 173.1158, "eval_samples_per_second": 7.076, "eval_steps_per_second": 7.076, "step": 50 }, { "epoch": 0.024472315692872438, "grad_norm": 0.8746965527534485, "learning_rate": 0.00019980267284282717, "loss": 1.3048, "step": 60 }, { "epoch": 0.028551034975017844, "grad_norm": 4.214962959289551, "learning_rate": 0.00019912155402515417, "loss": 1.0715, "step": 70 }, { "epoch": 0.03262975425716325, "grad_norm": 0.665006697177887, "learning_rate": 0.0001979575249599344, "loss": 1.0188, "step": 80 }, { "epoch": 0.03670847353930866, "grad_norm": 0.5204000473022461, "learning_rate": 0.00019631625667976583, "loss": 0.9787, "step": 90 }, { "epoch": 0.040787192821454064, "grad_norm": 0.5942240357398987, "learning_rate": 0.00019420574527872968, "loss": 0.9041, "step": 100 }, { "epoch": 0.040787192821454064, "eval_loss": 0.9287890791893005, "eval_runtime": 173.4553, "eval_samples_per_second": 7.062, "eval_steps_per_second": 7.062, "step": 100 }, { "epoch": 0.04486591210359947, "grad_norm": 0.6731751561164856, "learning_rate": 0.00019163627295622397, "loss": 0.9193, "step": 110 }, { "epoch": 0.048944631385744876, "grad_norm": 0.7434952855110168, "learning_rate": 0.00018862035792312147, "loss": 0.9247, "step": 120 }, { "epoch": 0.05302335066789028, "grad_norm": 0.6485812664031982, "learning_rate": 0.00018517269341430476, "loss": 1.0027, "step": 130 }, { "epoch": 0.05710206995003569, "grad_norm": 0.6013619303703308, "learning_rate": 0.00018131007610470276, "loss": 0.9267, "step": 140 }, { "epoch": 0.06118078923218109, "grad_norm": 0.6937788724899292, "learning_rate": 0.00017705132427757895, "loss": 0.9152, "step": 150 }, { "epoch": 0.06118078923218109, "eval_loss": 0.9047658443450928, "eval_runtime": 173.377, "eval_samples_per_second": 7.066, "eval_steps_per_second": 7.066, "step": 150 }, { "epoch": 0.0652595085143265, "grad_norm": 0.6242877244949341, "learning_rate": 0.00017241718614374678, "loss": 0.9777, "step": 160 }, { "epoch": 0.0693382277964719, "grad_norm": 0.6055032014846802, "learning_rate": 0.00016743023875837233, "loss": 0.9655, "step": 170 }, { "epoch": 0.07341694707861732, "grad_norm": 0.6116403937339783, "learning_rate": 0.00016211477802783103, "loss": 1.0206, "step": 180 }, { "epoch": 0.07749566636076272, "grad_norm": 0.8313778042793274, "learning_rate": 0.0001564967003424938, "loss": 0.9532, "step": 190 }, { "epoch": 0.08157438564290813, "grad_norm": 0.6586077213287354, "learning_rate": 0.00015060337641211637, "loss": 0.9392, "step": 200 }, { "epoch": 0.08157438564290813, "eval_loss": 0.8928409218788147, "eval_runtime": 173.3344, "eval_samples_per_second": 7.067, "eval_steps_per_second": 7.067, "step": 200 }, { "epoch": 0.08565310492505353, "grad_norm": 0.6433130502700806, "learning_rate": 0.00014446351791849276, "loss": 0.9729, "step": 210 }, { "epoch": 0.08973182420719894, "grad_norm": 0.7685410976409912, "learning_rate": 0.00013810703763502744, "loss": 0.9407, "step": 220 }, { "epoch": 0.09381054348934434, "grad_norm": 0.7548204064369202, "learning_rate": 0.00013156490369471027, "loss": 0.8866, "step": 230 }, { "epoch": 0.09788926277148975, "grad_norm": 0.6521336436271667, "learning_rate": 0.0001248689887164855, "loss": 0.8709, "step": 240 }, { "epoch": 0.10196798205363516, "grad_norm": 0.7266818284988403, "learning_rate": 0.00011805191452505602, "loss": 0.977, "step": 250 }, { "epoch": 0.10196798205363516, "eval_loss": 0.882000744342804, "eval_runtime": 173.4293, "eval_samples_per_second": 7.063, "eval_steps_per_second": 7.063, "step": 250 }, { "epoch": 0.10604670133578056, "grad_norm": 0.7059846520423889, "learning_rate": 0.00011114689322063255, "loss": 0.9515, "step": 260 }, { "epoch": 0.11012542061792598, "grad_norm": 0.7184727787971497, "learning_rate": 0.00010418756537291996, "loss": 0.9049, "step": 270 }, { "epoch": 0.11420413990007137, "grad_norm": 0.8101040124893188, "learning_rate": 9.720783612764314e-05, "loss": 0.9918, "step": 280 }, { "epoch": 0.11828285918221679, "grad_norm": 0.5306759476661682, "learning_rate": 9.024171002408506e-05, "loss": 0.9756, "step": 290 }, { "epoch": 0.12236157846436219, "grad_norm": 0.8896522521972656, "learning_rate": 8.332312532838978e-05, "loss": 0.8556, "step": 300 }, { "epoch": 0.12236157846436219, "eval_loss": 0.873528003692627, "eval_runtime": 173.4611, "eval_samples_per_second": 7.062, "eval_steps_per_second": 7.062, "step": 300 }, { "epoch": 0.12644029774650759, "grad_norm": 0.715740978717804, "learning_rate": 7.6485788689741e-05, "loss": 0.8381, "step": 310 }, { "epoch": 0.130519017028653, "grad_norm": 0.7031327486038208, "learning_rate": 6.976301092495556e-05, "loss": 0.8517, "step": 320 }, { "epoch": 0.1345977363107984, "grad_norm": 0.6190944910049438, "learning_rate": 6.318754473153221e-05, "loss": 0.8552, "step": 330 }, { "epoch": 0.1386764555929438, "grad_norm": 0.6262460947036743, "learning_rate": 5.679142511980175e-05, "loss": 0.9806, "step": 340 }, { "epoch": 0.14275517487508924, "grad_norm": 0.5318371653556824, "learning_rate": 5.0605813341576924e-05, "loss": 0.8857, "step": 350 }, { "epoch": 0.14275517487508924, "eval_loss": 0.8685455322265625, "eval_runtime": 173.2693, "eval_samples_per_second": 7.07, "eval_steps_per_second": 7.07, "step": 350 }, { "epoch": 0.14683389415723463, "grad_norm": 0.8289533853530884, "learning_rate": 4.46608450756656e-05, "loss": 0.9057, "step": 360 }, { "epoch": 0.15091261343938003, "grad_norm": 0.7940821051597595, "learning_rate": 3.8985483609873244e-05, "loss": 0.8912, "step": 370 }, { "epoch": 0.15499133272152543, "grad_norm": 0.6407303810119629, "learning_rate": 3.360737873477584e-05, "loss": 0.944, "step": 380 }, { "epoch": 0.15907005200367086, "grad_norm": 0.741663932800293, "learning_rate": 2.8552732036719687e-05, "loss": 0.8752, "step": 390 }, { "epoch": 0.16314877128581626, "grad_norm": 0.5255782604217529, "learning_rate": 2.3846169246326343e-05, "loss": 0.8946, "step": 400 }, { "epoch": 0.16314877128581626, "eval_loss": 0.8629406690597534, "eval_runtime": 173.2005, "eval_samples_per_second": 7.073, "eval_steps_per_second": 7.073, "step": 400 }, { "epoch": 0.16722749056796166, "grad_norm": 0.6469287872314453, "learning_rate": 1.9510620264408596e-05, "loss": 0.9102, "step": 410 }, { "epoch": 0.17130620985010706, "grad_norm": 0.7001602649688721, "learning_rate": 1.5567207449798515e-05, "loss": 0.8821, "step": 420 }, { "epoch": 0.17538492913225248, "grad_norm": 0.9216477870941162, "learning_rate": 1.2035142713338366e-05, "loss": 0.931, "step": 430 }, { "epoch": 0.17946364841439788, "grad_norm": 0.6395500898361206, "learning_rate": 8.931633919382298e-06, "loss": 0.8528, "step": 440 }, { "epoch": 0.18354236769654328, "grad_norm": 0.7768850922584534, "learning_rate": 6.2718010508108545e-06, "loss": 0.8807, "step": 450 }, { "epoch": 0.18354236769654328, "eval_loss": 0.8612557649612427, "eval_runtime": 173.4106, "eval_samples_per_second": 7.064, "eval_steps_per_second": 7.064, "step": 450 }, { "epoch": 0.18762108697868868, "grad_norm": 0.7101565003395081, "learning_rate": 4.068602545994249e-06, "loss": 0.9011, "step": 460 }, { "epoch": 0.1916998062608341, "grad_norm": 0.7115808725357056, "learning_rate": 2.332772166583208e-06, "loss": 0.9117, "step": 470 }, { "epoch": 0.1957785255429795, "grad_norm": 0.5874737501144409, "learning_rate": 1.0727667037011668e-06, "loss": 0.8661, "step": 480 }, { "epoch": 0.1998572448251249, "grad_norm": 0.6979043483734131, "learning_rate": 2.947247773079753e-07, "loss": 0.9236, "step": 490 }, { "epoch": 0.20393596410727033, "grad_norm": 0.7868551015853882, "learning_rate": 2.4369294605253166e-09, "loss": 0.813, "step": 500 }, { "epoch": 0.20393596410727033, "eval_loss": 0.8608318567276001, "eval_runtime": 173.2348, "eval_samples_per_second": 7.071, "eval_steps_per_second": 7.071, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.3472888266752e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }