{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 3000, "global_step": 88686, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.382721060821325e-05, "grad_norm": 1216.0, "learning_rate": 0.00029999661727893914, "loss": 684.0, "step": 1 }, { "epoch": 0.10148163182463973, "grad_norm": 26.75, "learning_rate": 0.000289851836817536, "loss": 46.6558, "step": 3000 }, { "epoch": 0.10148163182463973, "eval_loss": 21.720190048217773, "eval_runtime": 69.6955, "eval_samples_per_second": 1354.534, "eval_steps_per_second": 10.589, "step": 3000 }, { "epoch": 0.20296326364927947, "grad_norm": 22.875, "learning_rate": 0.000279703673635072, "loss": 32.5002, "step": 6000 }, { "epoch": 0.20296326364927947, "eval_loss": 21.16615867614746, "eval_runtime": 69.7326, "eval_samples_per_second": 1353.815, "eval_steps_per_second": 10.583, "step": 6000 }, { "epoch": 0.30444489547391923, "grad_norm": 25.5, "learning_rate": 0.00026955551045260807, "loss": 31.6686, "step": 9000 }, { "epoch": 0.30444489547391923, "eval_loss": 20.685806274414062, "eval_runtime": 69.6891, "eval_samples_per_second": 1354.659, "eval_steps_per_second": 10.59, "step": 9000 }, { "epoch": 0.40592652729855894, "grad_norm": 23.25, "learning_rate": 0.0002594073472701441, "loss": 31.3231, "step": 12000 }, { "epoch": 0.40592652729855894, "eval_loss": 20.587228775024414, "eval_runtime": 69.7199, "eval_samples_per_second": 1354.062, "eval_steps_per_second": 10.585, "step": 12000 }, { "epoch": 0.5074081591231987, "grad_norm": 22.75, "learning_rate": 0.0002492591840876801, "loss": 31.1165, "step": 15000 }, { "epoch": 0.5074081591231987, "eval_loss": 20.39651107788086, "eval_runtime": 69.6974, "eval_samples_per_second": 1354.497, "eval_steps_per_second": 10.589, "step": 15000 }, { "epoch": 0.6088897909478385, "grad_norm": 20.125, "learning_rate": 0.0002391110209052161, "loss": 31.0253, "step": 18000 }, { "epoch": 0.6088897909478385, "eval_loss": 20.266767501831055, "eval_runtime": 69.7207, "eval_samples_per_second": 1354.045, "eval_steps_per_second": 10.585, "step": 18000 }, { "epoch": 0.7103714227724782, "grad_norm": 22.875, "learning_rate": 0.00022896285772275215, "loss": 30.9446, "step": 21000 }, { "epoch": 0.7103714227724782, "eval_loss": 20.1310977935791, "eval_runtime": 69.7013, "eval_samples_per_second": 1354.422, "eval_steps_per_second": 10.588, "step": 21000 }, { "epoch": 0.8118530545971179, "grad_norm": 23.75, "learning_rate": 0.0002188146945402882, "loss": 30.9031, "step": 24000 }, { "epoch": 0.8118530545971179, "eval_loss": 20.095190048217773, "eval_runtime": 69.7102, "eval_samples_per_second": 1354.25, "eval_steps_per_second": 10.587, "step": 24000 }, { "epoch": 0.9133346864217576, "grad_norm": 20.875, "learning_rate": 0.00020866653135782423, "loss": 30.8587, "step": 27000 }, { "epoch": 0.9133346864217576, "eval_loss": 20.028963088989258, "eval_runtime": 69.6951, "eval_samples_per_second": 1354.544, "eval_steps_per_second": 10.589, "step": 27000 }, { "epoch": 1.0148163182463974, "grad_norm": 27.875, "learning_rate": 0.00019851836817536025, "loss": 30.8229, "step": 30000 }, { "epoch": 1.0148163182463974, "eval_loss": 20.114498138427734, "eval_runtime": 69.6966, "eval_samples_per_second": 1354.514, "eval_steps_per_second": 10.589, "step": 30000 }, { "epoch": 1.116297950071037, "grad_norm": 20.125, "learning_rate": 0.0001883702049928963, "loss": 30.821, "step": 33000 }, { "epoch": 1.116297950071037, "eval_loss": 19.9164981842041, "eval_runtime": 69.8254, "eval_samples_per_second": 1352.015, "eval_steps_per_second": 10.569, "step": 33000 }, { "epoch": 1.217779581895677, "grad_norm": 30.25, "learning_rate": 0.0001782220418104323, "loss": 30.7911, "step": 36000 }, { "epoch": 1.217779581895677, "eval_loss": 19.815717697143555, "eval_runtime": 69.8336, "eval_samples_per_second": 1351.856, "eval_steps_per_second": 10.568, "step": 36000 }, { "epoch": 1.3192612137203166, "grad_norm": 23.125, "learning_rate": 0.00016807387862796832, "loss": 30.784, "step": 39000 }, { "epoch": 1.3192612137203166, "eval_loss": 19.988821029663086, "eval_runtime": 69.8098, "eval_samples_per_second": 1352.318, "eval_steps_per_second": 10.572, "step": 39000 }, { "epoch": 1.4207428455449564, "grad_norm": 22.875, "learning_rate": 0.00015792571544550436, "loss": 30.7765, "step": 42000 }, { "epoch": 1.4207428455449564, "eval_loss": 19.912771224975586, "eval_runtime": 69.8089, "eval_samples_per_second": 1352.334, "eval_steps_per_second": 10.572, "step": 42000 }, { "epoch": 1.522224477369596, "grad_norm": 22.375, "learning_rate": 0.00014777755226304037, "loss": 30.774, "step": 45000 }, { "epoch": 1.522224477369596, "eval_loss": 19.893800735473633, "eval_runtime": 69.8327, "eval_samples_per_second": 1351.873, "eval_steps_per_second": 10.568, "step": 45000 }, { "epoch": 1.6237061091942357, "grad_norm": 21.375, "learning_rate": 0.0001376293890805764, "loss": 30.7461, "step": 48000 }, { "epoch": 1.6237061091942357, "eval_loss": 19.777099609375, "eval_runtime": 69.8255, "eval_samples_per_second": 1352.013, "eval_steps_per_second": 10.569, "step": 48000 }, { "epoch": 1.7251877410188756, "grad_norm": 19.5, "learning_rate": 0.00012748122589811243, "loss": 30.7454, "step": 51000 }, { "epoch": 1.7251877410188756, "eval_loss": 19.92310333251953, "eval_runtime": 69.8429, "eval_samples_per_second": 1351.677, "eval_steps_per_second": 10.567, "step": 51000 }, { "epoch": 1.8266693728435153, "grad_norm": 27.375, "learning_rate": 0.00011733306271564845, "loss": 30.724, "step": 54000 }, { "epoch": 1.8266693728435153, "eval_loss": 19.74203872680664, "eval_runtime": 69.8312, "eval_samples_per_second": 1351.903, "eval_steps_per_second": 10.568, "step": 54000 }, { "epoch": 1.928151004668155, "grad_norm": 21.0, "learning_rate": 0.00010718489953318448, "loss": 30.731, "step": 57000 }, { "epoch": 1.928151004668155, "eval_loss": 19.870426177978516, "eval_runtime": 69.839, "eval_samples_per_second": 1351.753, "eval_steps_per_second": 10.567, "step": 57000 }, { "epoch": 2.029632636492795, "grad_norm": 23.375, "learning_rate": 9.703673635072052e-05, "loss": 30.7417, "step": 60000 }, { "epoch": 2.029632636492795, "eval_loss": 19.947154998779297, "eval_runtime": 69.8406, "eval_samples_per_second": 1351.721, "eval_steps_per_second": 10.567, "step": 60000 }, { "epoch": 2.1311142683174347, "grad_norm": 20.375, "learning_rate": 8.688857316825655e-05, "loss": 30.7423, "step": 63000 }, { "epoch": 2.1311142683174347, "eval_loss": 19.931232452392578, "eval_runtime": 69.6923, "eval_samples_per_second": 1354.597, "eval_steps_per_second": 10.589, "step": 63000 }, { "epoch": 2.232595900142074, "grad_norm": 23.25, "learning_rate": 7.674040998579256e-05, "loss": 30.7266, "step": 66000 }, { "epoch": 2.232595900142074, "eval_loss": 19.900915145874023, "eval_runtime": 69.6769, "eval_samples_per_second": 1354.896, "eval_steps_per_second": 10.592, "step": 66000 }, { "epoch": 2.334077531966714, "grad_norm": 19.625, "learning_rate": 6.659224680332859e-05, "loss": 30.7288, "step": 69000 }, { "epoch": 2.334077531966714, "eval_loss": 19.946815490722656, "eval_runtime": 69.7005, "eval_samples_per_second": 1354.439, "eval_steps_per_second": 10.588, "step": 69000 }, { "epoch": 2.435559163791354, "grad_norm": 19.5, "learning_rate": 5.644408362086462e-05, "loss": 30.7245, "step": 72000 }, { "epoch": 2.435559163791354, "eval_loss": 19.871952056884766, "eval_runtime": 69.6924, "eval_samples_per_second": 1354.595, "eval_steps_per_second": 10.589, "step": 72000 }, { "epoch": 2.5370407956159937, "grad_norm": 24.875, "learning_rate": 4.629592043840065e-05, "loss": 30.7434, "step": 75000 }, { "epoch": 2.5370407956159937, "eval_loss": 19.933265686035156, "eval_runtime": 69.6748, "eval_samples_per_second": 1354.937, "eval_steps_per_second": 10.592, "step": 75000 }, { "epoch": 2.638522427440633, "grad_norm": 20.625, "learning_rate": 3.614775725593667e-05, "loss": 30.731, "step": 78000 }, { "epoch": 2.638522427440633, "eval_loss": 19.861787796020508, "eval_runtime": 69.6852, "eval_samples_per_second": 1354.735, "eval_steps_per_second": 10.59, "step": 78000 }, { "epoch": 2.740004059265273, "grad_norm": 22.375, "learning_rate": 2.59995940734727e-05, "loss": 30.7104, "step": 81000 }, { "epoch": 2.740004059265273, "eval_loss": 19.897865295410156, "eval_runtime": 69.685, "eval_samples_per_second": 1354.74, "eval_steps_per_second": 10.591, "step": 81000 }, { "epoch": 2.841485691089913, "grad_norm": 19.875, "learning_rate": 1.5851430891008727e-05, "loss": 30.7213, "step": 84000 }, { "epoch": 2.841485691089913, "eval_loss": 19.8810977935791, "eval_runtime": 69.6701, "eval_samples_per_second": 1355.029, "eval_steps_per_second": 10.593, "step": 84000 }, { "epoch": 2.9429673229145523, "grad_norm": 19.875, "learning_rate": 5.703267708544753e-06, "loss": 30.7303, "step": 87000 }, { "epoch": 2.9429673229145523, "eval_loss": 19.892446517944336, "eval_runtime": 69.6841, "eval_samples_per_second": 1354.757, "eval_steps_per_second": 10.591, "step": 87000 }, { "epoch": 3.0, "step": 88686, "total_flos": 6.274081768106557e+17, "train_loss": 31.441612825023114, "train_runtime": 27391.1958, "train_samples_per_second": 414.42, "train_steps_per_second": 3.238 } ], "logging_steps": 3000, "max_steps": 88686, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 3000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.274081768106557e+17, "train_batch_size": 128, "trial_name": null, "trial_params": null }