{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 3000,
  "global_step": 88686,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 3.382721060821325e-05,
      "grad_norm": 1352.0,
      "learning_rate": 0.00029999661727893914,
      "loss": 22.625,
      "step": 1
    },
    {
      "epoch": 0.10148163182463973,
      "grad_norm": 1.4296875,
      "learning_rate": 0.000289851836817536,
      "loss": 3.5832,
      "step": 3000
    },
    {
      "epoch": 0.10148163182463973,
      "eval_loss": 2.45703125,
      "eval_runtime": 96.0596,
      "eval_samples_per_second": 982.775,
      "eval_steps_per_second": 7.683,
      "step": 3000
    },
    {
      "epoch": 0.20296326364927947,
      "grad_norm": 1.375,
      "learning_rate": 0.000279703673635072,
      "loss": 2.9906,
      "step": 6000
    },
    {
      "epoch": 0.20296326364927947,
      "eval_loss": 2.388899564743042,
      "eval_runtime": 95.3468,
      "eval_samples_per_second": 990.122,
      "eval_steps_per_second": 7.74,
      "step": 6000
    },
    {
      "epoch": 0.30444489547391923,
      "grad_norm": 1.3125,
      "learning_rate": 0.00026955551045260807,
      "loss": 2.9367,
      "step": 9000
    },
    {
      "epoch": 0.30444489547391923,
      "eval_loss": 2.3747141361236572,
      "eval_runtime": 95.1929,
      "eval_samples_per_second": 991.723,
      "eval_steps_per_second": 7.753,
      "step": 9000
    },
    {
      "epoch": 0.40592652729855894,
      "grad_norm": 1.421875,
      "learning_rate": 0.0002594073472701441,
      "loss": 2.9115,
      "step": 12000
    },
    {
      "epoch": 0.40592652729855894,
      "eval_loss": 2.3593432903289795,
      "eval_runtime": 95.2533,
      "eval_samples_per_second": 991.095,
      "eval_steps_per_second": 7.748,
      "step": 12000
    },
    {
      "epoch": 0.5074081591231987,
      "grad_norm": 1.3828125,
      "learning_rate": 0.0002492591840876801,
      "loss": 2.891,
      "step": 15000
    },
    {
      "epoch": 0.5074081591231987,
      "eval_loss": 2.3490748405456543,
      "eval_runtime": 95.1766,
      "eval_samples_per_second": 991.893,
      "eval_steps_per_second": 7.754,
      "step": 15000
    },
    {
      "epoch": 0.6088897909478385,
      "grad_norm": 1.3671875,
      "learning_rate": 0.0002391110209052161,
      "loss": 2.8835,
      "step": 18000
    },
    {
      "epoch": 0.6088897909478385,
      "eval_loss": 2.3357362747192383,
      "eval_runtime": 95.2104,
      "eval_samples_per_second": 991.541,
      "eval_steps_per_second": 7.751,
      "step": 18000
    },
    {
      "epoch": 0.7103714227724782,
      "grad_norm": 1.40625,
      "learning_rate": 0.00022896285772275215,
      "loss": 2.8777,
      "step": 21000
    },
    {
      "epoch": 0.7103714227724782,
      "eval_loss": 2.33984375,
      "eval_runtime": 95.1033,
      "eval_samples_per_second": 992.658,
      "eval_steps_per_second": 7.76,
      "step": 21000
    },
    {
      "epoch": 0.8118530545971179,
      "grad_norm": 1.4140625,
      "learning_rate": 0.0002188146945402882,
      "loss": 2.8722,
      "step": 24000
    },
    {
      "epoch": 0.8118530545971179,
      "eval_loss": 2.335789203643799,
      "eval_runtime": 95.1796,
      "eval_samples_per_second": 991.862,
      "eval_steps_per_second": 7.754,
      "step": 24000
    },
    {
      "epoch": 0.9133346864217576,
      "grad_norm": 1.40625,
      "learning_rate": 0.00020866653135782423,
      "loss": 2.8663,
      "step": 27000
    },
    {
      "epoch": 0.9133346864217576,
      "eval_loss": 2.3275110721588135,
      "eval_runtime": 95.2734,
      "eval_samples_per_second": 990.885,
      "eval_steps_per_second": 7.746,
      "step": 27000
    },
    {
      "epoch": 1.0148163182463974,
      "grad_norm": 1.359375,
      "learning_rate": 0.00019851836817536025,
      "loss": 2.8658,
      "step": 30000
    },
    {
      "epoch": 1.0148163182463974,
      "eval_loss": 2.330390453338623,
      "eval_runtime": 95.2824,
      "eval_samples_per_second": 990.792,
      "eval_steps_per_second": 7.745,
      "step": 30000
    },
    {
      "epoch": 1.116297950071037,
      "grad_norm": 1.375,
      "learning_rate": 0.0001883702049928963,
      "loss": 2.8623,
      "step": 33000
    },
    {
      "epoch": 1.116297950071037,
      "eval_loss": 2.3300411701202393,
      "eval_runtime": 95.3514,
      "eval_samples_per_second": 990.074,
      "eval_steps_per_second": 7.74,
      "step": 33000
    },
    {
      "epoch": 1.217779581895677,
      "grad_norm": 1.40625,
      "learning_rate": 0.0001782220418104323,
      "loss": 2.8579,
      "step": 36000
    },
    {
      "epoch": 1.217779581895677,
      "eval_loss": 2.3285059928894043,
      "eval_runtime": 95.288,
      "eval_samples_per_second": 990.734,
      "eval_steps_per_second": 7.745,
      "step": 36000
    },
    {
      "epoch": 1.3192612137203166,
      "grad_norm": 1.9140625,
      "learning_rate": 0.00016807387862796832,
      "loss": 2.857,
      "step": 39000
    },
    {
      "epoch": 1.3192612137203166,
      "eval_loss": 2.3231706619262695,
      "eval_runtime": 95.3264,
      "eval_samples_per_second": 990.334,
      "eval_steps_per_second": 7.742,
      "step": 39000
    },
    {
      "epoch": 1.4207428455449564,
      "grad_norm": 1.359375,
      "learning_rate": 0.00015792571544550436,
      "loss": 2.8552,
      "step": 42000
    },
    {
      "epoch": 1.4207428455449564,
      "eval_loss": 2.3225038051605225,
      "eval_runtime": 95.2774,
      "eval_samples_per_second": 990.843,
      "eval_steps_per_second": 7.746,
      "step": 42000
    },
    {
      "epoch": 1.522224477369596,
      "grad_norm": 1.40625,
      "learning_rate": 0.00014777755226304037,
      "loss": 2.8548,
      "step": 45000
    },
    {
      "epoch": 1.522224477369596,
      "eval_loss": 2.3205666542053223,
      "eval_runtime": 95.4014,
      "eval_samples_per_second": 989.556,
      "eval_steps_per_second": 7.736,
      "step": 45000
    },
    {
      "epoch": 1.6237061091942357,
      "grad_norm": 1.4375,
      "learning_rate": 0.0001376293890805764,
      "loss": 2.8518,
      "step": 48000
    },
    {
      "epoch": 1.6237061091942357,
      "eval_loss": 2.324282169342041,
      "eval_runtime": 95.4158,
      "eval_samples_per_second": 989.407,
      "eval_steps_per_second": 7.735,
      "step": 48000
    },
    {
      "epoch": 1.7251877410188756,
      "grad_norm": 1.7578125,
      "learning_rate": 0.00012748122589811243,
      "loss": 2.8539,
      "step": 51000
    },
    {
      "epoch": 1.7251877410188756,
      "eval_loss": 2.3227896690368652,
      "eval_runtime": 95.374,
      "eval_samples_per_second": 989.84,
      "eval_steps_per_second": 7.738,
      "step": 51000
    },
    {
      "epoch": 1.8266693728435153,
      "grad_norm": 1.34375,
      "learning_rate": 0.00011733306271564845,
      "loss": 2.8483,
      "step": 54000
    },
    {
      "epoch": 1.8266693728435153,
      "eval_loss": 2.3198044300079346,
      "eval_runtime": 95.1153,
      "eval_samples_per_second": 992.532,
      "eval_steps_per_second": 7.759,
      "step": 54000
    },
    {
      "epoch": 1.928151004668155,
      "grad_norm": 1.3359375,
      "learning_rate": 0.00010718489953318448,
      "loss": 2.8512,
      "step": 57000
    },
    {
      "epoch": 1.928151004668155,
      "eval_loss": 2.321180582046509,
      "eval_runtime": 95.1519,
      "eval_samples_per_second": 992.15,
      "eval_steps_per_second": 7.756,
      "step": 57000
    },
    {
      "epoch": 2.029632636492795,
      "grad_norm": 1.3671875,
      "learning_rate": 9.703673635072052e-05,
      "loss": 2.8515,
      "step": 60000
    },
    {
      "epoch": 2.029632636492795,
      "eval_loss": 2.3204078674316406,
      "eval_runtime": 95.1797,
      "eval_samples_per_second": 991.861,
      "eval_steps_per_second": 7.754,
      "step": 60000
    },
    {
      "epoch": 2.1311142683174347,
      "grad_norm": 1.390625,
      "learning_rate": 8.688857316825655e-05,
      "loss": 2.8512,
      "step": 63000
    },
    {
      "epoch": 2.1311142683174347,
      "eval_loss": 2.3204712867736816,
      "eval_runtime": 95.1755,
      "eval_samples_per_second": 991.905,
      "eval_steps_per_second": 7.754,
      "step": 63000
    },
    {
      "epoch": 2.232595900142074,
      "grad_norm": 1.484375,
      "learning_rate": 7.674040998579256e-05,
      "loss": 2.8492,
      "step": 66000
    },
    {
      "epoch": 2.232595900142074,
      "eval_loss": 2.3218369483947754,
      "eval_runtime": 95.2213,
      "eval_samples_per_second": 991.428,
      "eval_steps_per_second": 7.75,
      "step": 66000
    },
    {
      "epoch": 2.334077531966714,
      "grad_norm": 1.4296875,
      "learning_rate": 6.659224680332859e-05,
      "loss": 2.851,
      "step": 69000
    },
    {
      "epoch": 2.334077531966714,
      "eval_loss": 2.3220698833465576,
      "eval_runtime": 95.1826,
      "eval_samples_per_second": 991.831,
      "eval_steps_per_second": 7.754,
      "step": 69000
    },
    {
      "epoch": 2.435559163791354,
      "grad_norm": 1.46875,
      "learning_rate": 5.644408362086462e-05,
      "loss": 2.8497,
      "step": 72000
    },
    {
      "epoch": 2.435559163791354,
      "eval_loss": 2.320767641067505,
      "eval_runtime": 95.2504,
      "eval_samples_per_second": 991.125,
      "eval_steps_per_second": 7.748,
      "step": 72000
    },
    {
      "epoch": 2.5370407956159937,
      "grad_norm": 1.46875,
      "learning_rate": 4.629592043840065e-05,
      "loss": 2.848,
      "step": 75000
    },
    {
      "epoch": 2.5370407956159937,
      "eval_loss": 2.3203125,
      "eval_runtime": 95.295,
      "eval_samples_per_second": 990.661,
      "eval_steps_per_second": 7.744,
      "step": 75000
    },
    {
      "epoch": 2.638522427440633,
      "grad_norm": 1.2265625,
      "learning_rate": 3.614775725593667e-05,
      "loss": 2.852,
      "step": 78000
    },
    {
      "epoch": 2.638522427440633,
      "eval_loss": 2.319963216781616,
      "eval_runtime": 95.168,
      "eval_samples_per_second": 991.982,
      "eval_steps_per_second": 7.755,
      "step": 78000
    },
    {
      "epoch": 2.740004059265273,
      "grad_norm": 1.328125,
      "learning_rate": 2.59995940734727e-05,
      "loss": 2.8483,
      "step": 81000
    },
    {
      "epoch": 2.740004059265273,
      "eval_loss": 2.321169853210449,
      "eval_runtime": 95.17,
      "eval_samples_per_second": 991.962,
      "eval_steps_per_second": 7.755,
      "step": 81000
    },
    {
      "epoch": 2.841485691089913,
      "grad_norm": 1.4296875,
      "learning_rate": 1.5851430891008727e-05,
      "loss": 2.85,
      "step": 84000
    },
    {
      "epoch": 2.841485691089913,
      "eval_loss": 2.3206405639648438,
      "eval_runtime": 95.2006,
      "eval_samples_per_second": 991.642,
      "eval_steps_per_second": 7.752,
      "step": 84000
    },
    {
      "epoch": 2.9429673229145523,
      "grad_norm": 1.375,
      "learning_rate": 5.703267708544753e-06,
      "loss": 2.8503,
      "step": 87000
    },
    {
      "epoch": 2.9429673229145523,
      "eval_loss": 2.320661783218384,
      "eval_runtime": 95.1542,
      "eval_samples_per_second": 992.127,
      "eval_steps_per_second": 7.756,
      "step": 87000
    },
    {
      "epoch": 3.0,
      "step": 88686,
      "total_flos": 1.3129555583125094e+18,
      "train_loss": 2.891493231738944,
      "train_runtime": 37678.5959,
      "train_samples_per_second": 301.271,
      "train_steps_per_second": 2.354
    }
  ],
  "logging_steps": 3000,
  "max_steps": 88686,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 3000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3129555583125094e+18,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": null
}