{ "best_metric": 1.5053555965423584, "best_model_checkpoint": "./tmp/clm-gpt2/checkpoint-11500", "epoch": 2.9996095275283094, "eval_steps": 500, "global_step": 11523, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.13015749056358195, "grad_norm": 0.3655782639980316, "learning_rate": 0.0015, "loss": 2.4536, "step": 500 }, { "epoch": 0.13015749056358195, "eval_accuracy": 0.4954853308958881, "eval_loss": 2.1316182613372803, "eval_runtime": 630.0742, "eval_samples_per_second": 24.442, "eval_steps_per_second": 6.11, "step": 500 }, { "epoch": 0.2603149811271639, "grad_norm": 0.2795959711074829, "learning_rate": 0.003, "loss": 2.1054, "step": 1000 }, { "epoch": 0.2603149811271639, "eval_accuracy": 0.5220788107298371, "eval_loss": 2.0124123096466064, "eval_runtime": 571.1208, "eval_samples_per_second": 26.965, "eval_steps_per_second": 6.741, "step": 1000 }, { "epoch": 0.3904724716907458, "grad_norm": 0.2532959282398224, "learning_rate": 0.0028574550983559823, "loss": 1.9756, "step": 1500 }, { "epoch": 0.3904724716907458, "eval_accuracy": 0.5452574551548158, "eval_loss": 1.902522087097168, "eval_runtime": 521.1164, "eval_samples_per_second": 29.552, "eval_steps_per_second": 7.388, "step": 1500 }, { "epoch": 0.5206299622543278, "grad_norm": 0.1999560445547104, "learning_rate": 0.002714910196711964, "loss": 1.8863, "step": 2000 }, { "epoch": 0.5206299622543278, "eval_accuracy": 0.5600627134351475, "eval_loss": 1.836737871170044, "eval_runtime": 521.7562, "eval_samples_per_second": 29.516, "eval_steps_per_second": 7.379, "step": 2000 }, { "epoch": 0.6507874528179096, "grad_norm": 0.18795448541641235, "learning_rate": 0.0025723652950679463, "loss": 1.8283, "step": 2500 }, { "epoch": 0.6507874528179096, "eval_accuracy": 0.5686439806527783, "eval_loss": 1.7926692962646484, "eval_runtime": 521.5355, "eval_samples_per_second": 29.528, "eval_steps_per_second": 7.382, "step": 2500 }, { "epoch": 0.7809449433814916, "grad_norm": 0.21605215966701508, "learning_rate": 0.0024298203934239285, "loss": 1.7893, "step": 3000 }, { "epoch": 0.7809449433814916, "eval_accuracy": 0.5759645681786444, "eval_loss": 1.758531093597412, "eval_runtime": 521.9502, "eval_samples_per_second": 29.505, "eval_steps_per_second": 7.376, "step": 3000 }, { "epoch": 0.9111024339450735, "grad_norm": 0.17780712246894836, "learning_rate": 0.0022872754917799107, "loss": 1.7555, "step": 3500 }, { "epoch": 0.9111024339450735, "eval_accuracy": 0.581453644107603, "eval_loss": 1.732818841934204, "eval_runtime": 521.3914, "eval_samples_per_second": 29.536, "eval_steps_per_second": 7.384, "step": 3500 }, { "epoch": 1.0412599245086556, "grad_norm": 0.17426873743534088, "learning_rate": 0.002144730590135893, "loss": 1.7143, "step": 4000 }, { "epoch": 1.0412599245086556, "eval_accuracy": 0.5881837859110587, "eval_loss": 1.701590895652771, "eval_runtime": 522.4527, "eval_samples_per_second": 29.476, "eval_steps_per_second": 7.369, "step": 4000 }, { "epoch": 1.1714174150722374, "grad_norm": 0.1862766295671463, "learning_rate": 0.002002185688491875, "loss": 1.6697, "step": 4500 }, { "epoch": 1.1714174150722374, "eval_accuracy": 0.592953625065062, "eval_loss": 1.6812903881072998, "eval_runtime": 520.916, "eval_samples_per_second": 29.563, "eval_steps_per_second": 7.391, "step": 4500 }, { "epoch": 1.3015749056358192, "grad_norm": 0.17118503153324127, "learning_rate": 0.0018596407868478571, "loss": 1.6584, "step": 5000 }, { "epoch": 1.3015749056358192, "eval_accuracy": 0.5972079191580658, "eval_loss": 1.6614623069763184, "eval_runtime": 520.2098, "eval_samples_per_second": 29.603, "eval_steps_per_second": 7.401, "step": 5000 }, { "epoch": 1.4317323961994013, "grad_norm": 0.17990975081920624, "learning_rate": 0.0017170958852038393, "loss": 1.6438, "step": 5500 }, { "epoch": 1.4317323961994013, "eval_accuracy": 0.6009242614667835, "eval_loss": 1.6421809196472168, "eval_runtime": 520.0082, "eval_samples_per_second": 29.615, "eval_steps_per_second": 7.404, "step": 5500 }, { "epoch": 1.5618898867629833, "grad_norm": 0.1535763144493103, "learning_rate": 0.0015745509835598213, "loss": 1.6184, "step": 6000 }, { "epoch": 1.5618898867629833, "eval_accuracy": 0.6048871412067893, "eval_loss": 1.6235625743865967, "eval_runtime": 518.9862, "eval_samples_per_second": 29.673, "eval_steps_per_second": 7.418, "step": 6000 }, { "epoch": 1.6920473773265652, "grad_norm": 0.16722074151039124, "learning_rate": 0.0014320060819158036, "loss": 1.6086, "step": 6500 }, { "epoch": 1.6920473773265652, "eval_accuracy": 0.6081556664254611, "eval_loss": 1.6102288961410522, "eval_runtime": 519.7651, "eval_samples_per_second": 29.629, "eval_steps_per_second": 7.407, "step": 6500 }, { "epoch": 1.822204867890147, "grad_norm": 0.1687261015176773, "learning_rate": 0.0012894611802717856, "loss": 1.5882, "step": 7000 }, { "epoch": 1.822204867890147, "eval_accuracy": 0.6113670005458862, "eval_loss": 1.5938153266906738, "eval_runtime": 518.0575, "eval_samples_per_second": 29.726, "eval_steps_per_second": 7.432, "step": 7000 }, { "epoch": 1.9523623584537289, "grad_norm": 0.17714297771453857, "learning_rate": 0.0011469162786277678, "loss": 1.5719, "step": 7500 }, { "epoch": 1.9523623584537289, "eval_accuracy": 0.6148032270759544, "eval_loss": 1.578608751296997, "eval_runtime": 518.0664, "eval_samples_per_second": 29.726, "eval_steps_per_second": 7.431, "step": 7500 }, { "epoch": 2.082519849017311, "grad_norm": 0.16494958102703094, "learning_rate": 0.00100437137698375, "loss": 1.5272, "step": 8000 }, { "epoch": 2.082519849017311, "eval_accuracy": 0.617450838506557, "eval_loss": 1.5717881917953491, "eval_runtime": 518.5565, "eval_samples_per_second": 29.698, "eval_steps_per_second": 7.424, "step": 8000 }, { "epoch": 2.212677339580893, "grad_norm": 0.14182300865650177, "learning_rate": 0.0008618264753397321, "loss": 1.4971, "step": 8500 }, { "epoch": 2.212677339580893, "eval_accuracy": 0.6203893564890632, "eval_loss": 1.5592997074127197, "eval_runtime": 517.6299, "eval_samples_per_second": 29.751, "eval_steps_per_second": 7.438, "step": 8500 }, { "epoch": 2.342834830144475, "grad_norm": 0.15439514815807343, "learning_rate": 0.0007192815736957142, "loss": 1.4893, "step": 9000 }, { "epoch": 2.342834830144475, "eval_accuracy": 0.6227424432849653, "eval_loss": 1.5474613904953003, "eval_runtime": 517.7895, "eval_samples_per_second": 29.742, "eval_steps_per_second": 7.435, "step": 9000 }, { "epoch": 2.4729923207080566, "grad_norm": 0.1444994956254959, "learning_rate": 0.0005767366720516963, "loss": 1.4808, "step": 9500 }, { "epoch": 2.4729923207080566, "eval_accuracy": 0.6250960378819617, "eval_loss": 1.5381561517715454, "eval_runtime": 518.2423, "eval_samples_per_second": 29.716, "eval_steps_per_second": 7.429, "step": 9500 }, { "epoch": 2.6031498112716385, "grad_norm": 0.14975884556770325, "learning_rate": 0.00043419177040767847, "loss": 1.4689, "step": 10000 }, { "epoch": 2.6031498112716385, "eval_accuracy": 0.6274774980640083, "eval_loss": 1.5274103879928589, "eval_runtime": 518.577, "eval_samples_per_second": 29.697, "eval_steps_per_second": 7.424, "step": 10000 }, { "epoch": 2.7333073018352207, "grad_norm": 0.14258165657520294, "learning_rate": 0.00029164686876366057, "loss": 1.4572, "step": 10500 }, { "epoch": 2.7333073018352207, "eval_accuracy": 0.6297954196341293, "eval_loss": 1.5169227123260498, "eval_runtime": 518.5892, "eval_samples_per_second": 29.696, "eval_steps_per_second": 7.424, "step": 10500 }, { "epoch": 2.8634647923988026, "grad_norm": 0.14571569859981537, "learning_rate": 0.00014910196711964268, "loss": 1.4488, "step": 11000 }, { "epoch": 2.8634647923988026, "eval_accuracy": 0.6314827791953892, "eval_loss": 1.5106253623962402, "eval_runtime": 519.3798, "eval_samples_per_second": 29.651, "eval_steps_per_second": 7.413, "step": 11000 }, { "epoch": 2.9936222829623844, "grad_norm": 0.15030799806118011, "learning_rate": 6.557065475624823e-06, "loss": 1.4465, "step": 11500 }, { "epoch": 2.9936222829623844, "eval_accuracy": 0.6325420522781227, "eval_loss": 1.5053555965423584, "eval_runtime": 518.7173, "eval_samples_per_second": 29.689, "eval_steps_per_second": 7.422, "step": 11500 }, { "epoch": 2.9996095275283094, "step": 11523, "total_flos": 9.6346733543424e+16, "train_loss": 1.6814149037051365, "train_runtime": 28276.4686, "train_samples_per_second": 6.521, "train_steps_per_second": 0.408 } ], "logging_steps": 500, "max_steps": 11523, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.6346733543424e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }