{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 10, "global_step": 141, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10666666666666667, "grad_norm": 0.9061112999916077, "learning_rate": 5.333333333333333e-05, "loss": 0.3768, "mean_token_accuracy": 0.8932562351226807, "num_tokens": 412634.0, "step": 5 }, { "epoch": 0.21333333333333335, "grad_norm": 0.13824236392974854, "learning_rate": 0.00012, "loss": 0.1635, "mean_token_accuracy": 0.9263631403446198, "num_tokens": 825178.0, "step": 10 }, { "epoch": 0.21333333333333335, "eval_loss": 0.13056232035160065, "eval_mean_token_accuracy": 0.931514581044515, "eval_num_tokens": 825178.0, "eval_runtime": 5.6975, "eval_samples_per_second": 42.124, "eval_steps_per_second": 2.633, "step": 10 }, { "epoch": 0.32, "grad_norm": 0.09409969300031662, "learning_rate": 0.0001866666666666667, "loss": 0.13, "mean_token_accuracy": 0.9291968494653702, "num_tokens": 1237729.0, "step": 15 }, { "epoch": 0.4266666666666667, "grad_norm": 0.05856617912650108, "learning_rate": 0.00019950307753654017, "loss": 0.1253, "mean_token_accuracy": 0.9320971354842186, "num_tokens": 1650270.0, "step": 20 }, { "epoch": 0.4266666666666667, "eval_loss": 0.12424330413341522, "eval_mean_token_accuracy": 0.9342584053675334, "eval_num_tokens": 1650270.0, "eval_runtime": 5.7775, "eval_samples_per_second": 41.541, "eval_steps_per_second": 2.596, "step": 20 }, { "epoch": 0.5333333333333333, "grad_norm": 0.031423866748809814, "learning_rate": 0.00019749279121818235, "loss": 0.1218, "mean_token_accuracy": 0.9353153184056282, "num_tokens": 2062754.0, "step": 25 }, { "epoch": 0.64, "grad_norm": 0.01414029486477375, "learning_rate": 0.00019396926207859084, "loss": 0.1205, "mean_token_accuracy": 0.9350377127528191, "num_tokens": 2475201.0, "step": 30 }, { "epoch": 0.64, "eval_loss": 0.11949822306632996, "eval_mean_token_accuracy": 0.9356223146120707, "eval_num_tokens": 2475201.0, "eval_runtime": 5.6121, "eval_samples_per_second": 42.765, "eval_steps_per_second": 2.673, "step": 30 }, { "epoch": 0.7466666666666667, "grad_norm": 0.019041303545236588, "learning_rate": 0.0001889871808811469, "loss": 0.119, "mean_token_accuracy": 0.935609245300293, "num_tokens": 2887864.0, "step": 35 }, { "epoch": 0.8533333333333334, "grad_norm": 0.019156765192747116, "learning_rate": 0.0001826238774315995, "loss": 0.1186, "mean_token_accuracy": 0.9359540060162544, "num_tokens": 3300296.0, "step": 40 }, { "epoch": 0.8533333333333334, "eval_loss": 0.11780127137899399, "eval_mean_token_accuracy": 0.936646310488383, "eval_num_tokens": 3300296.0, "eval_runtime": 5.5646, "eval_samples_per_second": 43.13, "eval_steps_per_second": 2.696, "step": 40 }, { "epoch": 0.96, "grad_norm": 0.028364377096295357, "learning_rate": 0.00017497812029677344, "loss": 0.1174, "mean_token_accuracy": 0.9368061780929565, "num_tokens": 3712869.0, "step": 45 }, { "epoch": 1.064, "grad_norm": 0.025021756067872047, "learning_rate": 0.00016616858375968595, "loss": 0.1161, "mean_token_accuracy": 0.937648336092631, "num_tokens": 4114987.0, "step": 50 }, { "epoch": 1.064, "eval_loss": 0.11562421917915344, "eval_mean_token_accuracy": 0.9389559427897135, "eval_num_tokens": 4114987.0, "eval_runtime": 5.6496, "eval_samples_per_second": 42.481, "eval_steps_per_second": 2.655, "step": 50 }, { "epoch": 1.1706666666666667, "grad_norm": 0.02837551198899746, "learning_rate": 0.0001563320058063622, "loss": 0.1148, "mean_token_accuracy": 0.9386155918240547, "num_tokens": 4527414.0, "step": 55 }, { "epoch": 1.2773333333333334, "grad_norm": 0.025281216949224472, "learning_rate": 0.0001456210657353163, "loss": 0.1137, "mean_token_accuracy": 0.9398613184690475, "num_tokens": 4939842.0, "step": 60 }, { "epoch": 1.2773333333333334, "eval_loss": 0.11369061470031738, "eval_mean_token_accuracy": 0.9391787648200989, "eval_num_tokens": 4939842.0, "eval_runtime": 5.5173, "eval_samples_per_second": 43.499, "eval_steps_per_second": 2.719, "step": 60 }, { "epoch": 1.384, "grad_norm": 0.027760421857237816, "learning_rate": 0.00013420201433256689, "loss": 0.1137, "mean_token_accuracy": 0.9399674132466316, "num_tokens": 5352381.0, "step": 65 }, { "epoch": 1.4906666666666666, "grad_norm": 0.0254357922822237, "learning_rate": 0.00012225209339563145, "loss": 0.1125, "mean_token_accuracy": 0.9407781735062599, "num_tokens": 5764966.0, "step": 70 }, { "epoch": 1.4906666666666666, "eval_loss": 0.11183393746614456, "eval_mean_token_accuracy": 0.9411992589632671, "eval_num_tokens": 5764966.0, "eval_runtime": 5.6195, "eval_samples_per_second": 42.709, "eval_steps_per_second": 2.669, "step": 70 }, { "epoch": 1.5973333333333333, "grad_norm": 0.036309123039245605, "learning_rate": 0.00010995678465958168, "loss": 0.1115, "mean_token_accuracy": 0.9412863209843636, "num_tokens": 6177504.0, "step": 75 }, { "epoch": 1.704, "grad_norm": 0.03966047242283821, "learning_rate": 9.750693082619273e-05, "loss": 0.1112, "mean_token_accuracy": 0.9416596934199333, "num_tokens": 6590159.0, "step": 80 }, { "epoch": 1.704, "eval_loss": 0.11047063767910004, "eval_mean_token_accuracy": 0.9423920075098674, "eval_num_tokens": 6590159.0, "eval_runtime": 6.1817, "eval_samples_per_second": 38.824, "eval_steps_per_second": 2.427, "step": 80 }, { "epoch": 1.8106666666666666, "grad_norm": 0.04924263805150986, "learning_rate": 8.509577338238255e-05, "loss": 0.1106, "mean_token_accuracy": 0.9417484939098358, "num_tokens": 7002597.0, "step": 85 }, { "epoch": 1.9173333333333333, "grad_norm": 0.03291332349181175, "learning_rate": 7.291595318569951e-05, "loss": 0.1099, "mean_token_accuracy": 0.9427696943283081, "num_tokens": 7415216.0, "step": 90 }, { "epoch": 1.9173333333333333, "eval_loss": 0.10944115370512009, "eval_mean_token_accuracy": 0.9429703712463379, "eval_num_tokens": 7415216.0, "eval_runtime": 5.7041, "eval_samples_per_second": 42.075, "eval_steps_per_second": 2.63, "step": 90 }, { "epoch": 2.021333333333333, "grad_norm": 0.046362534165382385, "learning_rate": 6.115652037253053e-05, "loss": 0.1089, "mean_token_accuracy": 0.9430881432997875, "num_tokens": 7817416.0, "step": 95 }, { "epoch": 2.128, "grad_norm": 0.042689789086580276, "learning_rate": 5.000000000000002e-05, "loss": 0.1088, "mean_token_accuracy": 0.9429372027516365, "num_tokens": 8229908.0, "step": 100 }, { "epoch": 2.128, "eval_loss": 0.1087011843919754, "eval_mean_token_accuracy": 0.9434430122375488, "eval_num_tokens": 8229908.0, "eval_runtime": 5.3953, "eval_samples_per_second": 44.483, "eval_steps_per_second": 2.78, "step": 100 }, { "epoch": 2.2346666666666666, "grad_norm": 0.04179241508245468, "learning_rate": 3.961955896745224e-05, "loss": 0.1084, "mean_token_accuracy": 0.9430723547935486, "num_tokens": 8642398.0, "step": 105 }, { "epoch": 2.3413333333333335, "grad_norm": 0.043695490807294846, "learning_rate": 3.0176318191392726e-05, "loss": 0.1075, "mean_token_accuracy": 0.9439892217516899, "num_tokens": 9054924.0, "step": 110 }, { "epoch": 2.3413333333333335, "eval_loss": 0.10814117640256882, "eval_mean_token_accuracy": 0.9440598924954732, "eval_num_tokens": 9054924.0, "eval_runtime": 8.1225, "eval_samples_per_second": 29.547, "eval_steps_per_second": 1.847, "step": 110 }, { "epoch": 2.448, "grad_norm": 0.04274023696780205, "learning_rate": 2.181685175319702e-05, "loss": 0.1084, "mean_token_accuracy": 0.9428785502910614, "num_tokens": 9467498.0, "step": 115 }, { "epoch": 2.554666666666667, "grad_norm": 0.041405290365219116, "learning_rate": 1.467091183678444e-05, "loss": 0.108, "mean_token_accuracy": 0.9433182254433632, "num_tokens": 9880034.0, "step": 120 }, { "epoch": 2.554666666666667, "eval_loss": 0.10807047039270401, "eval_mean_token_accuracy": 0.94312850634257, "eval_num_tokens": 9880034.0, "eval_runtime": 6.2679, "eval_samples_per_second": 38.29, "eval_steps_per_second": 2.393, "step": 120 }, { "epoch": 2.6613333333333333, "grad_norm": 0.05150838941335678, "learning_rate": 8.849414768832687e-06, "loss": 0.1082, "mean_token_accuracy": 0.9431885123252869, "num_tokens": 10292635.0, "step": 125 }, { "epoch": 2.768, "grad_norm": 0.04310686141252518, "learning_rate": 4.442719421385922e-06, "loss": 0.1076, "mean_token_accuracy": 0.9435940250754357, "num_tokens": 10705122.0, "step": 130 }, { "epoch": 2.768, "eval_loss": 0.10796218365430832, "eval_mean_token_accuracy": 0.9437440276145935, "eval_num_tokens": 10705122.0, "eval_runtime": 5.7398, "eval_samples_per_second": 41.813, "eval_steps_per_second": 2.613, "step": 130 }, { "epoch": 2.8746666666666667, "grad_norm": 0.03785852715373039, "learning_rate": 1.5192246987791981e-06, "loss": 0.1077, "mean_token_accuracy": 0.9437474936246872, "num_tokens": 11117662.0, "step": 135 }, { "epoch": 2.981333333333333, "grad_norm": 0.04199996963143349, "learning_rate": 1.2430787810776555e-07, "loss": 0.1074, "mean_token_accuracy": 0.9436563104391098, "num_tokens": 11530180.0, "step": 140 }, { "epoch": 2.981333333333333, "eval_loss": 0.10795663297176361, "eval_mean_token_accuracy": 0.9438234766324362, "eval_num_tokens": 11530180.0, "eval_runtime": 5.9941, "eval_samples_per_second": 40.039, "eval_steps_per_second": 2.502, "step": 140 }, { "epoch": 3.0, "mean_token_accuracy": 0.9443485651697431, "num_tokens": 11602401.0, "step": 141, "total_flos": 5.846973469153034e+17, "train_loss": 0.12444489327728325, "train_runtime": 1956.6734, "train_samples_per_second": 9.199, "train_steps_per_second": 0.072 } ], "logging_steps": 5, "max_steps": 141, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.846973469153034e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }