lr2.0e-04_assistant_only_lora / trainer_state.json
Gabe-Thomp's picture
Model save
8ff8074 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 10,
"global_step": 141,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.10666666666666667,
"grad_norm": 0.9061112999916077,
"learning_rate": 5.333333333333333e-05,
"loss": 0.3768,
"mean_token_accuracy": 0.8932562351226807,
"num_tokens": 412634.0,
"step": 5
},
{
"epoch": 0.21333333333333335,
"grad_norm": 0.13824236392974854,
"learning_rate": 0.00012,
"loss": 0.1635,
"mean_token_accuracy": 0.9263631403446198,
"num_tokens": 825178.0,
"step": 10
},
{
"epoch": 0.21333333333333335,
"eval_loss": 0.13056232035160065,
"eval_mean_token_accuracy": 0.931514581044515,
"eval_num_tokens": 825178.0,
"eval_runtime": 5.6975,
"eval_samples_per_second": 42.124,
"eval_steps_per_second": 2.633,
"step": 10
},
{
"epoch": 0.32,
"grad_norm": 0.09409969300031662,
"learning_rate": 0.0001866666666666667,
"loss": 0.13,
"mean_token_accuracy": 0.9291968494653702,
"num_tokens": 1237729.0,
"step": 15
},
{
"epoch": 0.4266666666666667,
"grad_norm": 0.05856617912650108,
"learning_rate": 0.00019950307753654017,
"loss": 0.1253,
"mean_token_accuracy": 0.9320971354842186,
"num_tokens": 1650270.0,
"step": 20
},
{
"epoch": 0.4266666666666667,
"eval_loss": 0.12424330413341522,
"eval_mean_token_accuracy": 0.9342584053675334,
"eval_num_tokens": 1650270.0,
"eval_runtime": 5.7775,
"eval_samples_per_second": 41.541,
"eval_steps_per_second": 2.596,
"step": 20
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.031423866748809814,
"learning_rate": 0.00019749279121818235,
"loss": 0.1218,
"mean_token_accuracy": 0.9353153184056282,
"num_tokens": 2062754.0,
"step": 25
},
{
"epoch": 0.64,
"grad_norm": 0.01414029486477375,
"learning_rate": 0.00019396926207859084,
"loss": 0.1205,
"mean_token_accuracy": 0.9350377127528191,
"num_tokens": 2475201.0,
"step": 30
},
{
"epoch": 0.64,
"eval_loss": 0.11949822306632996,
"eval_mean_token_accuracy": 0.9356223146120707,
"eval_num_tokens": 2475201.0,
"eval_runtime": 5.6121,
"eval_samples_per_second": 42.765,
"eval_steps_per_second": 2.673,
"step": 30
},
{
"epoch": 0.7466666666666667,
"grad_norm": 0.019041303545236588,
"learning_rate": 0.0001889871808811469,
"loss": 0.119,
"mean_token_accuracy": 0.935609245300293,
"num_tokens": 2887864.0,
"step": 35
},
{
"epoch": 0.8533333333333334,
"grad_norm": 0.019156765192747116,
"learning_rate": 0.0001826238774315995,
"loss": 0.1186,
"mean_token_accuracy": 0.9359540060162544,
"num_tokens": 3300296.0,
"step": 40
},
{
"epoch": 0.8533333333333334,
"eval_loss": 0.11780127137899399,
"eval_mean_token_accuracy": 0.936646310488383,
"eval_num_tokens": 3300296.0,
"eval_runtime": 5.5646,
"eval_samples_per_second": 43.13,
"eval_steps_per_second": 2.696,
"step": 40
},
{
"epoch": 0.96,
"grad_norm": 0.028364377096295357,
"learning_rate": 0.00017497812029677344,
"loss": 0.1174,
"mean_token_accuracy": 0.9368061780929565,
"num_tokens": 3712869.0,
"step": 45
},
{
"epoch": 1.064,
"grad_norm": 0.025021756067872047,
"learning_rate": 0.00016616858375968595,
"loss": 0.1161,
"mean_token_accuracy": 0.937648336092631,
"num_tokens": 4114987.0,
"step": 50
},
{
"epoch": 1.064,
"eval_loss": 0.11562421917915344,
"eval_mean_token_accuracy": 0.9389559427897135,
"eval_num_tokens": 4114987.0,
"eval_runtime": 5.6496,
"eval_samples_per_second": 42.481,
"eval_steps_per_second": 2.655,
"step": 50
},
{
"epoch": 1.1706666666666667,
"grad_norm": 0.02837551198899746,
"learning_rate": 0.0001563320058063622,
"loss": 0.1148,
"mean_token_accuracy": 0.9386155918240547,
"num_tokens": 4527414.0,
"step": 55
},
{
"epoch": 1.2773333333333334,
"grad_norm": 0.025281216949224472,
"learning_rate": 0.0001456210657353163,
"loss": 0.1137,
"mean_token_accuracy": 0.9398613184690475,
"num_tokens": 4939842.0,
"step": 60
},
{
"epoch": 1.2773333333333334,
"eval_loss": 0.11369061470031738,
"eval_mean_token_accuracy": 0.9391787648200989,
"eval_num_tokens": 4939842.0,
"eval_runtime": 5.5173,
"eval_samples_per_second": 43.499,
"eval_steps_per_second": 2.719,
"step": 60
},
{
"epoch": 1.384,
"grad_norm": 0.027760421857237816,
"learning_rate": 0.00013420201433256689,
"loss": 0.1137,
"mean_token_accuracy": 0.9399674132466316,
"num_tokens": 5352381.0,
"step": 65
},
{
"epoch": 1.4906666666666666,
"grad_norm": 0.0254357922822237,
"learning_rate": 0.00012225209339563145,
"loss": 0.1125,
"mean_token_accuracy": 0.9407781735062599,
"num_tokens": 5764966.0,
"step": 70
},
{
"epoch": 1.4906666666666666,
"eval_loss": 0.11183393746614456,
"eval_mean_token_accuracy": 0.9411992589632671,
"eval_num_tokens": 5764966.0,
"eval_runtime": 5.6195,
"eval_samples_per_second": 42.709,
"eval_steps_per_second": 2.669,
"step": 70
},
{
"epoch": 1.5973333333333333,
"grad_norm": 0.036309123039245605,
"learning_rate": 0.00010995678465958168,
"loss": 0.1115,
"mean_token_accuracy": 0.9412863209843636,
"num_tokens": 6177504.0,
"step": 75
},
{
"epoch": 1.704,
"grad_norm": 0.03966047242283821,
"learning_rate": 9.750693082619273e-05,
"loss": 0.1112,
"mean_token_accuracy": 0.9416596934199333,
"num_tokens": 6590159.0,
"step": 80
},
{
"epoch": 1.704,
"eval_loss": 0.11047063767910004,
"eval_mean_token_accuracy": 0.9423920075098674,
"eval_num_tokens": 6590159.0,
"eval_runtime": 6.1817,
"eval_samples_per_second": 38.824,
"eval_steps_per_second": 2.427,
"step": 80
},
{
"epoch": 1.8106666666666666,
"grad_norm": 0.04924263805150986,
"learning_rate": 8.509577338238255e-05,
"loss": 0.1106,
"mean_token_accuracy": 0.9417484939098358,
"num_tokens": 7002597.0,
"step": 85
},
{
"epoch": 1.9173333333333333,
"grad_norm": 0.03291332349181175,
"learning_rate": 7.291595318569951e-05,
"loss": 0.1099,
"mean_token_accuracy": 0.9427696943283081,
"num_tokens": 7415216.0,
"step": 90
},
{
"epoch": 1.9173333333333333,
"eval_loss": 0.10944115370512009,
"eval_mean_token_accuracy": 0.9429703712463379,
"eval_num_tokens": 7415216.0,
"eval_runtime": 5.7041,
"eval_samples_per_second": 42.075,
"eval_steps_per_second": 2.63,
"step": 90
},
{
"epoch": 2.021333333333333,
"grad_norm": 0.046362534165382385,
"learning_rate": 6.115652037253053e-05,
"loss": 0.1089,
"mean_token_accuracy": 0.9430881432997875,
"num_tokens": 7817416.0,
"step": 95
},
{
"epoch": 2.128,
"grad_norm": 0.042689789086580276,
"learning_rate": 5.000000000000002e-05,
"loss": 0.1088,
"mean_token_accuracy": 0.9429372027516365,
"num_tokens": 8229908.0,
"step": 100
},
{
"epoch": 2.128,
"eval_loss": 0.1087011843919754,
"eval_mean_token_accuracy": 0.9434430122375488,
"eval_num_tokens": 8229908.0,
"eval_runtime": 5.3953,
"eval_samples_per_second": 44.483,
"eval_steps_per_second": 2.78,
"step": 100
},
{
"epoch": 2.2346666666666666,
"grad_norm": 0.04179241508245468,
"learning_rate": 3.961955896745224e-05,
"loss": 0.1084,
"mean_token_accuracy": 0.9430723547935486,
"num_tokens": 8642398.0,
"step": 105
},
{
"epoch": 2.3413333333333335,
"grad_norm": 0.043695490807294846,
"learning_rate": 3.0176318191392726e-05,
"loss": 0.1075,
"mean_token_accuracy": 0.9439892217516899,
"num_tokens": 9054924.0,
"step": 110
},
{
"epoch": 2.3413333333333335,
"eval_loss": 0.10814117640256882,
"eval_mean_token_accuracy": 0.9440598924954732,
"eval_num_tokens": 9054924.0,
"eval_runtime": 8.1225,
"eval_samples_per_second": 29.547,
"eval_steps_per_second": 1.847,
"step": 110
},
{
"epoch": 2.448,
"grad_norm": 0.04274023696780205,
"learning_rate": 2.181685175319702e-05,
"loss": 0.1084,
"mean_token_accuracy": 0.9428785502910614,
"num_tokens": 9467498.0,
"step": 115
},
{
"epoch": 2.554666666666667,
"grad_norm": 0.041405290365219116,
"learning_rate": 1.467091183678444e-05,
"loss": 0.108,
"mean_token_accuracy": 0.9433182254433632,
"num_tokens": 9880034.0,
"step": 120
},
{
"epoch": 2.554666666666667,
"eval_loss": 0.10807047039270401,
"eval_mean_token_accuracy": 0.94312850634257,
"eval_num_tokens": 9880034.0,
"eval_runtime": 6.2679,
"eval_samples_per_second": 38.29,
"eval_steps_per_second": 2.393,
"step": 120
},
{
"epoch": 2.6613333333333333,
"grad_norm": 0.05150838941335678,
"learning_rate": 8.849414768832687e-06,
"loss": 0.1082,
"mean_token_accuracy": 0.9431885123252869,
"num_tokens": 10292635.0,
"step": 125
},
{
"epoch": 2.768,
"grad_norm": 0.04310686141252518,
"learning_rate": 4.442719421385922e-06,
"loss": 0.1076,
"mean_token_accuracy": 0.9435940250754357,
"num_tokens": 10705122.0,
"step": 130
},
{
"epoch": 2.768,
"eval_loss": 0.10796218365430832,
"eval_mean_token_accuracy": 0.9437440276145935,
"eval_num_tokens": 10705122.0,
"eval_runtime": 5.7398,
"eval_samples_per_second": 41.813,
"eval_steps_per_second": 2.613,
"step": 130
},
{
"epoch": 2.8746666666666667,
"grad_norm": 0.03785852715373039,
"learning_rate": 1.5192246987791981e-06,
"loss": 0.1077,
"mean_token_accuracy": 0.9437474936246872,
"num_tokens": 11117662.0,
"step": 135
},
{
"epoch": 2.981333333333333,
"grad_norm": 0.04199996963143349,
"learning_rate": 1.2430787810776555e-07,
"loss": 0.1074,
"mean_token_accuracy": 0.9436563104391098,
"num_tokens": 11530180.0,
"step": 140
},
{
"epoch": 2.981333333333333,
"eval_loss": 0.10795663297176361,
"eval_mean_token_accuracy": 0.9438234766324362,
"eval_num_tokens": 11530180.0,
"eval_runtime": 5.9941,
"eval_samples_per_second": 40.039,
"eval_steps_per_second": 2.502,
"step": 140
},
{
"epoch": 3.0,
"mean_token_accuracy": 0.9443485651697431,
"num_tokens": 11602401.0,
"step": 141,
"total_flos": 5.846973469153034e+17,
"train_loss": 0.12444489327728325,
"train_runtime": 1956.6734,
"train_samples_per_second": 9.199,
"train_steps_per_second": 0.072
}
],
"logging_steps": 5,
"max_steps": 141,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.846973469153034e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}