{
"best_metric": 2.030155658721924,
"best_model_checkpoint": "/uoa/scratch/users/u04ob20/attrib/data/models/google-gemma-2b-it/checkpoint-114",
"epoch": 10.0,
"eval_steps": 57,
"global_step": 1130,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.5,
"grad_norm": 17.672748565673828,
"learning_rate": 4.747787610619469e-05,
"loss": 2.9683,
"step": 57
},
{
"epoch": 0.5,
"eval_loss": 2.1323869228363037,
"eval_runtime": 14.1951,
"eval_samples_per_second": 15.639,
"eval_steps_per_second": 0.986,
"step": 57
},
{
"epoch": 1.01,
"grad_norm": 10.006913185119629,
"learning_rate": 4.495575221238939e-05,
"loss": 1.9812,
"step": 114
},
{
"epoch": 1.01,
"eval_loss": 2.030155658721924,
"eval_runtime": 14.0967,
"eval_samples_per_second": 15.748,
"eval_steps_per_second": 0.993,
"step": 114
},
{
"epoch": 1.51,
"grad_norm": 7.1079607009887695,
"learning_rate": 4.243362831858407e-05,
"loss": 1.6206,
"step": 171
},
{
"epoch": 1.51,
"eval_loss": 2.0554354190826416,
"eval_runtime": 14.0805,
"eval_samples_per_second": 15.766,
"eval_steps_per_second": 0.994,
"step": 171
},
{
"epoch": 2.02,
"grad_norm": 8.306120872497559,
"learning_rate": 3.991150442477876e-05,
"loss": 1.6558,
"step": 228
},
{
"epoch": 2.02,
"eval_loss": 2.124750852584839,
"eval_runtime": 14.181,
"eval_samples_per_second": 15.655,
"eval_steps_per_second": 0.987,
"step": 228
},
{
"epoch": 2.52,
"grad_norm": 5.911388874053955,
"learning_rate": 3.7389380530973455e-05,
"loss": 1.0749,
"step": 285
},
{
"epoch": 2.52,
"eval_loss": 2.2880985736846924,
"eval_runtime": 14.046,
"eval_samples_per_second": 15.805,
"eval_steps_per_second": 0.997,
"step": 285
},
{
"epoch": 3.03,
"grad_norm": 5.9703569412231445,
"learning_rate": 3.4867256637168145e-05,
"loss": 1.1099,
"step": 342
},
{
"epoch": 3.03,
"eval_loss": 2.6607654094696045,
"eval_runtime": 14.1568,
"eval_samples_per_second": 15.681,
"eval_steps_per_second": 0.989,
"step": 342
},
{
"epoch": 3.53,
"grad_norm": 7.199086666107178,
"learning_rate": 3.2345132743362834e-05,
"loss": 0.5349,
"step": 399
},
{
"epoch": 3.53,
"eval_loss": 2.9111106395721436,
"eval_runtime": 14.1631,
"eval_samples_per_second": 15.675,
"eval_steps_per_second": 0.988,
"step": 399
},
{
"epoch": 4.04,
"grad_norm": 3.1459338665008545,
"learning_rate": 2.982300884955752e-05,
"loss": 0.5432,
"step": 456
},
{
"epoch": 4.04,
"eval_loss": 3.114436149597168,
"eval_runtime": 14.1302,
"eval_samples_per_second": 15.711,
"eval_steps_per_second": 0.991,
"step": 456
},
{
"epoch": 4.54,
"grad_norm": 3.5249204635620117,
"learning_rate": 2.7300884955752216e-05,
"loss": 0.2523,
"step": 513
},
{
"epoch": 4.54,
"eval_loss": 3.34505033493042,
"eval_runtime": 14.1883,
"eval_samples_per_second": 15.647,
"eval_steps_per_second": 0.987,
"step": 513
},
{
"epoch": 5.04,
"grad_norm": 3.153855085372925,
"learning_rate": 2.4778761061946905e-05,
"loss": 0.2561,
"step": 570
},
{
"epoch": 5.04,
"eval_loss": 3.5140204429626465,
"eval_runtime": 14.128,
"eval_samples_per_second": 15.714,
"eval_steps_per_second": 0.991,
"step": 570
},
{
"epoch": 5.55,
"grad_norm": 3.072230339050293,
"learning_rate": 2.2256637168141594e-05,
"loss": 0.1508,
"step": 627
},
{
"epoch": 5.55,
"eval_loss": 3.5723717212677,
"eval_runtime": 14.0396,
"eval_samples_per_second": 15.812,
"eval_steps_per_second": 0.997,
"step": 627
},
{
"epoch": 6.05,
"grad_norm": 1.96257746219635,
"learning_rate": 1.9734513274336283e-05,
"loss": 0.1365,
"step": 684
},
{
"epoch": 6.05,
"eval_loss": 3.7443270683288574,
"eval_runtime": 14.1133,
"eval_samples_per_second": 15.73,
"eval_steps_per_second": 0.992,
"step": 684
},
{
"epoch": 6.56,
"grad_norm": 2.537320375442505,
"learning_rate": 1.7212389380530976e-05,
"loss": 0.0878,
"step": 741
},
{
"epoch": 6.56,
"eval_loss": 3.926490545272827,
"eval_runtime": 14.1716,
"eval_samples_per_second": 15.665,
"eval_steps_per_second": 0.988,
"step": 741
},
{
"epoch": 7.06,
"grad_norm": 1.6090797185897827,
"learning_rate": 1.4690265486725665e-05,
"loss": 0.0841,
"step": 798
},
{
"epoch": 7.06,
"eval_loss": 3.97700572013855,
"eval_runtime": 14.1937,
"eval_samples_per_second": 15.641,
"eval_steps_per_second": 0.986,
"step": 798
},
{
"epoch": 7.57,
"grad_norm": 1.7380380630493164,
"learning_rate": 1.2168141592920354e-05,
"loss": 0.0587,
"step": 855
},
{
"epoch": 7.57,
"eval_loss": 4.071342468261719,
"eval_runtime": 14.1098,
"eval_samples_per_second": 15.734,
"eval_steps_per_second": 0.992,
"step": 855
},
{
"epoch": 8.07,
"grad_norm": 0.9195989370346069,
"learning_rate": 9.646017699115045e-06,
"loss": 0.0539,
"step": 912
},
{
"epoch": 8.07,
"eval_loss": 4.22251558303833,
"eval_runtime": 14.0932,
"eval_samples_per_second": 15.752,
"eval_steps_per_second": 0.993,
"step": 912
},
{
"epoch": 8.58,
"grad_norm": 1.6740847826004028,
"learning_rate": 7.123893805309735e-06,
"loss": 0.0404,
"step": 969
},
{
"epoch": 8.58,
"eval_loss": 4.388303279876709,
"eval_runtime": 14.0715,
"eval_samples_per_second": 15.777,
"eval_steps_per_second": 0.995,
"step": 969
},
{
"epoch": 9.08,
"grad_norm": 0.768718421459198,
"learning_rate": 4.601769911504425e-06,
"loss": 0.0383,
"step": 1026
},
{
"epoch": 9.08,
"eval_loss": 4.46160364151001,
"eval_runtime": 14.1199,
"eval_samples_per_second": 15.722,
"eval_steps_per_second": 0.992,
"step": 1026
},
{
"epoch": 9.58,
"grad_norm": 0.8811420202255249,
"learning_rate": 2.079646017699115e-06,
"loss": 0.0292,
"step": 1083
},
{
"epoch": 9.58,
"eval_loss": 4.561453819274902,
"eval_runtime": 14.1759,
"eval_samples_per_second": 15.66,
"eval_steps_per_second": 0.988,
"step": 1083
},
{
"epoch": 10.0,
"step": 1130,
"total_flos": 2.40489782575104e+16,
"train_loss": 0.6407461469152332,
"train_runtime": 3370.5966,
"train_samples_per_second": 4.688,
"train_steps_per_second": 0.335
}
],
"logging_steps": 57,
"max_steps": 1130,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 57,
"total_flos": 2.40489782575104e+16,
"train_batch_size": 14,
"trial_name": null,
"trial_params": null
}