{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 24591,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0609979260705136,
"grad_norm": 2.6735310554504395,
"learning_rate": 4.8985401163027127e-05,
"loss": 2.9975,
"step": 500
},
{
"epoch": 0.1219958521410272,
"grad_norm": 2.0044796466827393,
"learning_rate": 4.79687690618519e-05,
"loss": 2.801,
"step": 1000
},
{
"epoch": 0.1829937782115408,
"grad_norm": 2.462843179702759,
"learning_rate": 4.6952136960676673e-05,
"loss": 2.6891,
"step": 1500
},
{
"epoch": 0.2439917042820544,
"grad_norm": 2.019771099090576,
"learning_rate": 4.593550485950145e-05,
"loss": 2.6299,
"step": 2000
},
{
"epoch": 0.304989630352568,
"grad_norm": 2.4739561080932617,
"learning_rate": 4.491887275832622e-05,
"loss": 2.5757,
"step": 2500
},
{
"epoch": 0.3659875564230816,
"grad_norm": 1.741564393043518,
"learning_rate": 4.3902240657150994e-05,
"loss": 2.5364,
"step": 3000
},
{
"epoch": 0.4269854824935952,
"grad_norm": 2.228411912918091,
"learning_rate": 4.288560855597577e-05,
"loss": 2.4804,
"step": 3500
},
{
"epoch": 0.4879834085641088,
"grad_norm": 2.0240838527679443,
"learning_rate": 4.186897645480054e-05,
"loss": 2.453,
"step": 4000
},
{
"epoch": 0.5489813346346224,
"grad_norm": 2.8336830139160156,
"learning_rate": 4.085234435362531e-05,
"loss": 2.4085,
"step": 4500
},
{
"epoch": 0.609979260705136,
"grad_norm": 1.7519323825836182,
"learning_rate": 3.983571225245009e-05,
"loss": 2.3795,
"step": 5000
},
{
"epoch": 0.6709771867756497,
"grad_norm": 2.075009822845459,
"learning_rate": 3.8819080151274854e-05,
"loss": 2.3558,
"step": 5500
},
{
"epoch": 0.7319751128461632,
"grad_norm": 2.2394227981567383,
"learning_rate": 3.7802448050099634e-05,
"loss": 2.3254,
"step": 6000
},
{
"epoch": 0.7929730389166768,
"grad_norm": 2.2436089515686035,
"learning_rate": 3.678581594892441e-05,
"loss": 2.3182,
"step": 6500
},
{
"epoch": 0.8539709649871904,
"grad_norm": 1.8976473808288574,
"learning_rate": 3.5769183847749174e-05,
"loss": 2.2847,
"step": 7000
},
{
"epoch": 0.9149688910577041,
"grad_norm": 2.127561330795288,
"learning_rate": 3.4752551746573955e-05,
"loss": 2.2741,
"step": 7500
},
{
"epoch": 0.9759668171282176,
"grad_norm": 2.080770254135132,
"learning_rate": 3.373591964539872e-05,
"loss": 2.2525,
"step": 8000
},
{
"epoch": 1.0369647431987312,
"grad_norm": 2.3001296520233154,
"learning_rate": 3.2719287544223495e-05,
"loss": 2.1812,
"step": 8500
},
{
"epoch": 1.0979626692692448,
"grad_norm": 2.002516746520996,
"learning_rate": 3.1702655443048275e-05,
"loss": 2.1904,
"step": 9000
},
{
"epoch": 1.1589605953397584,
"grad_norm": 1.7760947942733765,
"learning_rate": 3.068602334187304e-05,
"loss": 2.1648,
"step": 9500
},
{
"epoch": 1.219958521410272,
"grad_norm": 2.029125452041626,
"learning_rate": 2.966939124069782e-05,
"loss": 2.1476,
"step": 10000
},
{
"epoch": 1.2809564474807855,
"grad_norm": 2.6139211654663086,
"learning_rate": 2.865275913952259e-05,
"loss": 2.15,
"step": 10500
},
{
"epoch": 1.3419543735512993,
"grad_norm": 1.8446179628372192,
"learning_rate": 2.7636127038347365e-05,
"loss": 2.1283,
"step": 11000
},
{
"epoch": 1.402952299621813,
"grad_norm": 1.97454035282135,
"learning_rate": 2.6619494937172135e-05,
"loss": 2.1233,
"step": 11500
},
{
"epoch": 1.4639502256923265,
"grad_norm": 2.268068313598633,
"learning_rate": 2.560286283599691e-05,
"loss": 2.1084,
"step": 12000
},
{
"epoch": 1.52494815176284,
"grad_norm": 1.7861677408218384,
"learning_rate": 2.4586230734821682e-05,
"loss": 2.0869,
"step": 12500
},
{
"epoch": 1.5859460778333536,
"grad_norm": 2.0749287605285645,
"learning_rate": 2.3569598633646456e-05,
"loss": 2.0835,
"step": 13000
},
{
"epoch": 1.6469440039038674,
"grad_norm": 2.6761562824249268,
"learning_rate": 2.2552966532471232e-05,
"loss": 2.1048,
"step": 13500
},
{
"epoch": 1.707941929974381,
"grad_norm": 2.0634262561798096,
"learning_rate": 2.1536334431296002e-05,
"loss": 2.0877,
"step": 14000
},
{
"epoch": 1.7689398560448946,
"grad_norm": 2.3483529090881348,
"learning_rate": 2.0519702330120776e-05,
"loss": 2.0864,
"step": 14500
},
{
"epoch": 1.8299377821154081,
"grad_norm": 1.6350581645965576,
"learning_rate": 1.950307022894555e-05,
"loss": 2.0859,
"step": 15000
},
{
"epoch": 1.8909357081859217,
"grad_norm": 2.286836862564087,
"learning_rate": 1.8486438127770323e-05,
"loss": 2.063,
"step": 15500
},
{
"epoch": 1.9519336342564353,
"grad_norm": 4.733737468719482,
"learning_rate": 1.7469806026595096e-05,
"loss": 2.0553,
"step": 16000
},
{
"epoch": 2.012931560326949,
"grad_norm": 2.2189626693725586,
"learning_rate": 1.645317392541987e-05,
"loss": 2.063,
"step": 16500
},
{
"epoch": 2.0739294863974624,
"grad_norm": 2.3642847537994385,
"learning_rate": 1.5436541824244643e-05,
"loss": 2.0198,
"step": 17000
},
{
"epoch": 2.134927412467976,
"grad_norm": 2.2193996906280518,
"learning_rate": 1.4419909723069416e-05,
"loss": 2.0039,
"step": 17500
},
{
"epoch": 2.1959253385384896,
"grad_norm": 2.039994239807129,
"learning_rate": 1.340327762189419e-05,
"loss": 2.0012,
"step": 18000
},
{
"epoch": 2.256923264609003,
"grad_norm": 3.105970859527588,
"learning_rate": 1.2386645520718963e-05,
"loss": 2.0174,
"step": 18500
},
{
"epoch": 2.3179211906795167,
"grad_norm": 2.1234045028686523,
"learning_rate": 1.1370013419543737e-05,
"loss": 2.0009,
"step": 19000
},
{
"epoch": 2.3789191167500303,
"grad_norm": 1.8237278461456299,
"learning_rate": 1.0353381318368509e-05,
"loss": 2.0125,
"step": 19500
},
{
"epoch": 2.439917042820544,
"grad_norm": 2.1688883304595947,
"learning_rate": 9.336749217193284e-06,
"loss": 2.0001,
"step": 20000
},
{
"epoch": 2.500914968891058,
"grad_norm": 2.9593875408172607,
"learning_rate": 8.320117116018055e-06,
"loss": 1.9857,
"step": 20500
},
{
"epoch": 2.561912894961571,
"grad_norm": 1.8724238872528076,
"learning_rate": 7.303485014842829e-06,
"loss": 1.9884,
"step": 21000
},
{
"epoch": 2.622910821032085,
"grad_norm": 1.7542228698730469,
"learning_rate": 6.286852913667603e-06,
"loss": 1.9813,
"step": 21500
},
{
"epoch": 2.6839087471025986,
"grad_norm": 2.1812169551849365,
"learning_rate": 5.270220812492376e-06,
"loss": 1.9735,
"step": 22000
},
{
"epoch": 2.744906673173112,
"grad_norm": 1.6151964664459229,
"learning_rate": 4.253588711317149e-06,
"loss": 1.9882,
"step": 22500
},
{
"epoch": 2.805904599243626,
"grad_norm": 2.106318950653076,
"learning_rate": 3.2369566101419217e-06,
"loss": 1.9771,
"step": 23000
},
{
"epoch": 2.8669025253141394,
"grad_norm": 1.8604987859725952,
"learning_rate": 2.220324508966695e-06,
"loss": 1.9741,
"step": 23500
},
{
"epoch": 2.927900451384653,
"grad_norm": 2.1725010871887207,
"learning_rate": 1.2036924077914685e-06,
"loss": 1.9899,
"step": 24000
},
{
"epoch": 2.9888983774551665,
"grad_norm": 2.769097089767456,
"learning_rate": 1.8706030661624173e-07,
"loss": 1.9704,
"step": 24500
},
{
"epoch": 3.0,
"step": 24591,
"total_flos": 5.415436121066701e+16,
"train_loss": 2.194293288467889,
"train_runtime": 7212.7178,
"train_samples_per_second": 27.274,
"train_steps_per_second": 3.409
}
],
"logging_steps": 500,
"max_steps": 24591,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.415436121066701e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}