GPT2_BABYLM_20000 / trainer_state.json
xiulinyang's picture
Add checkpoint
0490ec1
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 10350,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.966183574879227,
"grad_norm": 18068.357421875,
"learning_rate": 0.0005993999999999999,
"loss": 1.1382,
"step": 1000
},
{
"epoch": 1.0,
"eval_accuracy": 0.31820882561564307,
"eval_loss": 4.372585296630859,
"eval_runtime": 671.6885,
"eval_samples_per_second": 50.702,
"eval_steps_per_second": 1.586,
"step": 1035
},
{
"epoch": 1.9323671497584543,
"grad_norm": 48934.1015625,
"learning_rate": 0.0005358930481283422,
"loss": 1.0291,
"step": 2000
},
{
"epoch": 2.0,
"eval_accuracy": 0.31461109065441656,
"eval_loss": 4.359684944152832,
"eval_runtime": 666.115,
"eval_samples_per_second": 51.126,
"eval_steps_per_second": 1.599,
"step": 2070
},
{
"epoch": 2.898550724637681,
"grad_norm": 52002.11328125,
"learning_rate": 0.00047172192513368985,
"loss": 1.0267,
"step": 3000
},
{
"epoch": 3.0,
"eval_accuracy": 0.31789266625201634,
"eval_loss": 4.333441257476807,
"eval_runtime": 665.863,
"eval_samples_per_second": 51.146,
"eval_steps_per_second": 1.599,
"step": 3105
},
{
"epoch": 3.864734299516908,
"grad_norm": 55701.2734375,
"learning_rate": 0.0004075508021390374,
"loss": 1.0192,
"step": 4000
},
{
"epoch": 4.0,
"eval_accuracy": 0.3215952705041587,
"eval_loss": 4.2829365730285645,
"eval_runtime": 664.6956,
"eval_samples_per_second": 51.235,
"eval_steps_per_second": 1.602,
"step": 4140
},
{
"epoch": 4.830917874396135,
"grad_norm": 83004.7734375,
"learning_rate": 0.000343379679144385,
"loss": 1.0042,
"step": 5000
},
{
"epoch": 5.0,
"eval_accuracy": 0.32670145683844315,
"eval_loss": 4.2311296463012695,
"eval_runtime": 664.7858,
"eval_samples_per_second": 51.229,
"eval_steps_per_second": 1.602,
"step": 5175
},
{
"epoch": 5.797101449275362,
"grad_norm": 21215.708984375,
"learning_rate": 0.0002792085561497326,
"loss": 0.9873,
"step": 6000
},
{
"epoch": 6.0,
"eval_accuracy": 0.33200261385989327,
"eval_loss": 4.1559600830078125,
"eval_runtime": 666.0654,
"eval_samples_per_second": 51.13,
"eval_steps_per_second": 1.599,
"step": 6210
},
{
"epoch": 6.763285024154589,
"grad_norm": 18555.8984375,
"learning_rate": 0.0002150374331550802,
"loss": 0.9662,
"step": 7000
},
{
"epoch": 7.0,
"eval_accuracy": 0.34101654601813886,
"eval_loss": 4.062748908996582,
"eval_runtime": 665.7453,
"eval_samples_per_second": 51.155,
"eval_steps_per_second": 1.6,
"step": 7245
},
{
"epoch": 7.729468599033816,
"grad_norm": 10123.9931640625,
"learning_rate": 0.0001508663101604278,
"loss": 0.9422,
"step": 8000
},
{
"epoch": 8.0,
"eval_accuracy": 0.34926088123762544,
"eval_loss": 3.976839542388916,
"eval_runtime": 665.8149,
"eval_samples_per_second": 51.149,
"eval_steps_per_second": 1.6,
"step": 8280
},
{
"epoch": 8.695652173913043,
"grad_norm": 8124.27392578125,
"learning_rate": 8.66951871657754e-05,
"loss": 0.9207,
"step": 9000
},
{
"epoch": 9.0,
"eval_accuracy": 0.3541727864362461,
"eval_loss": 3.9173777103424072,
"eval_runtime": 665.3441,
"eval_samples_per_second": 51.186,
"eval_steps_per_second": 1.601,
"step": 9315
},
{
"epoch": 9.66183574879227,
"grad_norm": 7507.09521484375,
"learning_rate": 2.252406417112299e-05,
"loss": 0.9058,
"step": 10000
},
{
"epoch": 10.0,
"eval_accuracy": 0.3569124320159682,
"eval_loss": 3.889129400253296,
"eval_runtime": 665.1197,
"eval_samples_per_second": 51.203,
"eval_steps_per_second": 1.601,
"step": 10350
},
{
"epoch": 10.0,
"step": 10350,
"total_flos": 8.6487662592e+16,
"train_loss": 0.990625182810613,
"train_runtime": 16693.5239,
"train_samples_per_second": 19.828,
"train_steps_per_second": 0.62
}
],
"logging_steps": 1000,
"max_steps": 10350,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.6487662592e+16,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}