GPT2_AR_200 / trainer_state.json
xiulinyang's picture
Add checkpoint
183e8c8
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 28450,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.351493848857645,
"grad_norm": 12501.5126953125,
"learning_rate": 0.0005993999999999999,
"loss": 0.6616,
"step": 1000
},
{
"epoch": 0.70298769771529,
"grad_norm": 11799.552734375,
"learning_rate": 0.0005781639344262295,
"loss": 0.5298,
"step": 2000
},
{
"epoch": 1.0,
"eval_accuracy": 0.39387861735030005,
"eval_loss": 1.9969476461410522,
"eval_runtime": 10.2477,
"eval_samples_per_second": 58.843,
"eval_steps_per_second": 1.854,
"step": 2845
},
{
"epoch": 1.054481546572935,
"grad_norm": 10105.4248046875,
"learning_rate": 0.0005563060109289616,
"loss": 0.4896,
"step": 3000
},
{
"epoch": 1.40597539543058,
"grad_norm": 9273.8095703125,
"learning_rate": 0.0005344480874316939,
"loss": 0.4687,
"step": 4000
},
{
"epoch": 1.757469244288225,
"grad_norm": 8524.5703125,
"learning_rate": 0.0005125901639344262,
"loss": 0.4512,
"step": 5000
},
{
"epoch": 2.0,
"eval_accuracy": 0.43839510860569947,
"eval_loss": 1.821701169013977,
"eval_runtime": 9.7306,
"eval_samples_per_second": 61.97,
"eval_steps_per_second": 1.953,
"step": 5690
},
{
"epoch": 2.10896309314587,
"grad_norm": 8911.6494140625,
"learning_rate": 0.0004907322404371584,
"loss": 0.437,
"step": 6000
},
{
"epoch": 2.460456942003515,
"grad_norm": 6983.92236328125,
"learning_rate": 0.00046887431693989066,
"loss": 0.4253,
"step": 7000
},
{
"epoch": 2.81195079086116,
"grad_norm": 7203.26025390625,
"learning_rate": 0.00044701639344262294,
"loss": 0.4173,
"step": 8000
},
{
"epoch": 3.0,
"eval_accuracy": 0.4640041800131761,
"eval_loss": 1.729548692703247,
"eval_runtime": 9.8999,
"eval_samples_per_second": 60.909,
"eval_steps_per_second": 1.919,
"step": 8535
},
{
"epoch": 3.1634446397188047,
"grad_norm": 7449.33203125,
"learning_rate": 0.00042515846994535517,
"loss": 0.4092,
"step": 9000
},
{
"epoch": 3.51493848857645,
"grad_norm": 7554.9697265625,
"learning_rate": 0.0004033005464480874,
"loss": 0.4028,
"step": 10000
},
{
"epoch": 3.866432337434095,
"grad_norm": 7199.42626953125,
"learning_rate": 0.0003814426229508197,
"loss": 0.3977,
"step": 11000
},
{
"epoch": 4.0,
"eval_accuracy": 0.4796013409793823,
"eval_loss": 1.6785619258880615,
"eval_runtime": 10.0375,
"eval_samples_per_second": 60.075,
"eval_steps_per_second": 1.893,
"step": 11380
},
{
"epoch": 4.21792618629174,
"grad_norm": 7292.9921875,
"learning_rate": 0.0003595846994535519,
"loss": 0.392,
"step": 12000
},
{
"epoch": 4.569420035149385,
"grad_norm": 6805.3798828125,
"learning_rate": 0.00033772677595628414,
"loss": 0.3879,
"step": 13000
},
{
"epoch": 4.92091388400703,
"grad_norm": 7368.5224609375,
"learning_rate": 0.00031586885245901637,
"loss": 0.3841,
"step": 14000
},
{
"epoch": 5.0,
"eval_accuracy": 0.49063553725177117,
"eval_loss": 1.6362360715866089,
"eval_runtime": 9.729,
"eval_samples_per_second": 61.98,
"eval_steps_per_second": 1.953,
"step": 14225
},
{
"epoch": 5.272407732864675,
"grad_norm": 7300.4189453125,
"learning_rate": 0.0002940109289617486,
"loss": 0.3799,
"step": 15000
},
{
"epoch": 5.62390158172232,
"grad_norm": 6178.0908203125,
"learning_rate": 0.0002721530054644809,
"loss": 0.3764,
"step": 16000
},
{
"epoch": 5.975395430579965,
"grad_norm": 6489.77490234375,
"learning_rate": 0.0002502950819672131,
"loss": 0.3747,
"step": 17000
},
{
"epoch": 6.0,
"eval_accuracy": 0.4991123962704417,
"eval_loss": 1.6064575910568237,
"eval_runtime": 9.7095,
"eval_samples_per_second": 62.104,
"eval_steps_per_second": 1.957,
"step": 17070
},
{
"epoch": 6.3268892794376095,
"grad_norm": 7217.43701171875,
"learning_rate": 0.00022843715846994535,
"loss": 0.3693,
"step": 18000
},
{
"epoch": 6.678383128295255,
"grad_norm": 6669.8017578125,
"learning_rate": 0.00020657923497267757,
"loss": 0.3677,
"step": 19000
},
{
"epoch": 7.0,
"eval_accuracy": 0.5070570175865616,
"eval_loss": 1.5771993398666382,
"eval_runtime": 9.7124,
"eval_samples_per_second": 62.086,
"eval_steps_per_second": 1.956,
"step": 19915
},
{
"epoch": 7.0298769771529,
"grad_norm": 7067.46533203125,
"learning_rate": 0.00018472131147540983,
"loss": 0.3655,
"step": 20000
},
{
"epoch": 7.381370826010545,
"grad_norm": 6705.837890625,
"learning_rate": 0.00016286338797814206,
"loss": 0.3605,
"step": 21000
},
{
"epoch": 7.73286467486819,
"grad_norm": 6395.66162109375,
"learning_rate": 0.00014100546448087432,
"loss": 0.3586,
"step": 22000
},
{
"epoch": 8.0,
"eval_accuracy": 0.5143298510708039,
"eval_loss": 1.5533959865570068,
"eval_runtime": 9.7631,
"eval_samples_per_second": 61.763,
"eval_steps_per_second": 1.946,
"step": 22760
},
{
"epoch": 8.084358523725834,
"grad_norm": 7270.5849609375,
"learning_rate": 0.00011914754098360655,
"loss": 0.3549,
"step": 23000
},
{
"epoch": 8.43585237258348,
"grad_norm": 7109.36328125,
"learning_rate": 9.728961748633879e-05,
"loss": 0.3524,
"step": 24000
},
{
"epoch": 8.787346221441124,
"grad_norm": 6891.5283203125,
"learning_rate": 7.543169398907103e-05,
"loss": 0.3504,
"step": 25000
},
{
"epoch": 9.0,
"eval_accuracy": 0.5215475135736841,
"eval_loss": 1.5324053764343262,
"eval_runtime": 9.718,
"eval_samples_per_second": 62.05,
"eval_steps_per_second": 1.955,
"step": 25605
},
{
"epoch": 9.13884007029877,
"grad_norm": 7154.31298828125,
"learning_rate": 5.357377049180328e-05,
"loss": 0.3472,
"step": 26000
},
{
"epoch": 9.490333919156415,
"grad_norm": 7199.95068359375,
"learning_rate": 3.171584699453552e-05,
"loss": 0.3449,
"step": 27000
},
{
"epoch": 9.84182776801406,
"grad_norm": 6995.279296875,
"learning_rate": 9.857923497267758e-06,
"loss": 0.3437,
"step": 28000
},
{
"epoch": 10.0,
"eval_accuracy": 0.5249648690662798,
"eval_loss": 1.523409128189087,
"eval_runtime": 9.7518,
"eval_samples_per_second": 61.835,
"eval_steps_per_second": 1.948,
"step": 28450
},
{
"epoch": 10.0,
"step": 28450,
"total_flos": 2.3783323336704e+17,
"train_loss": 0.40263440499196784,
"train_runtime": 24635.8124,
"train_samples_per_second": 36.947,
"train_steps_per_second": 1.155
}
],
"logging_steps": 1000,
"max_steps": 28450,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.3783323336704e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}