t5_236k / trainer_state.json
jurgiraud's picture
Upload folder using huggingface_hub
162ee30 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 8.0,
"eval_steps": 5000,
"global_step": 157336,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.2542329791020491,
"grad_norm": 0.8224548697471619,
"learning_rate": 1.937686026100893e-05,
"loss": 0.4274,
"step": 5000
},
{
"epoch": 0.2542329791020491,
"eval_bleu": 44.9035,
"eval_gen_len": 55.7883,
"eval_loss": 0.8018454313278198,
"eval_runtime": 349.8887,
"eval_samples_per_second": 4.144,
"eval_steps_per_second": 0.346,
"step": 5000
},
{
"epoch": 0.5084659582040982,
"grad_norm": 0.7412045001983643,
"learning_rate": 1.8741127985957416e-05,
"loss": 0.3884,
"step": 10000
},
{
"epoch": 0.5084659582040982,
"eval_bleu": 45.2602,
"eval_gen_len": 55.9462,
"eval_loss": 0.7863603234291077,
"eval_runtime": 356.7073,
"eval_samples_per_second": 4.065,
"eval_steps_per_second": 0.339,
"step": 10000
},
{
"epoch": 0.7626989373061474,
"grad_norm": 0.9902288317680359,
"learning_rate": 1.8105522908239844e-05,
"loss": 0.3703,
"step": 15000
},
{
"epoch": 0.7626989373061474,
"eval_bleu": 45.5987,
"eval_gen_len": 55.831,
"eval_loss": 0.7793292999267578,
"eval_runtime": 352.9131,
"eval_samples_per_second": 4.109,
"eval_steps_per_second": 0.343,
"step": 15000
},
{
"epoch": 1.0169319164081965,
"grad_norm": 0.823268473148346,
"learning_rate": 1.7469790633188332e-05,
"loss": 0.3626,
"step": 20000
},
{
"epoch": 1.0169319164081965,
"eval_bleu": 45.7527,
"eval_gen_len": 55.9131,
"eval_loss": 0.7690481543540955,
"eval_runtime": 354.1353,
"eval_samples_per_second": 4.094,
"eval_steps_per_second": 0.342,
"step": 20000
},
{
"epoch": 1.2711648955102457,
"grad_norm": 0.8465375900268555,
"learning_rate": 1.683418555547076e-05,
"loss": 0.3434,
"step": 25000
},
{
"epoch": 1.2711648955102457,
"eval_bleu": 45.9693,
"eval_gen_len": 55.9407,
"eval_loss": 0.7745993733406067,
"eval_runtime": 354.895,
"eval_samples_per_second": 4.086,
"eval_steps_per_second": 0.341,
"step": 25000
},
{
"epoch": 1.5253978746122947,
"grad_norm": 0.7541704177856445,
"learning_rate": 1.6198453280419245e-05,
"loss": 0.3389,
"step": 30000
},
{
"epoch": 1.5253978746122947,
"eval_bleu": 46.1257,
"eval_gen_len": 55.9559,
"eval_loss": 0.7699302434921265,
"eval_runtime": 354.6656,
"eval_samples_per_second": 4.088,
"eval_steps_per_second": 0.341,
"step": 30000
},
{
"epoch": 1.7796308537143437,
"grad_norm": 0.7378148436546326,
"learning_rate": 1.556272100536773e-05,
"loss": 0.3375,
"step": 35000
},
{
"epoch": 1.7796308537143437,
"eval_bleu": 46.1114,
"eval_gen_len": 55.8379,
"eval_loss": 0.7595505714416504,
"eval_runtime": 353.6224,
"eval_samples_per_second": 4.1,
"eval_steps_per_second": 0.342,
"step": 35000
},
{
"epoch": 2.033863832816393,
"grad_norm": 0.9236812591552734,
"learning_rate": 1.4927115927650158e-05,
"loss": 0.3306,
"step": 40000
},
{
"epoch": 2.033863832816393,
"eval_bleu": 46.1398,
"eval_gen_len": 55.8455,
"eval_loss": 0.7678882479667664,
"eval_runtime": 354.3507,
"eval_samples_per_second": 4.092,
"eval_steps_per_second": 0.341,
"step": 40000
},
{
"epoch": 2.288096811918442,
"grad_norm": 1.1531308889389038,
"learning_rate": 1.4291383652598642e-05,
"loss": 0.3187,
"step": 45000
},
{
"epoch": 2.288096811918442,
"eval_bleu": 46.1836,
"eval_gen_len": 55.8855,
"eval_loss": 0.7600361704826355,
"eval_runtime": 354.4781,
"eval_samples_per_second": 4.091,
"eval_steps_per_second": 0.341,
"step": 45000
},
{
"epoch": 2.5423297910204914,
"grad_norm": 0.6744846105575562,
"learning_rate": 1.3655651377547128e-05,
"loss": 0.3169,
"step": 50000
},
{
"epoch": 2.5423297910204914,
"eval_bleu": 46.554,
"eval_gen_len": 55.7834,
"eval_loss": 0.7513669729232788,
"eval_runtime": 345.0185,
"eval_samples_per_second": 4.203,
"eval_steps_per_second": 0.351,
"step": 50000
},
{
"epoch": 2.79656277012254,
"grad_norm": 0.7795997858047485,
"learning_rate": 1.3020046299829556e-05,
"loss": 0.3166,
"step": 55000
},
{
"epoch": 2.79656277012254,
"eval_bleu": 46.3029,
"eval_gen_len": 55.7586,
"eval_loss": 0.75509113073349,
"eval_runtime": 342.8695,
"eval_samples_per_second": 4.229,
"eval_steps_per_second": 0.353,
"step": 55000
},
{
"epoch": 3.0507957492245894,
"grad_norm": 0.8248696327209473,
"learning_rate": 1.238431402477804e-05,
"loss": 0.312,
"step": 60000
},
{
"epoch": 3.0507957492245894,
"eval_bleu": 46.2729,
"eval_gen_len": 55.8938,
"eval_loss": 0.753489077091217,
"eval_runtime": 344.7281,
"eval_samples_per_second": 4.206,
"eval_steps_per_second": 0.351,
"step": 60000
},
{
"epoch": 3.3050287283266386,
"grad_norm": 0.6925950646400452,
"learning_rate": 1.1748708947060469e-05,
"loss": 0.3043,
"step": 65000
},
{
"epoch": 3.3050287283266386,
"eval_bleu": 46.5336,
"eval_gen_len": 55.809,
"eval_loss": 0.7513247132301331,
"eval_runtime": 342.2971,
"eval_samples_per_second": 4.236,
"eval_steps_per_second": 0.353,
"step": 65000
},
{
"epoch": 3.5592617074286874,
"grad_norm": 0.7592390179634094,
"learning_rate": 1.1112976672008955e-05,
"loss": 0.3034,
"step": 70000
},
{
"epoch": 3.5592617074286874,
"eval_bleu": 46.3724,
"eval_gen_len": 55.8724,
"eval_loss": 0.7483436465263367,
"eval_runtime": 345.9725,
"eval_samples_per_second": 4.191,
"eval_steps_per_second": 0.35,
"step": 70000
},
{
"epoch": 3.8134946865307366,
"grad_norm": 0.6906684637069702,
"learning_rate": 1.047724439695744e-05,
"loss": 0.3022,
"step": 75000
},
{
"epoch": 3.8134946865307366,
"eval_bleu": 46.3098,
"eval_gen_len": 55.7759,
"eval_loss": 0.7495469450950623,
"eval_runtime": 345.9396,
"eval_samples_per_second": 4.191,
"eval_steps_per_second": 0.35,
"step": 75000
},
{
"epoch": 4.067727665632786,
"grad_norm": 0.8161213397979736,
"learning_rate": 9.841512121905925e-06,
"loss": 0.3008,
"step": 80000
},
{
"epoch": 4.067727665632786,
"eval_bleu": 46.3194,
"eval_gen_len": 55.829,
"eval_loss": 0.7491657137870789,
"eval_runtime": 345.4241,
"eval_samples_per_second": 4.198,
"eval_steps_per_second": 0.35,
"step": 80000
},
{
"epoch": 4.321960644734835,
"grad_norm": 0.641299307346344,
"learning_rate": 9.205907044188356e-06,
"loss": 0.2931,
"step": 85000
},
{
"epoch": 4.321960644734835,
"eval_bleu": 46.4319,
"eval_gen_len": 55.9069,
"eval_loss": 0.7467553615570068,
"eval_runtime": 349.087,
"eval_samples_per_second": 4.154,
"eval_steps_per_second": 0.347,
"step": 85000
},
{
"epoch": 4.576193623836884,
"grad_norm": 0.8405129909515381,
"learning_rate": 8.57017476913684e-06,
"loss": 0.2944,
"step": 90000
},
{
"epoch": 4.576193623836884,
"eval_bleu": 46.456,
"eval_gen_len": 55.9034,
"eval_loss": 0.7442417740821838,
"eval_runtime": 347.6322,
"eval_samples_per_second": 4.171,
"eval_steps_per_second": 0.348,
"step": 90000
},
{
"epoch": 4.8304266029389336,
"grad_norm": 1.2027479410171509,
"learning_rate": 7.934442494085324e-06,
"loss": 0.2928,
"step": 95000
},
{
"epoch": 4.8304266029389336,
"eval_bleu": 46.4492,
"eval_gen_len": 55.7993,
"eval_loss": 0.7427089214324951,
"eval_runtime": 345.6033,
"eval_samples_per_second": 4.196,
"eval_steps_per_second": 0.35,
"step": 95000
},
{
"epoch": 5.084659582040983,
"grad_norm": 0.668217658996582,
"learning_rate": 7.29871021903381e-06,
"loss": 0.2898,
"step": 100000
},
{
"epoch": 5.084659582040983,
"eval_bleu": 46.5755,
"eval_gen_len": 55.7524,
"eval_loss": 0.7419084906578064,
"eval_runtime": 342.1745,
"eval_samples_per_second": 4.238,
"eval_steps_per_second": 0.354,
"step": 100000
},
{
"epoch": 5.338892561143031,
"grad_norm": 1.0696251392364502,
"learning_rate": 6.663105141316239e-06,
"loss": 0.2851,
"step": 105000
},
{
"epoch": 5.338892561143031,
"eval_bleu": 46.6577,
"eval_gen_len": 55.9538,
"eval_loss": 0.7440945506095886,
"eval_runtime": 348.3083,
"eval_samples_per_second": 4.163,
"eval_steps_per_second": 0.347,
"step": 105000
},
{
"epoch": 5.59312554024508,
"grad_norm": 0.6697210073471069,
"learning_rate": 6.027372866264724e-06,
"loss": 0.286,
"step": 110000
},
{
"epoch": 5.59312554024508,
"eval_bleu": 46.7734,
"eval_gen_len": 55.7655,
"eval_loss": 0.7419018149375916,
"eval_runtime": 344.8268,
"eval_samples_per_second": 4.205,
"eval_steps_per_second": 0.351,
"step": 110000
},
{
"epoch": 5.84735851934713,
"grad_norm": 1.0238195657730103,
"learning_rate": 5.391767788547152e-06,
"loss": 0.2862,
"step": 115000
},
{
"epoch": 5.84735851934713,
"eval_bleu": 46.5343,
"eval_gen_len": 55.7931,
"eval_loss": 0.738459587097168,
"eval_runtime": 344.0608,
"eval_samples_per_second": 4.214,
"eval_steps_per_second": 0.352,
"step": 115000
},
{
"epoch": 6.101591498449179,
"grad_norm": 0.6626740097999573,
"learning_rate": 4.756035513495637e-06,
"loss": 0.2851,
"step": 120000
},
{
"epoch": 6.101591498449179,
"eval_bleu": 46.5618,
"eval_gen_len": 55.7979,
"eval_loss": 0.7425189018249512,
"eval_runtime": 344.7989,
"eval_samples_per_second": 4.205,
"eval_steps_per_second": 0.351,
"step": 120000
},
{
"epoch": 6.355824477551228,
"grad_norm": 0.9636191129684448,
"learning_rate": 4.1203032384441225e-06,
"loss": 0.283,
"step": 125000
},
{
"epoch": 6.355824477551228,
"eval_bleu": 46.6226,
"eval_gen_len": 55.7945,
"eval_loss": 0.743998646736145,
"eval_runtime": 345.3796,
"eval_samples_per_second": 4.198,
"eval_steps_per_second": 0.35,
"step": 125000
},
{
"epoch": 6.610057456653277,
"grad_norm": 0.9589861631393433,
"learning_rate": 3.4845709633926077e-06,
"loss": 0.2795,
"step": 130000
},
{
"epoch": 6.610057456653277,
"eval_bleu": 46.6222,
"eval_gen_len": 55.7572,
"eval_loss": 0.7409077882766724,
"eval_runtime": 345.3114,
"eval_samples_per_second": 4.199,
"eval_steps_per_second": 0.35,
"step": 130000
},
{
"epoch": 6.8642904357553265,
"grad_norm": 1.038360834121704,
"learning_rate": 2.848965885675036e-06,
"loss": 0.2814,
"step": 135000
},
{
"epoch": 6.8642904357553265,
"eval_bleu": 46.6826,
"eval_gen_len": 55.7393,
"eval_loss": 0.7406843304634094,
"eval_runtime": 344.9932,
"eval_samples_per_second": 4.203,
"eval_steps_per_second": 0.351,
"step": 135000
},
{
"epoch": 7.118523414857376,
"grad_norm": 0.8131181001663208,
"learning_rate": 2.2131064132895776e-06,
"loss": 0.2802,
"step": 140000
},
{
"epoch": 7.118523414857376,
"eval_bleu": 46.6691,
"eval_gen_len": 55.7793,
"eval_loss": 0.7410460114479065,
"eval_runtime": 344.2601,
"eval_samples_per_second": 4.212,
"eval_steps_per_second": 0.351,
"step": 140000
},
{
"epoch": 7.372756393959424,
"grad_norm": 0.8258042335510254,
"learning_rate": 1.5775013355720064e-06,
"loss": 0.2781,
"step": 145000
},
{
"epoch": 7.372756393959424,
"eval_bleu": 46.6998,
"eval_gen_len": 55.7331,
"eval_loss": 0.7418521642684937,
"eval_runtime": 344.3472,
"eval_samples_per_second": 4.211,
"eval_steps_per_second": 0.351,
"step": 145000
},
{
"epoch": 7.626989373061473,
"grad_norm": 0.849775493144989,
"learning_rate": 9.417690605204915e-07,
"loss": 0.2765,
"step": 150000
},
{
"epoch": 7.626989373061473,
"eval_bleu": 46.6978,
"eval_gen_len": 55.7703,
"eval_loss": 0.7417660355567932,
"eval_runtime": 345.0669,
"eval_samples_per_second": 4.202,
"eval_steps_per_second": 0.351,
"step": 150000
},
{
"epoch": 7.8812223521635225,
"grad_norm": 1.0750905275344849,
"learning_rate": 3.060367854689766e-07,
"loss": 0.2777,
"step": 155000
},
{
"epoch": 7.8812223521635225,
"eval_bleu": 46.6901,
"eval_gen_len": 55.769,
"eval_loss": 0.7415376305580139,
"eval_runtime": 345.2438,
"eval_samples_per_second": 4.2,
"eval_steps_per_second": 0.35,
"step": 155000
}
],
"logging_steps": 5000,
"max_steps": 157336,
"num_input_tokens_seen": 0,
"num_train_epochs": 8,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.5274231723938816e+17,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}