mt5-finetuned-easy-v2 / trainer_state.json
kallacharanteja's picture
easy_final — 2 epochs done
003bd44 verified
{
"best_global_step": 2000,
"best_metric": 4.951307773590088,
"best_model_checkpoint": "/kaggle/working/checkpoints/checkpoint-2000",
"epoch": 2.0,
"eval_steps": 500,
"global_step": 2452,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04079967360261118,
"grad_norm": 541.5139770507812,
"learning_rate": 3.310810810810811e-05,
"loss": 116.98884765625,
"step": 50
},
{
"epoch": 0.08159934720522236,
"grad_norm": 285.58978271484375,
"learning_rate": 4.947434819175778e-05,
"loss": 110.386572265625,
"step": 100
},
{
"epoch": 0.12239902080783353,
"grad_norm": 268.13824462890625,
"learning_rate": 4.8423044575273343e-05,
"loss": 103.394189453125,
"step": 150
},
{
"epoch": 0.1631986944104447,
"grad_norm": 321.2715148925781,
"learning_rate": 4.7371740958788904e-05,
"loss": 94.964775390625,
"step": 200
},
{
"epoch": 0.2039983680130559,
"grad_norm": 254.8035125732422,
"learning_rate": 4.6320437342304465e-05,
"loss": 89.278173828125,
"step": 250
},
{
"epoch": 0.24479804161566707,
"grad_norm": 125.82566833496094,
"learning_rate": 4.526913372582002e-05,
"loss": 83.7441796875,
"step": 300
},
{
"epoch": 0.28559771521827826,
"grad_norm": 220.39035034179688,
"learning_rate": 4.421783010933558e-05,
"loss": 77.793349609375,
"step": 350
},
{
"epoch": 0.3263973888208894,
"grad_norm": 171.42626953125,
"learning_rate": 4.316652649285114e-05,
"loss": 73.357890625,
"step": 400
},
{
"epoch": 0.3671970624235006,
"grad_norm": 98.98644256591797,
"learning_rate": 4.2115222876366694e-05,
"loss": 69.091845703125,
"step": 450
},
{
"epoch": 0.4079967360261118,
"grad_norm": 146.9388885498047,
"learning_rate": 4.1063919259882255e-05,
"loss": 65.099111328125,
"step": 500
},
{
"epoch": 0.4079967360261118,
"eval_loss": 10.7504301071167,
"eval_runtime": 13.6455,
"eval_samples_per_second": 58.114,
"eval_steps_per_second": 7.328,
"step": 500
},
{
"epoch": 0.44879640962872297,
"grad_norm": 156.0268096923828,
"learning_rate": 4.0012615643397815e-05,
"loss": 60.6353271484375,
"step": 550
},
{
"epoch": 0.48959608323133413,
"grad_norm": 47.11336135864258,
"learning_rate": 3.8961312026913376e-05,
"loss": 55.8525439453125,
"step": 600
},
{
"epoch": 0.5303957568339454,
"grad_norm": 102.2573471069336,
"learning_rate": 3.791000841042893e-05,
"loss": 51.6678466796875,
"step": 650
},
{
"epoch": 0.5711954304365565,
"grad_norm": 21.42934799194336,
"learning_rate": 3.685870479394449e-05,
"loss": 47.7211572265625,
"step": 700
},
{
"epoch": 0.6119951040391677,
"grad_norm": 18.592626571655273,
"learning_rate": 3.580740117746005e-05,
"loss": 44.35724609375,
"step": 750
},
{
"epoch": 0.6527947776417788,
"grad_norm": 47.07696533203125,
"learning_rate": 3.475609756097561e-05,
"loss": 41.478544921875,
"step": 800
},
{
"epoch": 0.69359445124439,
"grad_norm": 9.859332084655762,
"learning_rate": 3.370479394449117e-05,
"loss": 39.09307861328125,
"step": 850
},
{
"epoch": 0.7343941248470012,
"grad_norm": 5.591737747192383,
"learning_rate": 3.2653490328006734e-05,
"loss": 37.41315185546875,
"step": 900
},
{
"epoch": 0.7751937984496124,
"grad_norm": 21.721343994140625,
"learning_rate": 3.1602186711522294e-05,
"loss": 35.67253173828125,
"step": 950
},
{
"epoch": 0.8159934720522236,
"grad_norm": 4.840476989746094,
"learning_rate": 3.055088309503785e-05,
"loss": 34.03028564453125,
"step": 1000
},
{
"epoch": 0.8159934720522236,
"eval_loss": 6.185283184051514,
"eval_runtime": 13.5808,
"eval_samples_per_second": 58.391,
"eval_steps_per_second": 7.363,
"step": 1000
},
{
"epoch": 0.8567931456548348,
"grad_norm": 2.8973324298858643,
"learning_rate": 2.949957947855341e-05,
"loss": 33.30818603515625,
"step": 1050
},
{
"epoch": 0.8975928192574459,
"grad_norm": 1.8784422874450684,
"learning_rate": 2.844827586206897e-05,
"loss": 32.28,
"step": 1100
},
{
"epoch": 0.9383924928600571,
"grad_norm": 4.1729865074157715,
"learning_rate": 2.7396972245584523e-05,
"loss": 31.48105712890625,
"step": 1150
},
{
"epoch": 0.9791921664626683,
"grad_norm": 2.748990058898926,
"learning_rate": 2.6345668629100084e-05,
"loss": 30.878486328125,
"step": 1200
},
{
"epoch": 1.0195838433292534,
"grad_norm": 1.7622333765029907,
"learning_rate": 2.5294365012615645e-05,
"loss": 30.14660400390625,
"step": 1250
},
{
"epoch": 1.0603835169318645,
"grad_norm": 2.544262647628784,
"learning_rate": 2.4243061396131202e-05,
"loss": 29.9266455078125,
"step": 1300
},
{
"epoch": 1.1011831905344758,
"grad_norm": 1.752083659172058,
"learning_rate": 2.3191757779646763e-05,
"loss": 29.6922900390625,
"step": 1350
},
{
"epoch": 1.1419828641370868,
"grad_norm": 1.3774715662002563,
"learning_rate": 2.2140454163162324e-05,
"loss": 29.419208984375,
"step": 1400
},
{
"epoch": 1.182782537739698,
"grad_norm": 1.3631178140640259,
"learning_rate": 2.1089150546677884e-05,
"loss": 29.1697119140625,
"step": 1450
},
{
"epoch": 1.2235822113423094,
"grad_norm": 1.5763075351715088,
"learning_rate": 2.003784693019344e-05,
"loss": 28.77746826171875,
"step": 1500
},
{
"epoch": 1.2235822113423094,
"eval_loss": 5.188190460205078,
"eval_runtime": 13.686,
"eval_samples_per_second": 57.942,
"eval_steps_per_second": 7.307,
"step": 1500
},
{
"epoch": 1.2643818849449204,
"grad_norm": 1.314376950263977,
"learning_rate": 1.8986543313709e-05,
"loss": 28.54221923828125,
"step": 1550
},
{
"epoch": 1.3051815585475315,
"grad_norm": 8.528578758239746,
"learning_rate": 1.793523969722456e-05,
"loss": 28.40603515625,
"step": 1600
},
{
"epoch": 1.3459812321501428,
"grad_norm": 8.729783058166504,
"learning_rate": 1.6883936080740117e-05,
"loss": 28.39514892578125,
"step": 1650
},
{
"epoch": 1.386780905752754,
"grad_norm": 1.1846702098846436,
"learning_rate": 1.5832632464255678e-05,
"loss": 28.02635009765625,
"step": 1700
},
{
"epoch": 1.427580579355365,
"grad_norm": 6.678456783294678,
"learning_rate": 1.4781328847771237e-05,
"loss": 27.882392578125,
"step": 1750
},
{
"epoch": 1.4683802529579764,
"grad_norm": 1.1704760789871216,
"learning_rate": 1.3730025231286797e-05,
"loss": 27.820341796875,
"step": 1800
},
{
"epoch": 1.5091799265605874,
"grad_norm": 4.103906154632568,
"learning_rate": 1.2678721614802355e-05,
"loss": 27.7095068359375,
"step": 1850
},
{
"epoch": 1.5499796001631987,
"grad_norm": 1.0563730001449585,
"learning_rate": 1.1627417998317915e-05,
"loss": 27.6540380859375,
"step": 1900
},
{
"epoch": 1.59077927376581,
"grad_norm": 1.4707422256469727,
"learning_rate": 1.0576114381833474e-05,
"loss": 27.472509765625,
"step": 1950
},
{
"epoch": 1.631578947368421,
"grad_norm": 1.8527077436447144,
"learning_rate": 9.524810765349033e-06,
"loss": 27.4132763671875,
"step": 2000
},
{
"epoch": 1.631578947368421,
"eval_loss": 4.951307773590088,
"eval_runtime": 13.75,
"eval_samples_per_second": 57.673,
"eval_steps_per_second": 7.273,
"step": 2000
},
{
"epoch": 1.672378620971032,
"grad_norm": 1.2665544748306274,
"learning_rate": 8.473507148864592e-06,
"loss": 27.26576416015625,
"step": 2050
},
{
"epoch": 1.7131782945736433,
"grad_norm": 1.6418918371200562,
"learning_rate": 7.422203532380152e-06,
"loss": 27.26886474609375,
"step": 2100
},
{
"epoch": 1.7539779681762546,
"grad_norm": 1.1888850927352905,
"learning_rate": 6.370899915895712e-06,
"loss": 27.21632568359375,
"step": 2150
},
{
"epoch": 1.794777641778866,
"grad_norm": 1.3705418109893799,
"learning_rate": 5.31959629941127e-06,
"loss": 27.19008544921875,
"step": 2200
},
{
"epoch": 1.835577315381477,
"grad_norm": 1.1785783767700195,
"learning_rate": 4.26829268292683e-06,
"loss": 27.07623046875,
"step": 2250
},
{
"epoch": 1.876376988984088,
"grad_norm": 1.0306082963943481,
"learning_rate": 3.2169890664423886e-06,
"loss": 27.08888916015625,
"step": 2300
},
{
"epoch": 1.9171766625866993,
"grad_norm": 5.090639114379883,
"learning_rate": 2.165685449957948e-06,
"loss": 26.97588623046875,
"step": 2350
},
{
"epoch": 1.9579763361893106,
"grad_norm": 1.3182185888290405,
"learning_rate": 1.1143818334735072e-06,
"loss": 27.0556298828125,
"step": 2400
},
{
"epoch": 1.9987760097919218,
"grad_norm": 1.3252017498016357,
"learning_rate": 6.307821698906644e-08,
"loss": 26.97030517578125,
"step": 2450
},
{
"epoch": 2.0,
"step": 2452,
"total_flos": 2.081454582398976e+16,
"train_loss": 44.073582747243165,
"train_runtime": 6114.8596,
"train_samples_per_second": 25.649,
"train_steps_per_second": 0.401
}
],
"logging_steps": 50,
"max_steps": 2452,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.081454582398976e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}