mt5-finetuned-easy-v2 / trainer_state.json

easy_final — 2 epochs done

003bd44 verified about 2 months ago

10.9 kB

	{
	"best_global_step": 2000,
	"best_metric": 4.951307773590088,
	"best_model_checkpoint": "/kaggle/working/checkpoints/checkpoint-2000",
	"epoch": 2.0,
	"eval_steps": 500,
	"global_step": 2452,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"epoch": 0.04079967360261118,
	"grad_norm": 541.5139770507812,
	"learning_rate": 3.310810810810811e-05,
	"loss": 116.98884765625,
	"step": 50
	},
	{
	"epoch": 0.08159934720522236,
	"grad_norm": 285.58978271484375,
	"learning_rate": 4.947434819175778e-05,
	"loss": 110.386572265625,
	"step": 100
	},
	{
	"epoch": 0.12239902080783353,
	"grad_norm": 268.13824462890625,
	"learning_rate": 4.8423044575273343e-05,
	"loss": 103.394189453125,
	"step": 150
	},
	{
	"epoch": 0.1631986944104447,
	"grad_norm": 321.2715148925781,
	"learning_rate": 4.7371740958788904e-05,
	"loss": 94.964775390625,
	"step": 200
	},
	{
	"epoch": 0.2039983680130559,
	"grad_norm": 254.8035125732422,
	"learning_rate": 4.6320437342304465e-05,
	"loss": 89.278173828125,
	"step": 250
	},
	{
	"epoch": 0.24479804161566707,
	"grad_norm": 125.82566833496094,
	"learning_rate": 4.526913372582002e-05,
	"loss": 83.7441796875,
	"step": 300
	},
	{
	"epoch": 0.28559771521827826,
	"grad_norm": 220.39035034179688,
	"learning_rate": 4.421783010933558e-05,
	"loss": 77.793349609375,
	"step": 350
	},
	{
	"epoch": 0.3263973888208894,
	"grad_norm": 171.42626953125,
	"learning_rate": 4.316652649285114e-05,
	"loss": 73.357890625,
	"step": 400
	},
	{
	"epoch": 0.3671970624235006,
	"grad_norm": 98.98644256591797,
	"learning_rate": 4.2115222876366694e-05,
	"loss": 69.091845703125,
	"step": 450
	},
	{
	"epoch": 0.4079967360261118,
	"grad_norm": 146.9388885498047,
	"learning_rate": 4.1063919259882255e-05,
	"loss": 65.099111328125,
	"step": 500
	},
	{
	"epoch": 0.4079967360261118,
	"eval_loss": 10.7504301071167,
	"eval_runtime": 13.6455,
	"eval_samples_per_second": 58.114,
	"eval_steps_per_second": 7.328,
	"step": 500
	},
	{
	"epoch": 0.44879640962872297,
	"grad_norm": 156.0268096923828,
	"learning_rate": 4.0012615643397815e-05,
	"loss": 60.6353271484375,
	"step": 550
	},
	{
	"epoch": 0.48959608323133413,
	"grad_norm": 47.11336135864258,
	"learning_rate": 3.8961312026913376e-05,
	"loss": 55.8525439453125,
	"step": 600
	},
	{
	"epoch": 0.5303957568339454,
	"grad_norm": 102.2573471069336,
	"learning_rate": 3.791000841042893e-05,
	"loss": 51.6678466796875,
	"step": 650
	},
	{
	"epoch": 0.5711954304365565,
	"grad_norm": 21.42934799194336,
	"learning_rate": 3.685870479394449e-05,
	"loss": 47.7211572265625,
	"step": 700
	},
	{
	"epoch": 0.6119951040391677,
	"grad_norm": 18.592626571655273,
	"learning_rate": 3.580740117746005e-05,
	"loss": 44.35724609375,
	"step": 750
	},
	{
	"epoch": 0.6527947776417788,
	"grad_norm": 47.07696533203125,
	"learning_rate": 3.475609756097561e-05,
	"loss": 41.478544921875,
	"step": 800
	},
	{
	"epoch": 0.69359445124439,
	"grad_norm": 9.859332084655762,
	"learning_rate": 3.370479394449117e-05,
	"loss": 39.09307861328125,
	"step": 850
	},
	{
	"epoch": 0.7343941248470012,
	"grad_norm": 5.591737747192383,
	"learning_rate": 3.2653490328006734e-05,
	"loss": 37.41315185546875,
	"step": 900
	},
	{
	"epoch": 0.7751937984496124,
	"grad_norm": 21.721343994140625,
	"learning_rate": 3.1602186711522294e-05,
	"loss": 35.67253173828125,
	"step": 950
	},
	{
	"epoch": 0.8159934720522236,
	"grad_norm": 4.840476989746094,
	"learning_rate": 3.055088309503785e-05,
	"loss": 34.03028564453125,
	"step": 1000
	},
	{
	"epoch": 0.8159934720522236,
	"eval_loss": 6.185283184051514,
	"eval_runtime": 13.5808,
	"eval_samples_per_second": 58.391,
	"eval_steps_per_second": 7.363,
	"step": 1000
	},
	{
	"epoch": 0.8567931456548348,
	"grad_norm": 2.8973324298858643,
	"learning_rate": 2.949957947855341e-05,
	"loss": 33.30818603515625,
	"step": 1050
	},
	{
	"epoch": 0.8975928192574459,
	"grad_norm": 1.8784422874450684,
	"learning_rate": 2.844827586206897e-05,
	"loss": 32.28,
	"step": 1100
	},
	{
	"epoch": 0.9383924928600571,
	"grad_norm": 4.1729865074157715,
	"learning_rate": 2.7396972245584523e-05,
	"loss": 31.48105712890625,
	"step": 1150
	},
	{
	"epoch": 0.9791921664626683,
	"grad_norm": 2.748990058898926,
	"learning_rate": 2.6345668629100084e-05,
	"loss": 30.878486328125,
	"step": 1200
	},
	{
	"epoch": 1.0195838433292534,
	"grad_norm": 1.7622333765029907,
	"learning_rate": 2.5294365012615645e-05,
	"loss": 30.14660400390625,
	"step": 1250
	},
	{
	"epoch": 1.0603835169318645,
	"grad_norm": 2.544262647628784,
	"learning_rate": 2.4243061396131202e-05,
	"loss": 29.9266455078125,
	"step": 1300
	},
	{
	"epoch": 1.1011831905344758,
	"grad_norm": 1.752083659172058,
	"learning_rate": 2.3191757779646763e-05,
	"loss": 29.6922900390625,
	"step": 1350
	},
	{
	"epoch": 1.1419828641370868,
	"grad_norm": 1.3774715662002563,
	"learning_rate": 2.2140454163162324e-05,
	"loss": 29.419208984375,
	"step": 1400
	},
	{
	"epoch": 1.182782537739698,
	"grad_norm": 1.3631178140640259,
	"learning_rate": 2.1089150546677884e-05,
	"loss": 29.1697119140625,
	"step": 1450
	},
	{
	"epoch": 1.2235822113423094,
	"grad_norm": 1.5763075351715088,
	"learning_rate": 2.003784693019344e-05,
	"loss": 28.77746826171875,
	"step": 1500
	},
	{
	"epoch": 1.2235822113423094,
	"eval_loss": 5.188190460205078,
	"eval_runtime": 13.686,
	"eval_samples_per_second": 57.942,
	"eval_steps_per_second": 7.307,
	"step": 1500
	},
	{
	"epoch": 1.2643818849449204,
	"grad_norm": 1.314376950263977,
	"learning_rate": 1.8986543313709e-05,
	"loss": 28.54221923828125,
	"step": 1550
	},
	{
	"epoch": 1.3051815585475315,
	"grad_norm": 8.528578758239746,
	"learning_rate": 1.793523969722456e-05,
	"loss": 28.40603515625,
	"step": 1600
	},
	{
	"epoch": 1.3459812321501428,
	"grad_norm": 8.729783058166504,
	"learning_rate": 1.6883936080740117e-05,
	"loss": 28.39514892578125,
	"step": 1650
	},
	{
	"epoch": 1.386780905752754,
	"grad_norm": 1.1846702098846436,
	"learning_rate": 1.5832632464255678e-05,
	"loss": 28.02635009765625,
	"step": 1700
	},
	{
	"epoch": 1.427580579355365,
	"grad_norm": 6.678456783294678,
	"learning_rate": 1.4781328847771237e-05,
	"loss": 27.882392578125,
	"step": 1750
	},
	{
	"epoch": 1.4683802529579764,
	"grad_norm": 1.1704760789871216,
	"learning_rate": 1.3730025231286797e-05,
	"loss": 27.820341796875,
	"step": 1800
	},
	{
	"epoch": 1.5091799265605874,
	"grad_norm": 4.103906154632568,
	"learning_rate": 1.2678721614802355e-05,
	"loss": 27.7095068359375,
	"step": 1850
	},
	{
	"epoch": 1.5499796001631987,
	"grad_norm": 1.0563730001449585,
	"learning_rate": 1.1627417998317915e-05,
	"loss": 27.6540380859375,
	"step": 1900
	},
	{
	"epoch": 1.59077927376581,
	"grad_norm": 1.4707422256469727,
	"learning_rate": 1.0576114381833474e-05,
	"loss": 27.472509765625,
	"step": 1950
	},
	{
	"epoch": 1.631578947368421,
	"grad_norm": 1.8527077436447144,
	"learning_rate": 9.524810765349033e-06,
	"loss": 27.4132763671875,
	"step": 2000
	},
	{
	"epoch": 1.631578947368421,
	"eval_loss": 4.951307773590088,
	"eval_runtime": 13.75,
	"eval_samples_per_second": 57.673,
	"eval_steps_per_second": 7.273,
	"step": 2000
	},
	{
	"epoch": 1.672378620971032,
	"grad_norm": 1.2665544748306274,
	"learning_rate": 8.473507148864592e-06,
	"loss": 27.26576416015625,
	"step": 2050
	},
	{
	"epoch": 1.7131782945736433,
	"grad_norm": 1.6418918371200562,
	"learning_rate": 7.422203532380152e-06,
	"loss": 27.26886474609375,
	"step": 2100
	},
	{
	"epoch": 1.7539779681762546,
	"grad_norm": 1.1888850927352905,
	"learning_rate": 6.370899915895712e-06,
	"loss": 27.21632568359375,
	"step": 2150
	},
	{
	"epoch": 1.794777641778866,
	"grad_norm": 1.3705418109893799,
	"learning_rate": 5.31959629941127e-06,
	"loss": 27.19008544921875,
	"step": 2200
	},
	{
	"epoch": 1.835577315381477,
	"grad_norm": 1.1785783767700195,
	"learning_rate": 4.26829268292683e-06,
	"loss": 27.07623046875,
	"step": 2250
	},
	{
	"epoch": 1.876376988984088,
	"grad_norm": 1.0306082963943481,
	"learning_rate": 3.2169890664423886e-06,
	"loss": 27.08888916015625,
	"step": 2300
	},
	{
	"epoch": 1.9171766625866993,
	"grad_norm": 5.090639114379883,
	"learning_rate": 2.165685449957948e-06,
	"loss": 26.97588623046875,
	"step": 2350
	},
	{
	"epoch": 1.9579763361893106,
	"grad_norm": 1.3182185888290405,
	"learning_rate": 1.1143818334735072e-06,
	"loss": 27.0556298828125,
	"step": 2400
	},
	{
	"epoch": 1.9987760097919218,
	"grad_norm": 1.3252017498016357,
	"learning_rate": 6.307821698906644e-08,
	"loss": 26.97030517578125,
	"step": 2450
	},
	{
	"epoch": 2.0,
	"step": 2452,
	"total_flos": 2.081454582398976e+16,
	"train_loss": 44.073582747243165,
	"train_runtime": 6114.8596,
	"train_samples_per_second": 25.649,
	"train_steps_per_second": 0.401
	}
	],
	"logging_steps": 50,
	"max_steps": 2452,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 2,
	"save_steps": 500,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": true
	},
	"attributes": {}
	}
	},
	"total_flos": 2.081454582398976e+16,
	"train_batch_size": 16,
	"trial_name": null,
	"trial_params": null
	}