hubert-base-ser / trainer_state.json
ZipperDeng's picture
End of training
cd63b02 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9988571428571429,
"eval_steps": 10,
"global_step": 437,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.022857142857142857,
"grad_norm": 6.883157253265381,
"learning_rate": 9.77116704805492e-05,
"loss": 0.9709,
"step": 10
},
{
"epoch": 0.022857142857142857,
"eval_accuracy": 0.6398571133613586,
"eval_loss": 0.8923419117927551,
"eval_runtime": 252.6626,
"eval_samples_per_second": 27.705,
"eval_steps_per_second": 6.926,
"step": 10
},
{
"epoch": 0.045714285714285714,
"grad_norm": 4.793847560882568,
"learning_rate": 9.542334096109841e-05,
"loss": 0.9219,
"step": 20
},
{
"epoch": 0.045714285714285714,
"eval_accuracy": 0.7664285898208618,
"eval_loss": 0.6903320550918579,
"eval_runtime": 260.5483,
"eval_samples_per_second": 26.866,
"eval_steps_per_second": 6.717,
"step": 20
},
{
"epoch": 0.06857142857142857,
"grad_norm": 6.191551685333252,
"learning_rate": 9.31350114416476e-05,
"loss": 0.7112,
"step": 30
},
{
"epoch": 0.06857142857142857,
"eval_accuracy": 0.7908571362495422,
"eval_loss": 0.5838488936424255,
"eval_runtime": 254.6091,
"eval_samples_per_second": 27.493,
"eval_steps_per_second": 6.873,
"step": 30
},
{
"epoch": 0.09142857142857143,
"grad_norm": 9.833272933959961,
"learning_rate": 9.08466819221968e-05,
"loss": 0.567,
"step": 40
},
{
"epoch": 0.09142857142857143,
"eval_accuracy": 0.8158571720123291,
"eval_loss": 0.5405334830284119,
"eval_runtime": 263.3184,
"eval_samples_per_second": 26.584,
"eval_steps_per_second": 6.646,
"step": 40
},
{
"epoch": 0.11428571428571428,
"grad_norm": 9.925666809082031,
"learning_rate": 8.878718535469108e-05,
"loss": 0.6184,
"step": 50
},
{
"epoch": 0.11428571428571428,
"eval_accuracy": 0.8581428527832031,
"eval_loss": 0.41476812958717346,
"eval_runtime": 259.1036,
"eval_samples_per_second": 27.016,
"eval_steps_per_second": 6.754,
"step": 50
},
{
"epoch": 0.13714285714285715,
"grad_norm": 3.723980665206909,
"learning_rate": 8.649885583524028e-05,
"loss": 0.5291,
"step": 60
},
{
"epoch": 0.13714285714285715,
"eval_accuracy": 0.8511428833007812,
"eval_loss": 0.44439756870269775,
"eval_runtime": 253.5826,
"eval_samples_per_second": 27.604,
"eval_steps_per_second": 6.901,
"step": 60
},
{
"epoch": 0.16,
"grad_norm": 10.508088111877441,
"learning_rate": 8.421052631578948e-05,
"loss": 0.533,
"step": 70
},
{
"epoch": 0.16,
"eval_accuracy": 0.8271428346633911,
"eval_loss": 0.4642958641052246,
"eval_runtime": 260.9488,
"eval_samples_per_second": 26.825,
"eval_steps_per_second": 6.706,
"step": 70
},
{
"epoch": 0.18285714285714286,
"grad_norm": 7.824756622314453,
"learning_rate": 8.192219679633868e-05,
"loss": 0.4753,
"step": 80
},
{
"epoch": 0.18285714285714286,
"eval_accuracy": 0.876714289188385,
"eval_loss": 0.35598087310791016,
"eval_runtime": 262.7831,
"eval_samples_per_second": 26.638,
"eval_steps_per_second": 6.659,
"step": 80
},
{
"epoch": 0.2057142857142857,
"grad_norm": 5.332316875457764,
"learning_rate": 7.963386727688788e-05,
"loss": 0.4252,
"step": 90
},
{
"epoch": 0.2057142857142857,
"eval_accuracy": 0.8102856874465942,
"eval_loss": 0.5888535380363464,
"eval_runtime": 262.7552,
"eval_samples_per_second": 26.641,
"eval_steps_per_second": 6.66,
"step": 90
},
{
"epoch": 0.22857142857142856,
"grad_norm": 17.482688903808594,
"learning_rate": 7.734553775743708e-05,
"loss": 0.5007,
"step": 100
},
{
"epoch": 0.22857142857142856,
"eval_accuracy": 0.8662857413291931,
"eval_loss": 0.38821107149124146,
"eval_runtime": 261.4572,
"eval_samples_per_second": 26.773,
"eval_steps_per_second": 6.693,
"step": 100
},
{
"epoch": 0.25142857142857145,
"grad_norm": 8.691084861755371,
"learning_rate": 7.505720823798627e-05,
"loss": 0.5605,
"step": 110
},
{
"epoch": 0.25142857142857145,
"eval_accuracy": 0.8921428322792053,
"eval_loss": 0.32210296392440796,
"eval_runtime": 261.1514,
"eval_samples_per_second": 26.804,
"eval_steps_per_second": 6.701,
"step": 110
},
{
"epoch": 0.2742857142857143,
"grad_norm": 11.754142761230469,
"learning_rate": 7.276887871853547e-05,
"loss": 0.4875,
"step": 120
},
{
"epoch": 0.2742857142857143,
"eval_accuracy": 0.8558571338653564,
"eval_loss": 0.36388570070266724,
"eval_runtime": 265.2182,
"eval_samples_per_second": 26.393,
"eval_steps_per_second": 6.598,
"step": 120
},
{
"epoch": 0.29714285714285715,
"grad_norm": 7.222925662994385,
"learning_rate": 7.048054919908466e-05,
"loss": 0.4277,
"step": 130
},
{
"epoch": 0.29714285714285715,
"eval_accuracy": 0.8745714426040649,
"eval_loss": 0.35708051919937134,
"eval_runtime": 264.6016,
"eval_samples_per_second": 26.455,
"eval_steps_per_second": 6.614,
"step": 130
},
{
"epoch": 0.32,
"grad_norm": 6.181695938110352,
"learning_rate": 6.819221967963387e-05,
"loss": 0.3415,
"step": 140
},
{
"epoch": 0.32,
"eval_accuracy": 0.8861428499221802,
"eval_loss": 0.33818891644477844,
"eval_runtime": 262.5039,
"eval_samples_per_second": 26.666,
"eval_steps_per_second": 6.667,
"step": 140
},
{
"epoch": 0.34285714285714286,
"grad_norm": 8.087543487548828,
"learning_rate": 6.590389016018307e-05,
"loss": 0.413,
"step": 150
},
{
"epoch": 0.34285714285714286,
"eval_accuracy": 0.9104285836219788,
"eval_loss": 0.2596481442451477,
"eval_runtime": 265.6837,
"eval_samples_per_second": 26.347,
"eval_steps_per_second": 6.587,
"step": 150
},
{
"epoch": 0.3657142857142857,
"grad_norm": 11.313796997070312,
"learning_rate": 6.361556064073226e-05,
"loss": 0.377,
"step": 160
},
{
"epoch": 0.3657142857142857,
"eval_accuracy": 0.8711428642272949,
"eval_loss": 0.3518799841403961,
"eval_runtime": 264.3798,
"eval_samples_per_second": 26.477,
"eval_steps_per_second": 6.619,
"step": 160
},
{
"epoch": 0.38857142857142857,
"grad_norm": 7.65640115737915,
"learning_rate": 6.132723112128147e-05,
"loss": 0.4219,
"step": 170
},
{
"epoch": 0.38857142857142857,
"eval_accuracy": 0.8947142958641052,
"eval_loss": 0.2979215681552887,
"eval_runtime": 262.8341,
"eval_samples_per_second": 26.633,
"eval_steps_per_second": 6.658,
"step": 170
},
{
"epoch": 0.4114285714285714,
"grad_norm": 6.2714433670043945,
"learning_rate": 5.903890160183066e-05,
"loss": 0.3317,
"step": 180
},
{
"epoch": 0.4114285714285714,
"eval_accuracy": 0.9225714206695557,
"eval_loss": 0.22266168892383575,
"eval_runtime": 265.1248,
"eval_samples_per_second": 26.403,
"eval_steps_per_second": 6.601,
"step": 180
},
{
"epoch": 0.4342857142857143,
"grad_norm": 8.710111618041992,
"learning_rate": 5.675057208237986e-05,
"loss": 0.3131,
"step": 190
},
{
"epoch": 0.4342857142857143,
"eval_accuracy": 0.8692857027053833,
"eval_loss": 0.3680011034011841,
"eval_runtime": 260.0056,
"eval_samples_per_second": 26.923,
"eval_steps_per_second": 6.731,
"step": 190
},
{
"epoch": 0.45714285714285713,
"grad_norm": 4.041360378265381,
"learning_rate": 5.446224256292907e-05,
"loss": 0.3266,
"step": 200
},
{
"epoch": 0.45714285714285713,
"eval_accuracy": 0.9308571219444275,
"eval_loss": 0.20981180667877197,
"eval_runtime": 256.153,
"eval_samples_per_second": 27.327,
"eval_steps_per_second": 6.832,
"step": 200
},
{
"epoch": 0.48,
"grad_norm": 10.932918548583984,
"learning_rate": 5.217391304347826e-05,
"loss": 0.3306,
"step": 210
},
{
"epoch": 0.48,
"eval_accuracy": 0.8824285864830017,
"eval_loss": 0.3848917782306671,
"eval_runtime": 253.9958,
"eval_samples_per_second": 27.56,
"eval_steps_per_second": 6.89,
"step": 210
},
{
"epoch": 0.5028571428571429,
"grad_norm": 9.440160751342773,
"learning_rate": 4.9885583524027466e-05,
"loss": 0.3037,
"step": 220
},
{
"epoch": 0.5028571428571429,
"eval_accuracy": 0.9024285674095154,
"eval_loss": 0.28518444299697876,
"eval_runtime": 259.3612,
"eval_samples_per_second": 26.989,
"eval_steps_per_second": 6.747,
"step": 220
},
{
"epoch": 0.5257142857142857,
"grad_norm": 9.196854591369629,
"learning_rate": 4.759725400457666e-05,
"loss": 0.3086,
"step": 230
},
{
"epoch": 0.5257142857142857,
"eval_accuracy": 0.9121428728103638,
"eval_loss": 0.272481232881546,
"eval_runtime": 254.9581,
"eval_samples_per_second": 27.455,
"eval_steps_per_second": 6.864,
"step": 230
},
{
"epoch": 0.5485714285714286,
"grad_norm": 6.610895156860352,
"learning_rate": 4.530892448512586e-05,
"loss": 0.2576,
"step": 240
},
{
"epoch": 0.5485714285714286,
"eval_accuracy": 0.9355714321136475,
"eval_loss": 0.18688350915908813,
"eval_runtime": 255.2292,
"eval_samples_per_second": 27.426,
"eval_steps_per_second": 6.857,
"step": 240
},
{
"epoch": 0.5714285714285714,
"grad_norm": 15.24905014038086,
"learning_rate": 4.302059496567506e-05,
"loss": 0.2469,
"step": 250
},
{
"epoch": 0.5714285714285714,
"eval_accuracy": 0.9242857098579407,
"eval_loss": 0.2262311726808548,
"eval_runtime": 254.9064,
"eval_samples_per_second": 27.461,
"eval_steps_per_second": 6.865,
"step": 250
},
{
"epoch": 0.5942857142857143,
"grad_norm": 9.8357515335083,
"learning_rate": 4.073226544622426e-05,
"loss": 0.2405,
"step": 260
},
{
"epoch": 0.5942857142857143,
"eval_accuracy": 0.9347142577171326,
"eval_loss": 0.19631564617156982,
"eval_runtime": 271.1966,
"eval_samples_per_second": 25.812,
"eval_steps_per_second": 6.453,
"step": 260
},
{
"epoch": 0.6171428571428571,
"grad_norm": 19.872060775756836,
"learning_rate": 3.844393592677346e-05,
"loss": 0.2802,
"step": 270
},
{
"epoch": 0.6171428571428571,
"eval_accuracy": 0.8804285526275635,
"eval_loss": 0.3679888844490051,
"eval_runtime": 256.0669,
"eval_samples_per_second": 27.337,
"eval_steps_per_second": 6.834,
"step": 270
},
{
"epoch": 0.64,
"grad_norm": 3.6445915699005127,
"learning_rate": 3.6155606407322653e-05,
"loss": 0.2442,
"step": 280
},
{
"epoch": 0.64,
"eval_accuracy": 0.9292857050895691,
"eval_loss": 0.20533673465251923,
"eval_runtime": 255.7952,
"eval_samples_per_second": 27.366,
"eval_steps_per_second": 6.841,
"step": 280
},
{
"epoch": 0.6628571428571428,
"grad_norm": 8.114418983459473,
"learning_rate": 3.3867276887871856e-05,
"loss": 0.2302,
"step": 290
},
{
"epoch": 0.6628571428571428,
"eval_accuracy": 0.8967142701148987,
"eval_loss": 0.3355866074562073,
"eval_runtime": 257.891,
"eval_samples_per_second": 27.143,
"eval_steps_per_second": 6.786,
"step": 290
},
{
"epoch": 0.6857142857142857,
"grad_norm": 5.993322372436523,
"learning_rate": 3.157894736842105e-05,
"loss": 0.2492,
"step": 300
},
{
"epoch": 0.6857142857142857,
"eval_accuracy": 0.9371428489685059,
"eval_loss": 0.18795913457870483,
"eval_runtime": 254.5882,
"eval_samples_per_second": 27.495,
"eval_steps_per_second": 6.874,
"step": 300
},
{
"epoch": 0.7085714285714285,
"grad_norm": 6.529418468475342,
"learning_rate": 2.9290617848970254e-05,
"loss": 0.2089,
"step": 310
},
{
"epoch": 0.7085714285714285,
"eval_accuracy": 0.928857147693634,
"eval_loss": 0.2076321393251419,
"eval_runtime": 260.5938,
"eval_samples_per_second": 26.862,
"eval_steps_per_second": 6.715,
"step": 310
},
{
"epoch": 0.7314285714285714,
"grad_norm": 6.433741092681885,
"learning_rate": 2.7002288329519453e-05,
"loss": 0.2824,
"step": 320
},
{
"epoch": 0.7314285714285714,
"eval_accuracy": 0.930142879486084,
"eval_loss": 0.1999480277299881,
"eval_runtime": 255.2396,
"eval_samples_per_second": 27.425,
"eval_steps_per_second": 6.856,
"step": 320
},
{
"epoch": 0.7542857142857143,
"grad_norm": 5.394837379455566,
"learning_rate": 2.4713958810068652e-05,
"loss": 0.2009,
"step": 330
},
{
"epoch": 0.7542857142857143,
"eval_accuracy": 0.9521428346633911,
"eval_loss": 0.14918017387390137,
"eval_runtime": 258.1497,
"eval_samples_per_second": 27.116,
"eval_steps_per_second": 6.779,
"step": 330
},
{
"epoch": 0.7771428571428571,
"grad_norm": 5.843348503112793,
"learning_rate": 2.242562929061785e-05,
"loss": 0.2001,
"step": 340
},
{
"epoch": 0.7771428571428571,
"eval_accuracy": 0.951714277267456,
"eval_loss": 0.14960123598575592,
"eval_runtime": 253.1262,
"eval_samples_per_second": 27.654,
"eval_steps_per_second": 6.914,
"step": 340
},
{
"epoch": 0.8,
"grad_norm": 7.778473377227783,
"learning_rate": 2.0137299771167047e-05,
"loss": 0.2298,
"step": 350
},
{
"epoch": 0.8,
"eval_accuracy": 0.9490000009536743,
"eval_loss": 0.15794885158538818,
"eval_runtime": 258.4154,
"eval_samples_per_second": 27.088,
"eval_steps_per_second": 6.772,
"step": 350
},
{
"epoch": 0.8228571428571428,
"grad_norm": 7.672749042510986,
"learning_rate": 1.784897025171625e-05,
"loss": 0.1802,
"step": 360
},
{
"epoch": 0.8228571428571428,
"eval_accuracy": 0.9501428604125977,
"eval_loss": 0.15056686103343964,
"eval_runtime": 253.0586,
"eval_samples_per_second": 27.662,
"eval_steps_per_second": 6.915,
"step": 360
},
{
"epoch": 0.8457142857142858,
"grad_norm": 7.994875431060791,
"learning_rate": 1.5560640732265445e-05,
"loss": 0.1914,
"step": 370
},
{
"epoch": 0.8457142857142858,
"eval_accuracy": 0.9311428666114807,
"eval_loss": 0.20363783836364746,
"eval_runtime": 261.3379,
"eval_samples_per_second": 26.785,
"eval_steps_per_second": 6.696,
"step": 370
},
{
"epoch": 0.8685714285714285,
"grad_norm": 3.988149404525757,
"learning_rate": 1.3272311212814645e-05,
"loss": 0.1897,
"step": 380
},
{
"epoch": 0.8685714285714285,
"eval_accuracy": 0.9382857084274292,
"eval_loss": 0.18375040590763092,
"eval_runtime": 256.8539,
"eval_samples_per_second": 27.253,
"eval_steps_per_second": 6.813,
"step": 380
},
{
"epoch": 0.8914285714285715,
"grad_norm": 7.280108451843262,
"learning_rate": 1.0983981693363844e-05,
"loss": 0.1203,
"step": 390
},
{
"epoch": 0.8914285714285715,
"eval_accuracy": 0.9504285454750061,
"eval_loss": 0.1459112912416458,
"eval_runtime": 256.3941,
"eval_samples_per_second": 27.302,
"eval_steps_per_second": 6.825,
"step": 390
},
{
"epoch": 0.9142857142857143,
"grad_norm": 6.386229991912842,
"learning_rate": 8.695652173913044e-06,
"loss": 0.1372,
"step": 400
},
{
"epoch": 0.9142857142857143,
"eval_accuracy": 0.9418571591377258,
"eval_loss": 0.1748434156179428,
"eval_runtime": 266.7645,
"eval_samples_per_second": 26.24,
"eval_steps_per_second": 6.56,
"step": 400
},
{
"epoch": 0.9371428571428572,
"grad_norm": 7.714508056640625,
"learning_rate": 6.407322654462243e-06,
"loss": 0.1942,
"step": 410
},
{
"epoch": 0.9371428571428572,
"eval_accuracy": 0.9405714273452759,
"eval_loss": 0.18131674826145172,
"eval_runtime": 266.6389,
"eval_samples_per_second": 26.253,
"eval_steps_per_second": 6.563,
"step": 410
},
{
"epoch": 0.96,
"grad_norm": 4.493211269378662,
"learning_rate": 4.118993135011442e-06,
"loss": 0.1886,
"step": 420
},
{
"epoch": 0.96,
"eval_accuracy": 0.9509999752044678,
"eval_loss": 0.15357272326946259,
"eval_runtime": 273.0321,
"eval_samples_per_second": 25.638,
"eval_steps_per_second": 6.41,
"step": 420
},
{
"epoch": 0.9828571428571429,
"grad_norm": 4.66563606262207,
"learning_rate": 1.8306636155606409e-06,
"loss": 0.1872,
"step": 430
},
{
"epoch": 0.9828571428571429,
"eval_accuracy": 0.952571451663971,
"eval_loss": 0.1465713381767273,
"eval_runtime": 266.7172,
"eval_samples_per_second": 26.245,
"eval_steps_per_second": 6.561,
"step": 430
},
{
"epoch": 0.9988571428571429,
"step": 437,
"total_flos": 1.3128537437918904e+18,
"train_loss": 0.3557066834218442,
"train_runtime": 12202.3201,
"train_samples_per_second": 2.295,
"train_steps_per_second": 0.036
}
],
"logging_steps": 10,
"max_steps": 437,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3128537437918904e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}