bert-base-mti881 / trainer_state.json
Ben10x's picture
End of training
a3e2852 verified
{
"best_global_step": 8775,
"best_metric": 2.2569968700408936,
"best_model_checkpoint": "./output/bert-base-mti881/checkpoint-8775",
"epoch": 15.0,
"eval_steps": 500,
"global_step": 43875,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.17094017094017094,
"grad_norm": 1.6328529119491577,
"learning_rate": 4.943019943019943e-05,
"loss": 2.469,
"step": 500
},
{
"epoch": 0.3418803418803419,
"grad_norm": 1.7880568504333496,
"learning_rate": 4.886039886039887e-05,
"loss": 2.3525,
"step": 1000
},
{
"epoch": 0.5128205128205128,
"grad_norm": 2.440093994140625,
"learning_rate": 4.829059829059829e-05,
"loss": 2.331,
"step": 1500
},
{
"epoch": 0.6837606837606838,
"grad_norm": 2.336617946624756,
"learning_rate": 4.772079772079772e-05,
"loss": 2.3127,
"step": 2000
},
{
"epoch": 0.8547008547008547,
"grad_norm": 2.22334885597229,
"learning_rate": 4.7150997150997157e-05,
"loss": 2.2999,
"step": 2500
},
{
"epoch": 1.0,
"eval_accuracy": 0.8566329784624442,
"eval_f1": 0.5589353612167302,
"eval_loss": 2.286189317703247,
"eval_precision": 0.5451906796742724,
"eval_recall": 0.5733909946578479,
"eval_runtime": 6.4563,
"eval_samples_per_second": 452.889,
"eval_steps_per_second": 56.689,
"step": 2925
},
{
"epoch": 1.0256410256410255,
"grad_norm": 1.5501340627670288,
"learning_rate": 4.6581196581196586e-05,
"loss": 2.2831,
"step": 3000
},
{
"epoch": 1.1965811965811965,
"grad_norm": 1.7197738885879517,
"learning_rate": 4.6011396011396016e-05,
"loss": 2.2335,
"step": 3500
},
{
"epoch": 1.3675213675213675,
"grad_norm": 2.045734405517578,
"learning_rate": 4.544159544159544e-05,
"loss": 2.2371,
"step": 4000
},
{
"epoch": 1.5384615384615383,
"grad_norm": 1.6353585720062256,
"learning_rate": 4.4871794871794874e-05,
"loss": 2.2339,
"step": 4500
},
{
"epoch": 1.7094017094017095,
"grad_norm": 2.460322141647339,
"learning_rate": 4.4301994301994304e-05,
"loss": 2.233,
"step": 5000
},
{
"epoch": 1.8803418803418803,
"grad_norm": 1.5123356580734253,
"learning_rate": 4.3732193732193733e-05,
"loss": 2.2263,
"step": 5500
},
{
"epoch": 2.0,
"eval_accuracy": 0.8603247804543002,
"eval_f1": 0.5899178255372945,
"eval_loss": 2.275588274002075,
"eval_precision": 0.5522597825282936,
"eval_recall": 0.6330874247434919,
"eval_runtime": 5.6077,
"eval_samples_per_second": 521.421,
"eval_steps_per_second": 65.267,
"step": 5850
},
{
"epoch": 2.051282051282051,
"grad_norm": 1.0685631036758423,
"learning_rate": 4.316239316239317e-05,
"loss": 2.2079,
"step": 6000
},
{
"epoch": 2.2222222222222223,
"grad_norm": 1.3912978172302246,
"learning_rate": 4.259259259259259e-05,
"loss": 2.1651,
"step": 6500
},
{
"epoch": 2.393162393162393,
"grad_norm": 1.7378512620925903,
"learning_rate": 4.202279202279202e-05,
"loss": 2.1688,
"step": 7000
},
{
"epoch": 2.564102564102564,
"grad_norm": 2.20090913772583,
"learning_rate": 4.145299145299146e-05,
"loss": 2.1664,
"step": 7500
},
{
"epoch": 2.735042735042735,
"grad_norm": 1.6386638879776,
"learning_rate": 4.088319088319089e-05,
"loss": 2.1683,
"step": 8000
},
{
"epoch": 2.905982905982906,
"grad_norm": 0.7773854732513428,
"learning_rate": 4.031339031339032e-05,
"loss": 2.1624,
"step": 8500
},
{
"epoch": 3.0,
"eval_accuracy": 0.8788941765196487,
"eval_f1": 0.6219396959024139,
"eval_loss": 2.2569968700408936,
"eval_precision": 0.6301679867699539,
"eval_recall": 0.6139235139489527,
"eval_runtime": 5.624,
"eval_samples_per_second": 519.914,
"eval_steps_per_second": 65.078,
"step": 8775
},
{
"epoch": 3.076923076923077,
"grad_norm": 2.1296703815460205,
"learning_rate": 3.974358974358974e-05,
"loss": 2.1407,
"step": 9000
},
{
"epoch": 3.247863247863248,
"grad_norm": 3.029876708984375,
"learning_rate": 3.9173789173789176e-05,
"loss": 2.1139,
"step": 9500
},
{
"epoch": 3.4188034188034186,
"grad_norm": 2.393371820449829,
"learning_rate": 3.8603988603988605e-05,
"loss": 2.117,
"step": 10000
},
{
"epoch": 3.58974358974359,
"grad_norm": 3.3726866245269775,
"learning_rate": 3.8034188034188035e-05,
"loss": 2.1141,
"step": 10500
},
{
"epoch": 3.7606837606837606,
"grad_norm": 1.123772382736206,
"learning_rate": 3.746438746438747e-05,
"loss": 2.1151,
"step": 11000
},
{
"epoch": 3.931623931623932,
"grad_norm": 2.8514039516448975,
"learning_rate": 3.6894586894586894e-05,
"loss": 2.1192,
"step": 11500
},
{
"epoch": 4.0,
"eval_accuracy": 0.8776431339842026,
"eval_f1": 0.6255963151834184,
"eval_loss": 2.269813299179077,
"eval_precision": 0.6073624530863212,
"eval_recall": 0.6449588739082507,
"eval_runtime": 5.588,
"eval_samples_per_second": 523.26,
"eval_steps_per_second": 65.497,
"step": 11700
},
{
"epoch": 4.102564102564102,
"grad_norm": 0.6382957100868225,
"learning_rate": 3.6324786324786323e-05,
"loss": 2.0942,
"step": 12000
},
{
"epoch": 4.273504273504273,
"grad_norm": 2.4572439193725586,
"learning_rate": 3.575498575498576e-05,
"loss": 2.079,
"step": 12500
},
{
"epoch": 4.444444444444445,
"grad_norm": 4.030599117279053,
"learning_rate": 3.518518518518519e-05,
"loss": 2.0824,
"step": 13000
},
{
"epoch": 4.615384615384615,
"grad_norm": 1.3176660537719727,
"learning_rate": 3.461538461538462e-05,
"loss": 2.0844,
"step": 13500
},
{
"epoch": 4.786324786324786,
"grad_norm": 2.164088010787964,
"learning_rate": 3.404558404558404e-05,
"loss": 2.0804,
"step": 14000
},
{
"epoch": 4.957264957264957,
"grad_norm": 6.9171552658081055,
"learning_rate": 3.347578347578348e-05,
"loss": 2.0896,
"step": 14500
},
{
"epoch": 5.0,
"eval_accuracy": 0.8791149487317863,
"eval_f1": 0.6324646008618922,
"eval_loss": 2.2901737689971924,
"eval_precision": 0.612869869551384,
"eval_recall": 0.6533536843890444,
"eval_runtime": 5.621,
"eval_samples_per_second": 520.196,
"eval_steps_per_second": 65.113,
"step": 14625
},
{
"epoch": 5.128205128205128,
"grad_norm": 2.0550243854522705,
"learning_rate": 3.290598290598291e-05,
"loss": 2.0631,
"step": 15000
},
{
"epoch": 5.299145299145299,
"grad_norm": 1.2494322061538696,
"learning_rate": 3.2336182336182337e-05,
"loss": 2.0561,
"step": 15500
},
{
"epoch": 5.47008547008547,
"grad_norm": 2.4397966861724854,
"learning_rate": 3.176638176638177e-05,
"loss": 2.058,
"step": 16000
},
{
"epoch": 5.641025641025641,
"grad_norm": 2.813675880432129,
"learning_rate": 3.1196581196581195e-05,
"loss": 2.0611,
"step": 16500
},
{
"epoch": 5.811965811965812,
"grad_norm": 1.493696928024292,
"learning_rate": 3.0626780626780625e-05,
"loss": 2.0609,
"step": 17000
},
{
"epoch": 5.982905982905983,
"grad_norm": 2.580273389816284,
"learning_rate": 3.005698005698006e-05,
"loss": 2.0621,
"step": 17500
},
{
"epoch": 6.0,
"eval_accuracy": 0.8811141637639209,
"eval_f1": 0.6368790156637131,
"eval_loss": 2.2965099811553955,
"eval_precision": 0.630865224625624,
"eval_recall": 0.6430085644026117,
"eval_runtime": 5.5652,
"eval_samples_per_second": 525.412,
"eval_steps_per_second": 65.766,
"step": 17550
},
{
"epoch": 6.153846153846154,
"grad_norm": 3.136852741241455,
"learning_rate": 2.948717948717949e-05,
"loss": 2.0441,
"step": 18000
},
{
"epoch": 6.3247863247863245,
"grad_norm": 1.3632102012634277,
"learning_rate": 2.8917378917378917e-05,
"loss": 2.0433,
"step": 18500
},
{
"epoch": 6.495726495726496,
"grad_norm": 4.941199779510498,
"learning_rate": 2.8347578347578346e-05,
"loss": 2.0427,
"step": 19000
},
{
"epoch": 6.666666666666667,
"grad_norm": 2.8133013248443604,
"learning_rate": 2.777777777777778e-05,
"loss": 2.0436,
"step": 19500
},
{
"epoch": 6.837606837606837,
"grad_norm": 1.1807732582092285,
"learning_rate": 2.720797720797721e-05,
"loss": 2.0442,
"step": 20000
},
{
"epoch": 7.0,
"eval_accuracy": 0.8829661973212971,
"eval_f1": 0.6478284496091627,
"eval_loss": 2.306105852127075,
"eval_precision": 0.6388293487221764,
"eval_recall": 0.657084711269397,
"eval_runtime": 5.5992,
"eval_samples_per_second": 522.219,
"eval_steps_per_second": 65.367,
"step": 20475
},
{
"epoch": 7.0085470085470085,
"grad_norm": 1.7212845087051392,
"learning_rate": 2.6638176638176638e-05,
"loss": 2.0458,
"step": 20500
},
{
"epoch": 7.17948717948718,
"grad_norm": 2.134288787841797,
"learning_rate": 2.606837606837607e-05,
"loss": 2.0322,
"step": 21000
},
{
"epoch": 7.35042735042735,
"grad_norm": 2.6075599193573,
"learning_rate": 2.54985754985755e-05,
"loss": 2.033,
"step": 21500
},
{
"epoch": 7.521367521367521,
"grad_norm": 0.940613329410553,
"learning_rate": 2.492877492877493e-05,
"loss": 2.0315,
"step": 22000
},
{
"epoch": 7.6923076923076925,
"grad_norm": 5.997873783111572,
"learning_rate": 2.435897435897436e-05,
"loss": 2.0317,
"step": 22500
},
{
"epoch": 7.863247863247864,
"grad_norm": 1.9498519897460938,
"learning_rate": 2.3789173789173792e-05,
"loss": 2.0301,
"step": 23000
},
{
"epoch": 8.0,
"eval_accuracy": 0.8818132757690232,
"eval_f1": 0.6476386036960986,
"eval_loss": 2.3260273933410645,
"eval_precision": 0.6279366090626742,
"eval_recall": 0.6686169761723056,
"eval_runtime": 5.5616,
"eval_samples_per_second": 525.751,
"eval_steps_per_second": 65.809,
"step": 23400
},
{
"epoch": 8.034188034188034,
"grad_norm": 3.1696274280548096,
"learning_rate": 2.321937321937322e-05,
"loss": 2.0325,
"step": 23500
},
{
"epoch": 8.205128205128204,
"grad_norm": 0.9211856126785278,
"learning_rate": 2.264957264957265e-05,
"loss": 2.0222,
"step": 24000
},
{
"epoch": 8.376068376068377,
"grad_norm": 2.332916259765625,
"learning_rate": 2.207977207977208e-05,
"loss": 2.0244,
"step": 24500
},
{
"epoch": 8.547008547008547,
"grad_norm": 1.2731038331985474,
"learning_rate": 2.150997150997151e-05,
"loss": 2.0242,
"step": 25000
},
{
"epoch": 8.717948717948717,
"grad_norm": 0.8299376964569092,
"learning_rate": 2.0940170940170943e-05,
"loss": 2.0238,
"step": 25500
},
{
"epoch": 8.88888888888889,
"grad_norm": 1.503308892250061,
"learning_rate": 2.037037037037037e-05,
"loss": 2.0242,
"step": 26000
},
{
"epoch": 9.0,
"eval_accuracy": 0.8830275229357798,
"eval_f1": 0.6493926454127109,
"eval_loss": 2.3398172855377197,
"eval_precision": 0.6353017521090201,
"eval_recall": 0.6641227847027897,
"eval_runtime": 5.6249,
"eval_samples_per_second": 519.833,
"eval_steps_per_second": 65.068,
"step": 26325
},
{
"epoch": 9.05982905982906,
"grad_norm": 1.7587120532989502,
"learning_rate": 1.9800569800569802e-05,
"loss": 2.0226,
"step": 26500
},
{
"epoch": 9.23076923076923,
"grad_norm": 0.7542155385017395,
"learning_rate": 1.923076923076923e-05,
"loss": 2.0177,
"step": 27000
},
{
"epoch": 9.401709401709402,
"grad_norm": 0.33988329768180847,
"learning_rate": 1.866096866096866e-05,
"loss": 2.0203,
"step": 27500
},
{
"epoch": 9.572649572649572,
"grad_norm": 1.8626066446304321,
"learning_rate": 1.8091168091168094e-05,
"loss": 2.0175,
"step": 28000
},
{
"epoch": 9.743589743589745,
"grad_norm": 2.40765118598938,
"learning_rate": 1.752136752136752e-05,
"loss": 2.0183,
"step": 28500
},
{
"epoch": 9.914529914529915,
"grad_norm": 2.155571222305298,
"learning_rate": 1.6951566951566953e-05,
"loss": 2.0173,
"step": 29000
},
{
"epoch": 10.0,
"eval_accuracy": 0.8841559142422607,
"eval_f1": 0.652157598499062,
"eval_loss": 2.3391082286834717,
"eval_precision": 0.641486220472441,
"eval_recall": 0.6631900279827017,
"eval_runtime": 5.5617,
"eval_samples_per_second": 525.742,
"eval_steps_per_second": 65.808,
"step": 29250
},
{
"epoch": 10.085470085470085,
"grad_norm": 1.149816870689392,
"learning_rate": 1.6381766381766382e-05,
"loss": 2.0171,
"step": 29500
},
{
"epoch": 10.256410256410255,
"grad_norm": 0.5041487812995911,
"learning_rate": 1.581196581196581e-05,
"loss": 2.0133,
"step": 30000
},
{
"epoch": 10.427350427350428,
"grad_norm": 6.211667537689209,
"learning_rate": 1.5242165242165243e-05,
"loss": 2.0144,
"step": 30500
},
{
"epoch": 10.598290598290598,
"grad_norm": 0.1538165956735611,
"learning_rate": 1.4672364672364672e-05,
"loss": 2.0135,
"step": 31000
},
{
"epoch": 10.76923076923077,
"grad_norm": 1.0518053770065308,
"learning_rate": 1.4102564102564104e-05,
"loss": 2.0128,
"step": 31500
},
{
"epoch": 10.94017094017094,
"grad_norm": 1.116525650024414,
"learning_rate": 1.3532763532763535e-05,
"loss": 2.0132,
"step": 32000
},
{
"epoch": 11.0,
"eval_accuracy": 0.8832973556395035,
"eval_f1": 0.6500785318674052,
"eval_loss": 2.3498170375823975,
"eval_precision": 0.634142407870333,
"eval_recall": 0.6668362587975918,
"eval_runtime": 5.7697,
"eval_samples_per_second": 506.782,
"eval_steps_per_second": 63.434,
"step": 32175
},
{
"epoch": 11.11111111111111,
"grad_norm": 0.1830213963985443,
"learning_rate": 1.2962962962962962e-05,
"loss": 2.0121,
"step": 32500
},
{
"epoch": 11.282051282051283,
"grad_norm": 2.5111734867095947,
"learning_rate": 1.2393162393162394e-05,
"loss": 2.0103,
"step": 33000
},
{
"epoch": 11.452991452991453,
"grad_norm": 3.7082180976867676,
"learning_rate": 1.1823361823361825e-05,
"loss": 2.0103,
"step": 33500
},
{
"epoch": 11.623931623931623,
"grad_norm": 1.1296755075454712,
"learning_rate": 1.1253561253561254e-05,
"loss": 2.011,
"step": 34000
},
{
"epoch": 11.794871794871796,
"grad_norm": 2.4463248252868652,
"learning_rate": 1.0683760683760684e-05,
"loss": 2.0093,
"step": 34500
},
{
"epoch": 11.965811965811966,
"grad_norm": 0.03058500401675701,
"learning_rate": 1.0113960113960115e-05,
"loss": 2.0097,
"step": 35000
},
{
"epoch": 12.0,
"eval_accuracy": 0.8845851935436393,
"eval_f1": 0.6505743299483937,
"eval_loss": 2.355226993560791,
"eval_precision": 0.6388230486309767,
"eval_recall": 0.6627660476553888,
"eval_runtime": 5.5805,
"eval_samples_per_second": 523.964,
"eval_steps_per_second": 65.585,
"step": 35100
},
{
"epoch": 12.136752136752136,
"grad_norm": 1.262992024421692,
"learning_rate": 9.544159544159544e-06,
"loss": 2.0083,
"step": 35500
},
{
"epoch": 12.307692307692308,
"grad_norm": 0.350888192653656,
"learning_rate": 8.974358974358976e-06,
"loss": 2.0082,
"step": 36000
},
{
"epoch": 12.478632478632479,
"grad_norm": 0.7504994869232178,
"learning_rate": 8.404558404558405e-06,
"loss": 2.0089,
"step": 36500
},
{
"epoch": 12.649572649572649,
"grad_norm": 2.052617311477661,
"learning_rate": 7.834757834757835e-06,
"loss": 2.0072,
"step": 37000
},
{
"epoch": 12.820512820512821,
"grad_norm": 0.4613409638404846,
"learning_rate": 7.264957264957266e-06,
"loss": 2.0073,
"step": 37500
},
{
"epoch": 12.991452991452991,
"grad_norm": 4.136294364929199,
"learning_rate": 6.695156695156696e-06,
"loss": 2.007,
"step": 38000
},
{
"epoch": 13.0,
"eval_accuracy": 0.8839228769072266,
"eval_f1": 0.6545124566903151,
"eval_loss": 2.3634226322174072,
"eval_precision": 0.6372178941450486,
"eval_recall": 0.6727719833799711,
"eval_runtime": 5.622,
"eval_samples_per_second": 520.097,
"eval_steps_per_second": 65.101,
"step": 38025
},
{
"epoch": 13.162393162393162,
"grad_norm": 0.16694723069667816,
"learning_rate": 6.1253561253561255e-06,
"loss": 2.0057,
"step": 38500
},
{
"epoch": 13.333333333333334,
"grad_norm": 0.8811143636703491,
"learning_rate": 5.555555555555556e-06,
"loss": 2.0065,
"step": 39000
},
{
"epoch": 13.504273504273504,
"grad_norm": 0.4992905855178833,
"learning_rate": 4.985754985754986e-06,
"loss": 2.0068,
"step": 39500
},
{
"epoch": 13.675213675213675,
"grad_norm": 0.6530119180679321,
"learning_rate": 4.415954415954416e-06,
"loss": 2.0052,
"step": 40000
},
{
"epoch": 13.846153846153847,
"grad_norm": 2.222022771835327,
"learning_rate": 3.846153846153847e-06,
"loss": 2.0062,
"step": 40500
},
{
"epoch": 14.0,
"eval_accuracy": 0.884462542314674,
"eval_f1": 0.6561026065370293,
"eval_loss": 2.3629047870635986,
"eval_precision": 0.6406237375777653,
"eval_recall": 0.6723480030526584,
"eval_runtime": 5.6036,
"eval_samples_per_second": 521.81,
"eval_steps_per_second": 65.316,
"step": 40950
},
{
"epoch": 14.017094017094017,
"grad_norm": 0.11298029124736786,
"learning_rate": 3.2763532763532763e-06,
"loss": 2.0064,
"step": 41000
},
{
"epoch": 14.188034188034187,
"grad_norm": 0.11808889359235764,
"learning_rate": 2.7065527065527066e-06,
"loss": 2.0048,
"step": 41500
},
{
"epoch": 14.35897435897436,
"grad_norm": 0.051862556487321854,
"learning_rate": 2.136752136752137e-06,
"loss": 2.0052,
"step": 42000
},
{
"epoch": 14.52991452991453,
"grad_norm": 0.021300671622157097,
"learning_rate": 1.566951566951567e-06,
"loss": 2.0053,
"step": 42500
},
{
"epoch": 14.7008547008547,
"grad_norm": 0.11307813972234726,
"learning_rate": 9.971509971509971e-07,
"loss": 2.005,
"step": 43000
},
{
"epoch": 14.871794871794872,
"grad_norm": 1.3423974514007568,
"learning_rate": 4.273504273504274e-07,
"loss": 2.0041,
"step": 43500
},
{
"epoch": 15.0,
"eval_accuracy": 0.8847446401412942,
"eval_f1": 0.6565610672834661,
"eval_loss": 2.365044116973877,
"eval_precision": 0.6400386535674022,
"eval_recall": 0.673959128296447,
"eval_runtime": 5.6195,
"eval_samples_per_second": 520.335,
"eval_steps_per_second": 65.131,
"step": 43875
},
{
"epoch": 15.0,
"step": 43875,
"total_flos": 1.39563382170006e+16,
"train_loss": 2.07735239021323,
"train_runtime": 2398.7609,
"train_samples_per_second": 146.319,
"train_steps_per_second": 18.291
}
],
"logging_steps": 500,
"max_steps": 43875,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.39563382170006e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}