roberta-base-mnli / trainer_state.json
ZhangYunchenY
[Model] roberta-base-mnli
3cccf02
{
"best_metric": 0.8775343861436576,
"best_model_checkpoint": "./fp32/models/mnli-roberta-base/checkpoint-57000",
"epoch": 4.644719687092568,
"global_step": 57000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.08,
"learning_rate": 1.3579576317218903e-06,
"loss": 1.0971,
"step": 1000
},
{
"epoch": 0.08,
"eval_accuracy": 0.5194090677534386,
"eval_loss": 1.0465178489685059,
"eval_runtime": 29.1271,
"eval_samples_per_second": 336.972,
"eval_steps_per_second": 10.54,
"step": 1000
},
{
"epoch": 0.16,
"learning_rate": 2.7159152634437806e-06,
"loss": 0.7469,
"step": 2000
},
{
"epoch": 0.16,
"eval_accuracy": 0.7842078451349974,
"eval_loss": 0.5564178228378296,
"eval_runtime": 27.2282,
"eval_samples_per_second": 360.471,
"eval_steps_per_second": 11.275,
"step": 2000
},
{
"epoch": 0.24,
"learning_rate": 4.073872895165672e-06,
"loss": 0.5572,
"step": 3000
},
{
"epoch": 0.24,
"eval_accuracy": 0.8114111054508405,
"eval_loss": 0.4900236427783966,
"eval_runtime": 27.0307,
"eval_samples_per_second": 363.106,
"eval_steps_per_second": 11.357,
"step": 3000
},
{
"epoch": 0.33,
"learning_rate": 5.431830526887561e-06,
"loss": 0.5075,
"step": 4000
},
{
"epoch": 0.33,
"eval_accuracy": 0.834538970962812,
"eval_loss": 0.4278336763381958,
"eval_runtime": 25.1831,
"eval_samples_per_second": 389.746,
"eval_steps_per_second": 12.191,
"step": 4000
},
{
"epoch": 0.41,
"learning_rate": 6.7897881586094514e-06,
"loss": 0.4763,
"step": 5000
},
{
"epoch": 0.41,
"eval_accuracy": 0.8364747834946511,
"eval_loss": 0.41645798087120056,
"eval_runtime": 26.3531,
"eval_samples_per_second": 372.441,
"eval_steps_per_second": 11.649,
"step": 5000
},
{
"epoch": 0.49,
"learning_rate": 8.147745790331343e-06,
"loss": 0.4613,
"step": 6000
},
{
"epoch": 0.49,
"eval_accuracy": 0.8478858889454916,
"eval_loss": 0.39705243706703186,
"eval_runtime": 27.5189,
"eval_samples_per_second": 356.663,
"eval_steps_per_second": 11.156,
"step": 6000
},
{
"epoch": 0.57,
"learning_rate": 9.505703422053234e-06,
"loss": 0.4539,
"step": 7000
},
{
"epoch": 0.57,
"eval_accuracy": 0.8448293428425879,
"eval_loss": 0.40062254667282104,
"eval_runtime": 25.5016,
"eval_samples_per_second": 384.878,
"eval_steps_per_second": 12.038,
"step": 7000
},
{
"epoch": 0.65,
"learning_rate": 9.94486632684906e-06,
"loss": 0.4441,
"step": 8000
},
{
"epoch": 0.65,
"eval_accuracy": 0.8479877738155884,
"eval_loss": 0.39510154724121094,
"eval_runtime": 26.4014,
"eval_samples_per_second": 371.76,
"eval_steps_per_second": 11.628,
"step": 8000
},
{
"epoch": 0.73,
"learning_rate": 9.85817816151739e-06,
"loss": 0.4293,
"step": 9000
},
{
"epoch": 0.73,
"eval_accuracy": 0.8596026490066225,
"eval_loss": 0.3864976763725281,
"eval_runtime": 27.4299,
"eval_samples_per_second": 357.821,
"eval_steps_per_second": 11.192,
"step": 9000
},
{
"epoch": 0.81,
"learning_rate": 9.771489996185721e-06,
"loss": 0.4237,
"step": 10000
},
{
"epoch": 0.81,
"eval_accuracy": 0.8576668364747835,
"eval_loss": 0.3852238059043884,
"eval_runtime": 26.1569,
"eval_samples_per_second": 375.236,
"eval_steps_per_second": 11.737,
"step": 10000
},
{
"epoch": 0.9,
"learning_rate": 9.684801830854052e-06,
"loss": 0.4214,
"step": 11000
},
{
"epoch": 0.9,
"eval_accuracy": 0.8607233825776872,
"eval_loss": 0.3725791275501251,
"eval_runtime": 28.6257,
"eval_samples_per_second": 342.874,
"eval_steps_per_second": 10.725,
"step": 11000
},
{
"epoch": 0.98,
"learning_rate": 9.598113665522383e-06,
"loss": 0.4147,
"step": 12000
},
{
"epoch": 0.98,
"eval_accuracy": 0.862761079979623,
"eval_loss": 0.3637676239013672,
"eval_runtime": 24.8449,
"eval_samples_per_second": 395.052,
"eval_steps_per_second": 12.357,
"step": 12000
},
{
"epoch": 1.06,
"learning_rate": 9.511425500190714e-06,
"loss": 0.3707,
"step": 13000
},
{
"epoch": 1.06,
"eval_accuracy": 0.8629648497198166,
"eval_loss": 0.3756280243396759,
"eval_runtime": 28.9144,
"eval_samples_per_second": 339.451,
"eval_steps_per_second": 10.618,
"step": 13000
},
{
"epoch": 1.14,
"learning_rate": 9.424737334859045e-06,
"loss": 0.3565,
"step": 14000
},
{
"epoch": 1.14,
"eval_accuracy": 0.8644931227712684,
"eval_loss": 0.3675437867641449,
"eval_runtime": 28.4842,
"eval_samples_per_second": 344.578,
"eval_steps_per_second": 10.778,
"step": 14000
},
{
"epoch": 1.22,
"learning_rate": 9.338049169527376e-06,
"loss": 0.3662,
"step": 15000
},
{
"epoch": 1.22,
"eval_accuracy": 0.866225165562914,
"eval_loss": 0.3639264404773712,
"eval_runtime": 29.9404,
"eval_samples_per_second": 327.818,
"eval_steps_per_second": 10.254,
"step": 15000
},
{
"epoch": 1.3,
"learning_rate": 9.251361004195707e-06,
"loss": 0.3561,
"step": 16000
},
{
"epoch": 1.3,
"eval_accuracy": 0.8615384615384616,
"eval_loss": 0.36105281114578247,
"eval_runtime": 24.6832,
"eval_samples_per_second": 397.639,
"eval_steps_per_second": 12.438,
"step": 16000
},
{
"epoch": 1.39,
"learning_rate": 9.16467283886404e-06,
"loss": 0.3605,
"step": 17000
},
{
"epoch": 1.39,
"eval_accuracy": 0.8665308201732043,
"eval_loss": 0.3572224974632263,
"eval_runtime": 25.556,
"eval_samples_per_second": 384.059,
"eval_steps_per_second": 12.013,
"step": 17000
},
{
"epoch": 1.47,
"learning_rate": 9.07798467353237e-06,
"loss": 0.3525,
"step": 18000
},
{
"epoch": 1.47,
"eval_accuracy": 0.8622516556291391,
"eval_loss": 0.3711968660354614,
"eval_runtime": 27.8478,
"eval_samples_per_second": 352.452,
"eval_steps_per_second": 11.024,
"step": 18000
},
{
"epoch": 1.55,
"learning_rate": 8.9912965082007e-06,
"loss": 0.3489,
"step": 19000
},
{
"epoch": 1.55,
"eval_accuracy": 0.8704024452368824,
"eval_loss": 0.371066153049469,
"eval_runtime": 26.8458,
"eval_samples_per_second": 365.606,
"eval_steps_per_second": 11.436,
"step": 19000
},
{
"epoch": 1.63,
"learning_rate": 8.904608342869031e-06,
"loss": 0.3553,
"step": 20000
},
{
"epoch": 1.63,
"eval_accuracy": 0.8635761589403973,
"eval_loss": 0.365778386592865,
"eval_runtime": 27.8617,
"eval_samples_per_second": 352.276,
"eval_steps_per_second": 11.019,
"step": 20000
},
{
"epoch": 1.71,
"learning_rate": 8.817920177537364e-06,
"loss": 0.3478,
"step": 21000
},
{
"epoch": 1.71,
"eval_accuracy": 0.8650025471217524,
"eval_loss": 0.365774542093277,
"eval_runtime": 25.6814,
"eval_samples_per_second": 382.183,
"eval_steps_per_second": 11.954,
"step": 21000
},
{
"epoch": 1.79,
"learning_rate": 8.731232012205695e-06,
"loss": 0.3416,
"step": 22000
},
{
"epoch": 1.79,
"eval_accuracy": 0.86571574121243,
"eval_loss": 0.38820841908454895,
"eval_runtime": 25.5524,
"eval_samples_per_second": 384.112,
"eval_steps_per_second": 12.015,
"step": 22000
},
{
"epoch": 1.87,
"learning_rate": 8.644543846874025e-06,
"loss": 0.3463,
"step": 23000
},
{
"epoch": 1.87,
"eval_accuracy": 0.8714212939378503,
"eval_loss": 0.3499988317489624,
"eval_runtime": 27.3842,
"eval_samples_per_second": 358.418,
"eval_steps_per_second": 11.211,
"step": 23000
},
{
"epoch": 1.96,
"learning_rate": 8.557855681542356e-06,
"loss": 0.3429,
"step": 24000
},
{
"epoch": 1.96,
"eval_accuracy": 0.8645950076413652,
"eval_loss": 0.37212345004081726,
"eval_runtime": 27.0146,
"eval_samples_per_second": 363.322,
"eval_steps_per_second": 11.364,
"step": 24000
},
{
"epoch": 2.04,
"learning_rate": 8.471167516210688e-06,
"loss": 0.3177,
"step": 25000
},
{
"epoch": 2.04,
"eval_accuracy": 0.8691798267957208,
"eval_loss": 0.37507861852645874,
"eval_runtime": 27.0938,
"eval_samples_per_second": 362.26,
"eval_steps_per_second": 11.331,
"step": 25000
},
{
"epoch": 2.12,
"learning_rate": 8.38447935087902e-06,
"loss": 0.2821,
"step": 26000
},
{
"epoch": 2.12,
"eval_accuracy": 0.8716250636780438,
"eval_loss": 0.37794411182403564,
"eval_runtime": 27.0755,
"eval_samples_per_second": 362.505,
"eval_steps_per_second": 11.339,
"step": 26000
},
{
"epoch": 2.2,
"learning_rate": 8.29779118554735e-06,
"loss": 0.2726,
"step": 27000
},
{
"epoch": 2.2,
"eval_accuracy": 0.8691798267957208,
"eval_loss": 0.38138309121131897,
"eval_runtime": 28.6712,
"eval_samples_per_second": 342.329,
"eval_steps_per_second": 10.708,
"step": 27000
},
{
"epoch": 2.28,
"learning_rate": 8.211103020215681e-06,
"loss": 0.2743,
"step": 28000
},
{
"epoch": 2.28,
"eval_accuracy": 0.8757004584819155,
"eval_loss": 0.3701411783695221,
"eval_runtime": 28.7005,
"eval_samples_per_second": 341.981,
"eval_steps_per_second": 10.697,
"step": 28000
},
{
"epoch": 2.36,
"learning_rate": 8.124414854884012e-06,
"loss": 0.2814,
"step": 29000
},
{
"epoch": 2.36,
"eval_accuracy": 0.8675496688741722,
"eval_loss": 0.40641501545906067,
"eval_runtime": 27.6676,
"eval_samples_per_second": 354.747,
"eval_steps_per_second": 11.096,
"step": 29000
},
{
"epoch": 2.44,
"learning_rate": 8.037726689552343e-06,
"loss": 0.2718,
"step": 30000
},
{
"epoch": 2.44,
"eval_accuracy": 0.8726439123790117,
"eval_loss": 0.39712002873420715,
"eval_runtime": 30.6579,
"eval_samples_per_second": 320.146,
"eval_steps_per_second": 10.014,
"step": 30000
},
{
"epoch": 2.53,
"learning_rate": 7.951038524220674e-06,
"loss": 0.2798,
"step": 31000
},
{
"epoch": 2.53,
"eval_accuracy": 0.8732552215995925,
"eval_loss": 0.3579646944999695,
"eval_runtime": 29.1251,
"eval_samples_per_second": 336.994,
"eval_steps_per_second": 10.541,
"step": 31000
},
{
"epoch": 2.61,
"learning_rate": 7.864350358889005e-06,
"loss": 0.2808,
"step": 32000
},
{
"epoch": 2.61,
"eval_accuracy": 0.8733571064696892,
"eval_loss": 0.3730563819408417,
"eval_runtime": 28.2414,
"eval_samples_per_second": 347.54,
"eval_steps_per_second": 10.871,
"step": 32000
},
{
"epoch": 2.69,
"learning_rate": 7.777662193557336e-06,
"loss": 0.2854,
"step": 33000
},
{
"epoch": 2.69,
"eval_accuracy": 0.8724401426388181,
"eval_loss": 0.3838872015476227,
"eval_runtime": 28.3639,
"eval_samples_per_second": 346.039,
"eval_steps_per_second": 10.824,
"step": 33000
},
{
"epoch": 2.77,
"learning_rate": 7.690974028225667e-06,
"loss": 0.2802,
"step": 34000
},
{
"epoch": 2.77,
"eval_accuracy": 0.8763117677024962,
"eval_loss": 0.3689836263656616,
"eval_runtime": 31.7713,
"eval_samples_per_second": 308.927,
"eval_steps_per_second": 9.663,
"step": 34000
},
{
"epoch": 2.85,
"learning_rate": 7.604285862893998e-06,
"loss": 0.2825,
"step": 35000
},
{
"epoch": 2.85,
"eval_accuracy": 0.872032603158431,
"eval_loss": 0.37332993745803833,
"eval_runtime": 28.2862,
"eval_samples_per_second": 346.99,
"eval_steps_per_second": 10.853,
"step": 35000
},
{
"epoch": 2.93,
"learning_rate": 7.51759769756233e-06,
"loss": 0.2811,
"step": 36000
},
{
"epoch": 2.93,
"eval_accuracy": 0.872032603158431,
"eval_loss": 0.3704814016819,
"eval_runtime": 27.6376,
"eval_samples_per_second": 355.133,
"eval_steps_per_second": 11.108,
"step": 36000
},
{
"epoch": 3.01,
"learning_rate": 7.43090953223066e-06,
"loss": 0.2733,
"step": 37000
},
{
"epoch": 3.01,
"eval_accuracy": 0.8758023433520122,
"eval_loss": 0.4095652401447296,
"eval_runtime": 26.7806,
"eval_samples_per_second": 366.496,
"eval_steps_per_second": 11.464,
"step": 37000
},
{
"epoch": 3.1,
"learning_rate": 7.344221366898991e-06,
"loss": 0.2186,
"step": 38000
},
{
"epoch": 3.1,
"eval_accuracy": 0.8766174223127865,
"eval_loss": 0.4097856283187866,
"eval_runtime": 30.263,
"eval_samples_per_second": 324.324,
"eval_steps_per_second": 10.144,
"step": 38000
},
{
"epoch": 3.18,
"learning_rate": 7.257533201567322e-06,
"loss": 0.2228,
"step": 39000
},
{
"epoch": 3.18,
"eval_accuracy": 0.8757004584819155,
"eval_loss": 0.41501158475875854,
"eval_runtime": 27.8042,
"eval_samples_per_second": 353.004,
"eval_steps_per_second": 11.042,
"step": 39000
},
{
"epoch": 3.26,
"learning_rate": 7.170845036235654e-06,
"loss": 0.2247,
"step": 40000
},
{
"epoch": 3.26,
"eval_accuracy": 0.8736627610799796,
"eval_loss": 0.4023166596889496,
"eval_runtime": 28.6307,
"eval_samples_per_second": 342.813,
"eval_steps_per_second": 10.723,
"step": 40000
},
{
"epoch": 3.34,
"learning_rate": 7.084156870903984e-06,
"loss": 0.2236,
"step": 41000
},
{
"epoch": 3.34,
"eval_accuracy": 0.873458991339786,
"eval_loss": 0.401944100856781,
"eval_runtime": 42.8415,
"eval_samples_per_second": 229.1,
"eval_steps_per_second": 7.166,
"step": 41000
},
{
"epoch": 3.42,
"learning_rate": 6.997468705572315e-06,
"loss": 0.2316,
"step": 42000
},
{
"epoch": 3.42,
"eval_accuracy": 0.875394803871625,
"eval_loss": 0.39935216307640076,
"eval_runtime": 27.9679,
"eval_samples_per_second": 350.938,
"eval_steps_per_second": 10.977,
"step": 42000
},
{
"epoch": 3.5,
"learning_rate": 6.910780540240647e-06,
"loss": 0.227,
"step": 43000
},
{
"epoch": 3.5,
"eval_accuracy": 0.8735608762098829,
"eval_loss": 0.40627339482307434,
"eval_runtime": 27.7905,
"eval_samples_per_second": 353.179,
"eval_steps_per_second": 11.047,
"step": 43000
},
{
"epoch": 3.59,
"learning_rate": 6.824092374908978e-06,
"loss": 0.225,
"step": 44000
},
{
"epoch": 3.59,
"eval_accuracy": 0.8738665308201732,
"eval_loss": 0.3787411153316498,
"eval_runtime": 26.634,
"eval_samples_per_second": 368.513,
"eval_steps_per_second": 11.527,
"step": 44000
},
{
"epoch": 3.67,
"learning_rate": 6.737404209577309e-06,
"loss": 0.2213,
"step": 45000
},
{
"epoch": 3.67,
"eval_accuracy": 0.8741721854304636,
"eval_loss": 0.4053351879119873,
"eval_runtime": 26.9451,
"eval_samples_per_second": 364.259,
"eval_steps_per_second": 11.394,
"step": 45000
},
{
"epoch": 3.75,
"learning_rate": 6.6507160442456394e-06,
"loss": 0.228,
"step": 46000
},
{
"epoch": 3.75,
"eval_accuracy": 0.8737646459500764,
"eval_loss": 0.4027246832847595,
"eval_runtime": 26.7635,
"eval_samples_per_second": 366.731,
"eval_steps_per_second": 11.471,
"step": 46000
},
{
"epoch": 3.83,
"learning_rate": 6.564027878913971e-06,
"loss": 0.2227,
"step": 47000
},
{
"epoch": 3.83,
"eval_accuracy": 0.87396841569027,
"eval_loss": 0.4178619384765625,
"eval_runtime": 29.4635,
"eval_samples_per_second": 333.124,
"eval_steps_per_second": 10.42,
"step": 47000
},
{
"epoch": 3.91,
"learning_rate": 6.477339713582302e-06,
"loss": 0.227,
"step": 48000
},
{
"epoch": 3.91,
"eval_accuracy": 0.8752929190015283,
"eval_loss": 0.3969513773918152,
"eval_runtime": 29.294,
"eval_samples_per_second": 335.052,
"eval_steps_per_second": 10.48,
"step": 48000
},
{
"epoch": 3.99,
"learning_rate": 6.390651548250633e-06,
"loss": 0.2257,
"step": 49000
},
{
"epoch": 3.99,
"eval_accuracy": 0.8767193071828834,
"eval_loss": 0.41893526911735535,
"eval_runtime": 30.0922,
"eval_samples_per_second": 326.164,
"eval_steps_per_second": 10.202,
"step": 49000
},
{
"epoch": 4.07,
"learning_rate": 6.303963382918964e-06,
"loss": 0.1808,
"step": 50000
},
{
"epoch": 4.07,
"eval_accuracy": 0.8714212939378503,
"eval_loss": 0.4699897766113281,
"eval_runtime": 27.6446,
"eval_samples_per_second": 355.042,
"eval_steps_per_second": 11.105,
"step": 50000
},
{
"epoch": 4.16,
"learning_rate": 6.217275217587295e-06,
"loss": 0.1821,
"step": 51000
},
{
"epoch": 4.16,
"eval_accuracy": 0.87396841569027,
"eval_loss": 0.4464610815048218,
"eval_runtime": 26.6712,
"eval_samples_per_second": 368.001,
"eval_steps_per_second": 11.511,
"step": 51000
},
{
"epoch": 4.24,
"learning_rate": 6.130587052255626e-06,
"loss": 0.1788,
"step": 52000
},
{
"epoch": 4.24,
"eval_accuracy": 0.8727457972491085,
"eval_loss": 0.46066877245903015,
"eval_runtime": 27.6226,
"eval_samples_per_second": 355.326,
"eval_steps_per_second": 11.114,
"step": 52000
},
{
"epoch": 4.32,
"learning_rate": 6.043898886923958e-06,
"loss": 0.1822,
"step": 53000
},
{
"epoch": 4.32,
"eval_accuracy": 0.8737646459500764,
"eval_loss": 0.46094420552253723,
"eval_runtime": 27.8257,
"eval_samples_per_second": 352.732,
"eval_steps_per_second": 11.033,
"step": 53000
},
{
"epoch": 4.4,
"learning_rate": 5.957210721592289e-06,
"loss": 0.181,
"step": 54000
},
{
"epoch": 4.4,
"eval_accuracy": 0.8774325012735609,
"eval_loss": 0.4538358151912689,
"eval_runtime": 27.7177,
"eval_samples_per_second": 354.106,
"eval_steps_per_second": 11.076,
"step": 54000
},
{
"epoch": 4.48,
"learning_rate": 5.870522556260619e-06,
"loss": 0.1843,
"step": 55000
},
{
"epoch": 4.48,
"eval_accuracy": 0.8711156393275599,
"eval_loss": 0.47502145171165466,
"eval_runtime": 28.0796,
"eval_samples_per_second": 349.543,
"eval_steps_per_second": 10.933,
"step": 55000
},
{
"epoch": 4.56,
"learning_rate": 5.78383439092895e-06,
"loss": 0.1842,
"step": 56000
},
{
"epoch": 4.56,
"eval_accuracy": 0.8742740703005604,
"eval_loss": 0.44675213098526,
"eval_runtime": 27.0482,
"eval_samples_per_second": 362.871,
"eval_steps_per_second": 11.35,
"step": 56000
},
{
"epoch": 4.64,
"learning_rate": 5.697146225597282e-06,
"loss": 0.1828,
"step": 57000
},
{
"epoch": 4.64,
"eval_accuracy": 0.8775343861436576,
"eval_loss": 0.45801690220832825,
"eval_runtime": 26.6517,
"eval_samples_per_second": 368.27,
"eval_steps_per_second": 11.519,
"step": 57000
}
],
"max_steps": 122720,
"num_train_epochs": 10,
"total_flos": 1.1997919225713254e+17,
"trial_name": null,
"trial_params": null
}