{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 74.9400479616307,
"global_step": 125000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.6,
"learning_rate": 7.832e-06,
"loss": 3.1441,
"step": 1000
},
{
"epoch": 1.2,
"learning_rate": 1.5832e-05,
"loss": 1.8595,
"step": 2000
},
{
"epoch": 1.8,
"learning_rate": 2.3832e-05,
"loss": 1.5378,
"step": 3000
},
{
"epoch": 2.4,
"learning_rate": 3.1832e-05,
"loss": 1.43,
"step": 4000
},
{
"epoch": 3.0,
"learning_rate": 3.9816e-05,
"loss": 1.3543,
"step": 5000
},
{
"epoch": 3.6,
"learning_rate": 4.7816000000000004e-05,
"loss": 1.2961,
"step": 6000
},
{
"epoch": 4.2,
"learning_rate": 5.5808000000000005e-05,
"loss": 1.2511,
"step": 7000
},
{
"epoch": 4.8,
"learning_rate": 6.380800000000001e-05,
"loss": 1.2147,
"step": 8000
},
{
"epoch": 5.4,
"learning_rate": 7.1808e-05,
"loss": 1.1854,
"step": 9000
},
{
"epoch": 6.0,
"learning_rate": 7.980000000000002e-05,
"loss": 1.1596,
"step": 10000
},
{
"epoch": 6.59,
"learning_rate": 7.93224347826087e-05,
"loss": 1.1377,
"step": 11000
},
{
"epoch": 7.19,
"learning_rate": 7.862747826086958e-05,
"loss": 1.1184,
"step": 12000
},
{
"epoch": 7.79,
"learning_rate": 7.793252173913044e-05,
"loss": 1.102,
"step": 13000
},
{
"epoch": 8.39,
"learning_rate": 7.72368695652174e-05,
"loss": 1.0876,
"step": 14000
},
{
"epoch": 8.99,
"learning_rate": 7.654191304347826e-05,
"loss": 1.0765,
"step": 15000
},
{
"epoch": 9.59,
"learning_rate": 7.584695652173914e-05,
"loss": 1.0655,
"step": 16000
},
{
"epoch": 10.19,
"learning_rate": 7.5152e-05,
"loss": 1.0555,
"step": 17000
},
{
"epoch": 10.79,
"learning_rate": 7.445773913043479e-05,
"loss": 1.0478,
"step": 18000
},
{
"epoch": 11.39,
"learning_rate": 7.376208695652175e-05,
"loss": 1.0403,
"step": 19000
},
{
"epoch": 11.99,
"learning_rate": 7.306713043478261e-05,
"loss": 1.0334,
"step": 20000
},
{
"epoch": 12.59,
"learning_rate": 7.237217391304349e-05,
"loss": 1.026,
"step": 21000
},
{
"epoch": 13.19,
"learning_rate": 7.167791304347826e-05,
"loss": 1.0195,
"step": 22000
},
{
"epoch": 13.79,
"learning_rate": 7.098226086956523e-05,
"loss": 1.0144,
"step": 23000
},
{
"epoch": 14.39,
"learning_rate": 7.028730434782609e-05,
"loss": 1.0086,
"step": 24000
},
{
"epoch": 14.99,
"learning_rate": 6.959234782608696e-05,
"loss": 1.0045,
"step": 25000
},
{
"epoch": 15.59,
"learning_rate": 8.000000000000001e-06,
"loss": 0.9939,
"step": 26000
},
{
"epoch": 16.19,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.989,
"step": 27000
},
{
"epoch": 16.79,
"learning_rate": 2.4e-05,
"loss": 0.9883,
"step": 28000
},
{
"epoch": 17.39,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.9875,
"step": 29000
},
{
"epoch": 17.99,
"learning_rate": 4e-05,
"loss": 0.9861,
"step": 30000
},
{
"epoch": 18.59,
"learning_rate": 4.8e-05,
"loss": 0.9855,
"step": 31000
},
{
"epoch": 19.18,
"learning_rate": 5.6e-05,
"loss": 0.9845,
"step": 32000
},
{
"epoch": 19.78,
"learning_rate": 6.400000000000001e-05,
"loss": 0.9839,
"step": 33000
},
{
"epoch": 20.38,
"learning_rate": 7.2e-05,
"loss": 0.9832,
"step": 34000
},
{
"epoch": 20.98,
"learning_rate": 8e-05,
"loss": 0.982,
"step": 35000
},
{
"epoch": 21.58,
"learning_rate": 7.930434782608697e-05,
"loss": 0.9802,
"step": 36000
},
{
"epoch": 22.18,
"learning_rate": 7.860869565217392e-05,
"loss": 0.9762,
"step": 37000
},
{
"epoch": 22.78,
"learning_rate": 7.791304347826088e-05,
"loss": 0.9723,
"step": 38000
},
{
"epoch": 23.38,
"learning_rate": 7.721739130434783e-05,
"loss": 0.9698,
"step": 39000
},
{
"epoch": 23.98,
"learning_rate": 7.652173913043479e-05,
"loss": 0.9667,
"step": 40000
},
{
"epoch": 24.58,
"learning_rate": 7.582608695652174e-05,
"loss": 0.9637,
"step": 41000
},
{
"epoch": 25.18,
"learning_rate": 7.51304347826087e-05,
"loss": 0.9614,
"step": 42000
},
{
"epoch": 25.78,
"learning_rate": 7.443478260869565e-05,
"loss": 0.9578,
"step": 43000
},
{
"epoch": 26.38,
"learning_rate": 7.373913043478261e-05,
"loss": 0.9553,
"step": 44000
},
{
"epoch": 26.98,
"learning_rate": 7.304347826086957e-05,
"loss": 0.9532,
"step": 45000
},
{
"epoch": 27.58,
"learning_rate": 7.856e-06,
"loss": 0.9441,
"step": 46000
},
{
"epoch": 28.18,
"learning_rate": 1.5856e-05,
"loss": 0.9399,
"step": 47000
},
{
"epoch": 28.78,
"learning_rate": 2.3840000000000002e-05,
"loss": 0.9378,
"step": 48000
},
{
"epoch": 29.38,
"learning_rate": 3.184000000000001e-05,
"loss": 0.9373,
"step": 49000
},
{
"epoch": 29.98,
"learning_rate": 3.9824000000000004e-05,
"loss": 0.9379,
"step": 50000
},
{
"epoch": 30.58,
"learning_rate": 4.7824e-05,
"loss": 0.9376,
"step": 51000
},
{
"epoch": 31.18,
"learning_rate": 5.5816e-05,
"loss": 0.9384,
"step": 52000
},
{
"epoch": 31.77,
"learning_rate": 6.380800000000001e-05,
"loss": 0.9393,
"step": 53000
},
{
"epoch": 32.37,
"learning_rate": 7.1808e-05,
"loss": 0.9401,
"step": 54000
},
{
"epoch": 32.97,
"learning_rate": 7.9792e-05,
"loss": 0.9397,
"step": 55000
},
{
"epoch": 33.57,
"learning_rate": 7.932313043478262e-05,
"loss": 0.9393,
"step": 56000
},
{
"epoch": 34.17,
"learning_rate": 7.862747826086958e-05,
"loss": 0.9383,
"step": 57000
},
{
"epoch": 34.77,
"learning_rate": 7.793182608695653e-05,
"loss": 0.9362,
"step": 58000
},
{
"epoch": 35.37,
"learning_rate": 7.72368695652174e-05,
"loss": 0.9342,
"step": 59000
},
{
"epoch": 35.97,
"learning_rate": 7.654191304347826e-05,
"loss": 0.933,
"step": 60000
},
{
"epoch": 36.57,
"learning_rate": 7.584626086956523e-05,
"loss": 0.9295,
"step": 61000
},
{
"epoch": 37.17,
"learning_rate": 7.51513043478261e-05,
"loss": 0.9283,
"step": 62000
},
{
"epoch": 37.77,
"learning_rate": 7.445565217391305e-05,
"loss": 0.9268,
"step": 63000
},
{
"epoch": 38.37,
"learning_rate": 7.376069565217393e-05,
"loss": 0.9252,
"step": 64000
},
{
"epoch": 38.97,
"learning_rate": 7.306573913043479e-05,
"loss": 0.923,
"step": 65000
},
{
"epoch": 39.57,
"learning_rate": 7.237147826086958e-05,
"loss": 0.922,
"step": 66000
},
{
"epoch": 40.17,
"learning_rate": 7.167652173913044e-05,
"loss": 0.9201,
"step": 67000
},
{
"epoch": 40.77,
"learning_rate": 7.09808695652174e-05,
"loss": 0.9182,
"step": 68000
},
{
"epoch": 41.37,
"learning_rate": 7.028591304347828e-05,
"loss": 0.9166,
"step": 69000
},
{
"epoch": 41.97,
"learning_rate": 6.959095652173914e-05,
"loss": 0.9159,
"step": 70000
},
{
"epoch": 42.57,
"learning_rate": 6.8896e-05,
"loss": 0.9135,
"step": 71000
},
{
"epoch": 43.17,
"learning_rate": 6.820104347826087e-05,
"loss": 0.9131,
"step": 72000
},
{
"epoch": 43.76,
"learning_rate": 6.750539130434784e-05,
"loss": 0.9104,
"step": 73000
},
{
"epoch": 44.36,
"learning_rate": 6.680973913043478e-05,
"loss": 0.91,
"step": 74000
},
{
"epoch": 44.96,
"learning_rate": 6.611547826086957e-05,
"loss": 0.9084,
"step": 75000
},
{
"epoch": 45.56,
"learning_rate": 6.541982608695652e-05,
"loss": 0.9072,
"step": 76000
},
{
"epoch": 46.16,
"learning_rate": 6.472556521739131e-05,
"loss": 0.9065,
"step": 77000
},
{
"epoch": 46.76,
"learning_rate": 6.402991304347827e-05,
"loss": 0.9041,
"step": 78000
},
{
"epoch": 47.36,
"learning_rate": 6.333426086956522e-05,
"loss": 0.9029,
"step": 79000
},
{
"epoch": 47.96,
"learning_rate": 6.264000000000001e-05,
"loss": 0.9022,
"step": 80000
},
{
"epoch": 48.56,
"learning_rate": 6.194434782608696e-05,
"loss": 0.9007,
"step": 81000
},
{
"epoch": 49.16,
"learning_rate": 6.124939130434783e-05,
"loss": 0.8998,
"step": 82000
},
{
"epoch": 49.76,
"learning_rate": 6.05544347826087e-05,
"loss": 0.8986,
"step": 83000
},
{
"epoch": 50.36,
"learning_rate": 5.985878260869566e-05,
"loss": 0.8966,
"step": 84000
},
{
"epoch": 50.96,
"learning_rate": 5.916382608695653e-05,
"loss": 0.8967,
"step": 85000
},
{
"epoch": 51.56,
"learning_rate": 5.8468869565217395e-05,
"loss": 0.8952,
"step": 86000
},
{
"epoch": 52.16,
"learning_rate": 5.777321739130435e-05,
"loss": 0.8937,
"step": 87000
},
{
"epoch": 52.76,
"learning_rate": 5.7078260869565226e-05,
"loss": 0.8934,
"step": 88000
},
{
"epoch": 53.36,
"learning_rate": 5.6382608695652174e-05,
"loss": 0.8928,
"step": 89000
},
{
"epoch": 53.96,
"learning_rate": 5.568765217391304e-05,
"loss": 0.8909,
"step": 90000
},
{
"epoch": 54.56,
"learning_rate": 5.499269565217392e-05,
"loss": 0.89,
"step": 91000
},
{
"epoch": 55.16,
"learning_rate": 5.429773913043479e-05,
"loss": 0.8889,
"step": 92000
},
{
"epoch": 55.76,
"learning_rate": 5.360208695652175e-05,
"loss": 0.8883,
"step": 93000
},
{
"epoch": 56.35,
"learning_rate": 5.2907826086956524e-05,
"loss": 0.8873,
"step": 94000
},
{
"epoch": 56.95,
"learning_rate": 5.22128695652174e-05,
"loss": 0.8862,
"step": 95000
},
{
"epoch": 57.55,
"learning_rate": 5.151721739130435e-05,
"loss": 0.8857,
"step": 96000
},
{
"epoch": 58.15,
"learning_rate": 5.082156521739131e-05,
"loss": 0.8849,
"step": 97000
},
{
"epoch": 58.75,
"learning_rate": 5.012660869565218e-05,
"loss": 0.8829,
"step": 98000
},
{
"epoch": 59.35,
"learning_rate": 4.943234782608696e-05,
"loss": 0.8836,
"step": 99000
},
{
"epoch": 59.95,
"learning_rate": 4.873669565217392e-05,
"loss": 0.8818,
"step": 100000
},
{
"epoch": 60.55,
"learning_rate": 4.804173913043479e-05,
"loss": 0.8811,
"step": 101000
},
{
"epoch": 61.15,
"learning_rate": 4.734608695652174e-05,
"loss": 0.8805,
"step": 102000
},
{
"epoch": 61.75,
"learning_rate": 4.6651130434782615e-05,
"loss": 0.8797,
"step": 103000
},
{
"epoch": 62.35,
"learning_rate": 4.595547826086957e-05,
"loss": 0.8791,
"step": 104000
},
{
"epoch": 62.95,
"learning_rate": 4.526121739130435e-05,
"loss": 0.8782,
"step": 105000
},
{
"epoch": 63.55,
"learning_rate": 4.456626086956522e-05,
"loss": 0.8767,
"step": 106000
},
{
"epoch": 64.15,
"learning_rate": 4.3870608695652176e-05,
"loss": 0.8769,
"step": 107000
},
{
"epoch": 64.75,
"learning_rate": 4.317495652173914e-05,
"loss": 0.8754,
"step": 108000
},
{
"epoch": 65.35,
"learning_rate": 4.248000000000001e-05,
"loss": 0.8746,
"step": 109000
},
{
"epoch": 65.95,
"learning_rate": 4.1785043478260875e-05,
"loss": 0.8752,
"step": 110000
},
{
"epoch": 66.55,
"learning_rate": 4.108939130434783e-05,
"loss": 0.8735,
"step": 111000
},
{
"epoch": 67.15,
"learning_rate": 4.0395130434782606e-05,
"loss": 0.8733,
"step": 112000
},
{
"epoch": 67.75,
"learning_rate": 3.970017391304348e-05,
"loss": 0.8727,
"step": 113000
},
{
"epoch": 68.35,
"learning_rate": 3.900521739130435e-05,
"loss": 0.8715,
"step": 114000
},
{
"epoch": 68.94,
"learning_rate": 3.830956521739131e-05,
"loss": 0.8721,
"step": 115000
},
{
"epoch": 69.54,
"learning_rate": 3.7614608695652174e-05,
"loss": 0.8712,
"step": 116000
},
{
"epoch": 70.14,
"learning_rate": 3.691965217391304e-05,
"loss": 0.8701,
"step": 117000
},
{
"epoch": 70.74,
"learning_rate": 3.6224000000000004e-05,
"loss": 0.8692,
"step": 118000
},
{
"epoch": 71.34,
"learning_rate": 3.552904347826087e-05,
"loss": 0.8688,
"step": 119000
},
{
"epoch": 71.94,
"learning_rate": 3.483408695652174e-05,
"loss": 0.8693,
"step": 120000
},
{
"epoch": 72.54,
"learning_rate": 3.41384347826087e-05,
"loss": 0.8675,
"step": 121000
},
{
"epoch": 73.14,
"learning_rate": 3.3443478260869565e-05,
"loss": 0.8667,
"step": 122000
},
{
"epoch": 73.74,
"learning_rate": 3.274852173913044e-05,
"loss": 0.8667,
"step": 123000
},
{
"epoch": 74.34,
"learning_rate": 3.2052869565217396e-05,
"loss": 0.8662,
"step": 124000
},
{
"epoch": 74.94,
"learning_rate": 3.1357913043478265e-05,
"loss": 0.8651,
"step": 125000
},
{
"epoch": 74.94,
"step": 125000,
"total_flos": 6.735180748506844e+19,
"train_loss": 0.06944640869140625,
"train_runtime": 3177.0339,
"train_samples_per_second": 80578.303,
"train_steps_per_second": 39.345
}
],
"max_steps": 125000,
"num_train_epochs": 75,
"total_flos": 6.735180748506844e+19,
"trial_name": null,
"trial_params": null
}