BerTeleo / trainer_state.json
gustoudu81's picture
Upload folder using huggingface_hub
aea6eb7 verified
{
"best_global_step": 67000,
"best_metric": 1.3910651206970215,
"best_model_checkpoint": "/home/auguste/Desktop/eDNA/TeleoClassification/scripts/DNABert2/experiments/masking_training/outputs/masking_teleo/checkpoints/checkpoint-67000",
"epoch": 108.06451612903226,
"eval_steps": 1000,
"global_step": 67000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016129032258064516,
"grad_norm": 107.56168365478516,
"learning_rate": 2e-05,
"loss": 7.9233,
"step": 1
},
{
"epoch": 1.6129032258064515,
"grad_norm": 16.32627296447754,
"learning_rate": 1.9785161290322584e-05,
"loss": 3.0779,
"step": 1000
},
{
"epoch": 1.6129032258064515,
"eval_loss": 2.738837718963623,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.2955,
"eval_samples_per_second": 1766.611,
"eval_steps_per_second": 111.682,
"step": 1000
},
{
"epoch": 3.225806451612903,
"grad_norm": 12.881124496459961,
"learning_rate": 1.9570107526881724e-05,
"loss": 2.506,
"step": 2000
},
{
"epoch": 3.225806451612903,
"eval_loss": 2.4902684688568115,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.2968,
"eval_samples_per_second": 1758.892,
"eval_steps_per_second": 111.194,
"step": 2000
},
{
"epoch": 4.838709677419355,
"grad_norm": 12.914713859558105,
"learning_rate": 1.935505376344086e-05,
"loss": 2.734,
"step": 3000
},
{
"epoch": 4.838709677419355,
"eval_loss": 2.305058479309082,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3006,
"eval_samples_per_second": 1736.761,
"eval_steps_per_second": 109.795,
"step": 3000
},
{
"epoch": 6.451612903225806,
"grad_norm": 13.617836952209473,
"learning_rate": 1.914e-05,
"loss": 2.2267,
"step": 4000
},
{
"epoch": 6.451612903225806,
"eval_loss": 2.3899621963500977,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3004,
"eval_samples_per_second": 1737.628,
"eval_steps_per_second": 109.85,
"step": 4000
},
{
"epoch": 8.064516129032258,
"grad_norm": 11.493875503540039,
"learning_rate": 1.892494623655914e-05,
"loss": 2.1095,
"step": 5000
},
{
"epoch": 8.064516129032258,
"eval_loss": 2.1791865825653076,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.2998,
"eval_samples_per_second": 1740.976,
"eval_steps_per_second": 110.062,
"step": 5000
},
{
"epoch": 9.67741935483871,
"grad_norm": 16.104379653930664,
"learning_rate": 1.870989247311828e-05,
"loss": 1.9622,
"step": 6000
},
{
"epoch": 9.67741935483871,
"eval_loss": 2.0534751415252686,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3144,
"eval_samples_per_second": 1660.314,
"eval_steps_per_second": 104.962,
"step": 6000
},
{
"epoch": 11.290322580645162,
"grad_norm": 15.933501243591309,
"learning_rate": 1.8494838709677422e-05,
"loss": 1.8713,
"step": 7000
},
{
"epoch": 11.290322580645162,
"eval_loss": 2.1255111694335938,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.31,
"eval_samples_per_second": 1684.034,
"eval_steps_per_second": 106.462,
"step": 7000
},
{
"epoch": 12.903225806451612,
"grad_norm": 9.397466659545898,
"learning_rate": 1.8279784946236562e-05,
"loss": 1.7906,
"step": 8000
},
{
"epoch": 12.903225806451612,
"eval_loss": 1.9397249221801758,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3026,
"eval_samples_per_second": 1724.803,
"eval_steps_per_second": 109.039,
"step": 8000
},
{
"epoch": 14.516129032258064,
"grad_norm": 14.291478157043457,
"learning_rate": 1.8064731182795702e-05,
"loss": 1.7149,
"step": 9000
},
{
"epoch": 14.516129032258064,
"eval_loss": 1.8910889625549316,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3013,
"eval_samples_per_second": 1732.385,
"eval_steps_per_second": 109.519,
"step": 9000
},
{
"epoch": 16.129032258064516,
"grad_norm": 15.776030540466309,
"learning_rate": 1.784967741935484e-05,
"loss": 1.634,
"step": 10000
},
{
"epoch": 16.129032258064516,
"eval_loss": 1.893878698348999,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3023,
"eval_samples_per_second": 1726.506,
"eval_steps_per_second": 109.147,
"step": 10000
},
{
"epoch": 17.741935483870968,
"grad_norm": 12.53177547454834,
"learning_rate": 1.763462365591398e-05,
"loss": 1.5991,
"step": 11000
},
{
"epoch": 17.741935483870968,
"eval_loss": 1.8701565265655518,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3035,
"eval_samples_per_second": 1720.089,
"eval_steps_per_second": 108.741,
"step": 11000
},
{
"epoch": 19.35483870967742,
"grad_norm": 13.62909984588623,
"learning_rate": 1.741956989247312e-05,
"loss": 1.5008,
"step": 12000
},
{
"epoch": 19.35483870967742,
"eval_loss": 1.7572582960128784,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3051,
"eval_samples_per_second": 1710.701,
"eval_steps_per_second": 108.148,
"step": 12000
},
{
"epoch": 20.967741935483872,
"grad_norm": 13.886764526367188,
"learning_rate": 1.720451612903226e-05,
"loss": 1.4469,
"step": 13000
},
{
"epoch": 20.967741935483872,
"eval_loss": 1.7456613779067993,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3048,
"eval_samples_per_second": 1712.389,
"eval_steps_per_second": 108.254,
"step": 13000
},
{
"epoch": 22.580645161290324,
"grad_norm": 16.04749870300293,
"learning_rate": 1.6989462365591397e-05,
"loss": 1.404,
"step": 14000
},
{
"epoch": 22.580645161290324,
"eval_loss": 1.7826714515686035,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3034,
"eval_samples_per_second": 1720.509,
"eval_steps_per_second": 108.768,
"step": 14000
},
{
"epoch": 24.193548387096776,
"grad_norm": 14.932185173034668,
"learning_rate": 1.6774408602150537e-05,
"loss": 1.3552,
"step": 15000
},
{
"epoch": 24.193548387096776,
"eval_loss": 1.7234201431274414,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3061,
"eval_samples_per_second": 1705.173,
"eval_steps_per_second": 107.798,
"step": 15000
},
{
"epoch": 25.806451612903224,
"grad_norm": 8.178566932678223,
"learning_rate": 1.6559354838709676e-05,
"loss": 1.313,
"step": 16000
},
{
"epoch": 25.806451612903224,
"eval_loss": 1.8201613426208496,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3194,
"eval_samples_per_second": 1634.314,
"eval_steps_per_second": 103.319,
"step": 16000
},
{
"epoch": 27.419354838709676,
"grad_norm": 16.086894989013672,
"learning_rate": 1.634430107526882e-05,
"loss": 1.2751,
"step": 17000
},
{
"epoch": 27.419354838709676,
"eval_loss": 1.6344752311706543,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3053,
"eval_samples_per_second": 1709.792,
"eval_steps_per_second": 108.09,
"step": 17000
},
{
"epoch": 29.032258064516128,
"grad_norm": 9.854013442993164,
"learning_rate": 1.612924731182796e-05,
"loss": 1.2377,
"step": 18000
},
{
"epoch": 29.032258064516128,
"eval_loss": 1.6381661891937256,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.343,
"eval_samples_per_second": 1521.702,
"eval_steps_per_second": 96.2,
"step": 18000
},
{
"epoch": 30.64516129032258,
"grad_norm": 13.270648956298828,
"learning_rate": 1.59141935483871e-05,
"loss": 1.1772,
"step": 19000
},
{
"epoch": 30.64516129032258,
"eval_loss": 1.6601710319519043,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3037,
"eval_samples_per_second": 1718.661,
"eval_steps_per_second": 108.651,
"step": 19000
},
{
"epoch": 32.25806451612903,
"grad_norm": 20.389537811279297,
"learning_rate": 1.569913978494624e-05,
"loss": 1.176,
"step": 20000
},
{
"epoch": 32.25806451612903,
"eval_loss": 1.6632287502288818,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3045,
"eval_samples_per_second": 1714.128,
"eval_steps_per_second": 108.364,
"step": 20000
},
{
"epoch": 33.87096774193548,
"grad_norm": 11.229137420654297,
"learning_rate": 1.548408602150538e-05,
"loss": 1.1184,
"step": 21000
},
{
"epoch": 33.87096774193548,
"eval_loss": 1.7555991411209106,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3043,
"eval_samples_per_second": 1715.561,
"eval_steps_per_second": 108.455,
"step": 21000
},
{
"epoch": 35.483870967741936,
"grad_norm": 10.823155403137207,
"learning_rate": 1.5269032258064518e-05,
"loss": 1.0793,
"step": 22000
},
{
"epoch": 35.483870967741936,
"eval_loss": 1.6087384223937988,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3059,
"eval_samples_per_second": 1706.414,
"eval_steps_per_second": 107.877,
"step": 22000
},
{
"epoch": 37.096774193548384,
"grad_norm": 6.54916524887085,
"learning_rate": 1.5053978494623658e-05,
"loss": 1.0632,
"step": 23000
},
{
"epoch": 37.096774193548384,
"eval_loss": 1.6815119981765747,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3044,
"eval_samples_per_second": 1714.683,
"eval_steps_per_second": 108.399,
"step": 23000
},
{
"epoch": 38.70967741935484,
"grad_norm": 14.550293922424316,
"learning_rate": 1.4838924731182798e-05,
"loss": 1.0185,
"step": 24000
},
{
"epoch": 38.70967741935484,
"eval_loss": 1.6611889600753784,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3028,
"eval_samples_per_second": 1724.12,
"eval_steps_per_second": 108.996,
"step": 24000
},
{
"epoch": 40.32258064516129,
"grad_norm": 14.825828552246094,
"learning_rate": 1.4623870967741937e-05,
"loss": 1.0148,
"step": 25000
},
{
"epoch": 40.32258064516129,
"eval_loss": 1.5314302444458008,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3061,
"eval_samples_per_second": 1705.416,
"eval_steps_per_second": 107.814,
"step": 25000
},
{
"epoch": 41.935483870967744,
"grad_norm": 15.808582305908203,
"learning_rate": 1.4408817204301075e-05,
"loss": 0.9492,
"step": 26000
},
{
"epoch": 41.935483870967744,
"eval_loss": 1.717032790184021,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3131,
"eval_samples_per_second": 1667.386,
"eval_steps_per_second": 105.409,
"step": 26000
},
{
"epoch": 43.54838709677419,
"grad_norm": 13.56778621673584,
"learning_rate": 1.4193763440860215e-05,
"loss": 0.9352,
"step": 27000
},
{
"epoch": 43.54838709677419,
"eval_loss": 1.631635069847107,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3065,
"eval_samples_per_second": 1703.186,
"eval_steps_per_second": 107.673,
"step": 27000
},
{
"epoch": 45.16129032258065,
"grad_norm": 14.375411987304688,
"learning_rate": 1.3978709677419355e-05,
"loss": 0.9287,
"step": 28000
},
{
"epoch": 45.16129032258065,
"eval_loss": 1.643862247467041,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3075,
"eval_samples_per_second": 1697.381,
"eval_steps_per_second": 107.306,
"step": 28000
},
{
"epoch": 46.774193548387096,
"grad_norm": 12.451338768005371,
"learning_rate": 1.3763655913978495e-05,
"loss": 0.9052,
"step": 29000
},
{
"epoch": 46.774193548387096,
"eval_loss": 1.4976590871810913,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3151,
"eval_samples_per_second": 1656.651,
"eval_steps_per_second": 104.731,
"step": 29000
},
{
"epoch": 48.38709677419355,
"grad_norm": 15.790621757507324,
"learning_rate": 1.3548602150537636e-05,
"loss": 0.8897,
"step": 30000
},
{
"epoch": 48.38709677419355,
"eval_loss": 1.544758915901184,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3045,
"eval_samples_per_second": 1714.222,
"eval_steps_per_second": 108.37,
"step": 30000
},
{
"epoch": 50.0,
"grad_norm": 15.337139129638672,
"learning_rate": 1.3333548387096776e-05,
"loss": 0.9353,
"step": 31000
},
{
"epoch": 50.0,
"eval_loss": 1.7019206285476685,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3181,
"eval_samples_per_second": 1640.743,
"eval_steps_per_second": 103.725,
"step": 31000
},
{
"epoch": 51.61290322580645,
"grad_norm": 17.48087501525879,
"learning_rate": 1.3118494623655916e-05,
"loss": 0.8976,
"step": 32000
},
{
"epoch": 51.61290322580645,
"eval_loss": 1.6256884336471558,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3131,
"eval_samples_per_second": 1667.367,
"eval_steps_per_second": 105.408,
"step": 32000
},
{
"epoch": 53.225806451612904,
"grad_norm": 15.387638092041016,
"learning_rate": 1.2903440860215055e-05,
"loss": 0.8414,
"step": 33000
},
{
"epoch": 53.225806451612904,
"eval_loss": 1.5139249563217163,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3191,
"eval_samples_per_second": 1635.972,
"eval_steps_per_second": 103.424,
"step": 33000
},
{
"epoch": 54.83870967741935,
"grad_norm": 15.2994384765625,
"learning_rate": 1.2688387096774195e-05,
"loss": 0.7897,
"step": 34000
},
{
"epoch": 54.83870967741935,
"eval_loss": 1.7013849020004272,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3268,
"eval_samples_per_second": 1597.145,
"eval_steps_per_second": 100.969,
"step": 34000
},
{
"epoch": 56.45161290322581,
"grad_norm": 14.40909481048584,
"learning_rate": 1.2473333333333335e-05,
"loss": 0.8627,
"step": 35000
},
{
"epoch": 56.45161290322581,
"eval_loss": 1.7141073942184448,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3089,
"eval_samples_per_second": 1689.899,
"eval_steps_per_second": 106.833,
"step": 35000
},
{
"epoch": 58.064516129032256,
"grad_norm": 19.243818283081055,
"learning_rate": 1.2258279569892474e-05,
"loss": 0.9135,
"step": 36000
},
{
"epoch": 58.064516129032256,
"eval_loss": 1.678747296333313,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3278,
"eval_samples_per_second": 1592.41,
"eval_steps_per_second": 100.67,
"step": 36000
},
{
"epoch": 59.67741935483871,
"grad_norm": 14.35431957244873,
"learning_rate": 1.2043225806451614e-05,
"loss": 0.9226,
"step": 37000
},
{
"epoch": 59.67741935483871,
"eval_loss": 1.9941015243530273,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3066,
"eval_samples_per_second": 1702.667,
"eval_steps_per_second": 107.64,
"step": 37000
},
{
"epoch": 61.29032258064516,
"grad_norm": 16.02369499206543,
"learning_rate": 1.1828172043010752e-05,
"loss": 0.8849,
"step": 38000
},
{
"epoch": 61.29032258064516,
"eval_loss": 1.5911988019943237,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3059,
"eval_samples_per_second": 1706.321,
"eval_steps_per_second": 107.871,
"step": 38000
},
{
"epoch": 62.903225806451616,
"grad_norm": 24.164094924926758,
"learning_rate": 1.1613118279569892e-05,
"loss": 0.7974,
"step": 39000
},
{
"epoch": 62.903225806451616,
"eval_loss": 1.5700287818908691,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3059,
"eval_samples_per_second": 1706.437,
"eval_steps_per_second": 107.878,
"step": 39000
},
{
"epoch": 64.51612903225806,
"grad_norm": 10.7676420211792,
"learning_rate": 1.1398064516129033e-05,
"loss": 0.7892,
"step": 40000
},
{
"epoch": 64.51612903225806,
"eval_loss": 1.6208666563034058,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3135,
"eval_samples_per_second": 1665.325,
"eval_steps_per_second": 105.279,
"step": 40000
},
{
"epoch": 66.12903225806451,
"grad_norm": 8.90040111541748,
"learning_rate": 1.1183010752688173e-05,
"loss": 0.7728,
"step": 41000
},
{
"epoch": 66.12903225806451,
"eval_loss": 1.5275108814239502,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3064,
"eval_samples_per_second": 1703.395,
"eval_steps_per_second": 107.686,
"step": 41000
},
{
"epoch": 67.74193548387096,
"grad_norm": 16.836742401123047,
"learning_rate": 1.0967956989247313e-05,
"loss": 0.7309,
"step": 42000
},
{
"epoch": 67.74193548387096,
"eval_loss": 1.6568617820739746,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3052,
"eval_samples_per_second": 1710.328,
"eval_steps_per_second": 108.124,
"step": 42000
},
{
"epoch": 69.35483870967742,
"grad_norm": 16.19956398010254,
"learning_rate": 1.0752903225806453e-05,
"loss": 0.6891,
"step": 43000
},
{
"epoch": 69.35483870967742,
"eval_loss": 1.4376003742218018,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3272,
"eval_samples_per_second": 1595.464,
"eval_steps_per_second": 100.863,
"step": 43000
},
{
"epoch": 70.96774193548387,
"grad_norm": 19.571664810180664,
"learning_rate": 1.0537849462365592e-05,
"loss": 0.6732,
"step": 44000
},
{
"epoch": 70.96774193548387,
"eval_loss": 1.6094655990600586,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3144,
"eval_samples_per_second": 1660.491,
"eval_steps_per_second": 104.974,
"step": 44000
},
{
"epoch": 72.58064516129032,
"grad_norm": 11.60450267791748,
"learning_rate": 1.0322795698924732e-05,
"loss": 0.6475,
"step": 45000
},
{
"epoch": 72.58064516129032,
"eval_loss": 1.569161295890808,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3241,
"eval_samples_per_second": 1610.77,
"eval_steps_per_second": 101.83,
"step": 45000
},
{
"epoch": 74.19354838709677,
"grad_norm": 14.973388671875,
"learning_rate": 1.0107741935483872e-05,
"loss": 0.674,
"step": 46000
},
{
"epoch": 74.19354838709677,
"eval_loss": 1.4532381296157837,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3117,
"eval_samples_per_second": 1674.469,
"eval_steps_per_second": 105.857,
"step": 46000
},
{
"epoch": 75.80645161290323,
"grad_norm": 19.416486740112305,
"learning_rate": 9.892688172043012e-06,
"loss": 0.6339,
"step": 47000
},
{
"epoch": 75.80645161290323,
"eval_loss": 1.5601801872253418,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3055,
"eval_samples_per_second": 1708.783,
"eval_steps_per_second": 108.027,
"step": 47000
},
{
"epoch": 77.41935483870968,
"grad_norm": 12.237533569335938,
"learning_rate": 9.677634408602151e-06,
"loss": 0.628,
"step": 48000
},
{
"epoch": 77.41935483870968,
"eval_loss": 1.5352447032928467,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3243,
"eval_samples_per_second": 1609.448,
"eval_steps_per_second": 101.747,
"step": 48000
},
{
"epoch": 79.03225806451613,
"grad_norm": 8.90131664276123,
"learning_rate": 9.462580645161291e-06,
"loss": 0.6123,
"step": 49000
},
{
"epoch": 79.03225806451613,
"eval_loss": 1.6023005247116089,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3064,
"eval_samples_per_second": 1703.925,
"eval_steps_per_second": 107.719,
"step": 49000
},
{
"epoch": 80.64516129032258,
"grad_norm": 19.542125701904297,
"learning_rate": 9.24752688172043e-06,
"loss": 0.5913,
"step": 50000
},
{
"epoch": 80.64516129032258,
"eval_loss": 1.4985138177871704,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3143,
"eval_samples_per_second": 1660.843,
"eval_steps_per_second": 104.996,
"step": 50000
},
{
"epoch": 82.25806451612904,
"grad_norm": 15.9403715133667,
"learning_rate": 9.03247311827957e-06,
"loss": 0.5919,
"step": 51000
},
{
"epoch": 82.25806451612904,
"eval_loss": 1.557279109954834,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3138,
"eval_samples_per_second": 1663.684,
"eval_steps_per_second": 105.175,
"step": 51000
},
{
"epoch": 83.87096774193549,
"grad_norm": 16.341463088989258,
"learning_rate": 8.81741935483871e-06,
"loss": 0.5849,
"step": 52000
},
{
"epoch": 83.87096774193549,
"eval_loss": 1.744088888168335,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3074,
"eval_samples_per_second": 1698.241,
"eval_steps_per_second": 107.36,
"step": 52000
},
{
"epoch": 85.48387096774194,
"grad_norm": 17.496572494506836,
"learning_rate": 8.60236559139785e-06,
"loss": 0.5798,
"step": 53000
},
{
"epoch": 85.48387096774194,
"eval_loss": 1.5605759620666504,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3312,
"eval_samples_per_second": 1576.078,
"eval_steps_per_second": 99.637,
"step": 53000
},
{
"epoch": 87.09677419354838,
"grad_norm": 22.154132843017578,
"learning_rate": 8.38731182795699e-06,
"loss": 0.5627,
"step": 54000
},
{
"epoch": 87.09677419354838,
"eval_loss": 1.486401081085205,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3046,
"eval_samples_per_second": 1713.463,
"eval_steps_per_second": 108.322,
"step": 54000
},
{
"epoch": 88.70967741935483,
"grad_norm": 12.007641792297363,
"learning_rate": 8.17225806451613e-06,
"loss": 0.5926,
"step": 55000
},
{
"epoch": 88.70967741935483,
"eval_loss": 1.533622145652771,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3189,
"eval_samples_per_second": 1636.753,
"eval_steps_per_second": 103.473,
"step": 55000
},
{
"epoch": 90.3225806451613,
"grad_norm": 16.921255111694336,
"learning_rate": 7.957204301075269e-06,
"loss": 0.5737,
"step": 56000
},
{
"epoch": 90.3225806451613,
"eval_loss": 1.595588207244873,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3248,
"eval_samples_per_second": 1607.001,
"eval_steps_per_second": 101.592,
"step": 56000
},
{
"epoch": 91.93548387096774,
"grad_norm": 14.567840576171875,
"learning_rate": 7.74215053763441e-06,
"loss": 0.5521,
"step": 57000
},
{
"epoch": 91.93548387096774,
"eval_loss": 1.6286988258361816,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.308,
"eval_samples_per_second": 1694.658,
"eval_steps_per_second": 107.134,
"step": 57000
},
{
"epoch": 93.54838709677419,
"grad_norm": 7.83158016204834,
"learning_rate": 7.5270967741935486e-06,
"loss": 0.5672,
"step": 58000
},
{
"epoch": 93.54838709677419,
"eval_loss": 1.6612709760665894,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3047,
"eval_samples_per_second": 1713.282,
"eval_steps_per_second": 108.311,
"step": 58000
},
{
"epoch": 95.16129032258064,
"grad_norm": 20.766202926635742,
"learning_rate": 7.312043010752688e-06,
"loss": 0.5685,
"step": 59000
},
{
"epoch": 95.16129032258064,
"eval_loss": 1.5319266319274902,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3061,
"eval_samples_per_second": 1705.367,
"eval_steps_per_second": 107.811,
"step": 59000
},
{
"epoch": 96.7741935483871,
"grad_norm": 13.834534645080566,
"learning_rate": 7.096989247311829e-06,
"loss": 0.5394,
"step": 60000
},
{
"epoch": 96.7741935483871,
"eval_loss": 1.5068557262420654,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3061,
"eval_samples_per_second": 1705.255,
"eval_steps_per_second": 107.803,
"step": 60000
},
{
"epoch": 98.38709677419355,
"grad_norm": 9.130626678466797,
"learning_rate": 6.881935483870969e-06,
"loss": 0.5095,
"step": 61000
},
{
"epoch": 98.38709677419355,
"eval_loss": 1.4926313161849976,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3074,
"eval_samples_per_second": 1698.19,
"eval_steps_per_second": 107.357,
"step": 61000
},
{
"epoch": 100.0,
"grad_norm": 18.79903793334961,
"learning_rate": 6.666881720430108e-06,
"loss": 0.5327,
"step": 62000
},
{
"epoch": 100.0,
"eval_loss": 1.4378135204315186,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3146,
"eval_samples_per_second": 1659.02,
"eval_steps_per_second": 104.881,
"step": 62000
},
{
"epoch": 101.61290322580645,
"grad_norm": 17.528038024902344,
"learning_rate": 6.451827956989248e-06,
"loss": 0.5108,
"step": 63000
},
{
"epoch": 101.61290322580645,
"eval_loss": 1.4716895818710327,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3118,
"eval_samples_per_second": 1673.899,
"eval_steps_per_second": 105.821,
"step": 63000
},
{
"epoch": 103.2258064516129,
"grad_norm": 9.862174034118652,
"learning_rate": 6.236774193548387e-06,
"loss": 0.4874,
"step": 64000
},
{
"epoch": 103.2258064516129,
"eval_loss": 1.519917368888855,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3105,
"eval_samples_per_second": 1681.31,
"eval_steps_per_second": 106.29,
"step": 64000
},
{
"epoch": 104.83870967741936,
"grad_norm": 11.85350513458252,
"learning_rate": 6.0217204301075275e-06,
"loss": 0.4856,
"step": 65000
},
{
"epoch": 104.83870967741936,
"eval_loss": 1.5175796747207642,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3143,
"eval_samples_per_second": 1661.035,
"eval_steps_per_second": 105.008,
"step": 65000
},
{
"epoch": 106.45161290322581,
"grad_norm": 21.145742416381836,
"learning_rate": 5.806666666666667e-06,
"loss": 0.4665,
"step": 66000
},
{
"epoch": 106.45161290322581,
"eval_loss": 1.5837030410766602,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3108,
"eval_samples_per_second": 1679.611,
"eval_steps_per_second": 106.182,
"step": 66000
},
{
"epoch": 108.06451612903226,
"grad_norm": 8.358002662658691,
"learning_rate": 5.591612903225807e-06,
"loss": 0.4846,
"step": 67000
},
{
"epoch": 108.06451612903226,
"eval_loss": 1.3910651206970215,
"eval_model_preparation_time": 0.0012,
"eval_runtime": 0.3115,
"eval_samples_per_second": 1676.029,
"eval_steps_per_second": 105.956,
"step": 67000
}
],
"logging_steps": 1000,
"max_steps": 93000,
"num_input_tokens_seen": 0,
"num_train_epochs": 150,
"save_steps": 1000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 80,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1978332035428352e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}