square_run_32_batch / trainer_state.json
corranm's picture
End of training
428ed53 verified
{
"best_metric": 1.2568100690841675,
"best_model_checkpoint": "square_run_32_batch/checkpoint-240",
"epoch": 30.0,
"eval_steps": 500,
"global_step": 450,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.13333333333333333,
"grad_norm": 6.848705291748047,
"learning_rate": 4.444444444444445e-06,
"loss": 2.0193,
"step": 2
},
{
"epoch": 0.26666666666666666,
"grad_norm": 5.4228925704956055,
"learning_rate": 8.88888888888889e-06,
"loss": 2.1013,
"step": 4
},
{
"epoch": 0.4,
"grad_norm": 4.7422590255737305,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.9569,
"step": 6
},
{
"epoch": 0.5333333333333333,
"grad_norm": 5.0756025314331055,
"learning_rate": 1.777777777777778e-05,
"loss": 1.9677,
"step": 8
},
{
"epoch": 0.6666666666666666,
"grad_norm": 7.1845269203186035,
"learning_rate": 2.2222222222222223e-05,
"loss": 1.884,
"step": 10
},
{
"epoch": 0.8,
"grad_norm": 4.051375865936279,
"learning_rate": 2.6666666666666667e-05,
"loss": 1.9017,
"step": 12
},
{
"epoch": 0.9333333333333333,
"grad_norm": 5.293440341949463,
"learning_rate": 3.111111111111111e-05,
"loss": 1.9373,
"step": 14
},
{
"epoch": 1.0,
"eval_accuracy": 0.1893939393939394,
"eval_f1_macro": 0.04638218923933209,
"eval_f1_micro": 0.1893939393939394,
"eval_f1_weighted": 0.06149153876426603,
"eval_loss": 1.8818118572235107,
"eval_precision_macro": 0.027685492801771874,
"eval_precision_micro": 0.1893939393939394,
"eval_precision_weighted": 0.03670425182053089,
"eval_recall_macro": 0.14285714285714285,
"eval_recall_micro": 0.1893939393939394,
"eval_recall_weighted": 0.1893939393939394,
"eval_runtime": 2.2684,
"eval_samples_per_second": 58.192,
"eval_steps_per_second": 2.204,
"step": 15
},
{
"epoch": 1.0666666666666667,
"grad_norm": 3.4230170249938965,
"learning_rate": 3.555555555555556e-05,
"loss": 1.9179,
"step": 16
},
{
"epoch": 1.2,
"grad_norm": 3.4030396938323975,
"learning_rate": 4e-05,
"loss": 1.8139,
"step": 18
},
{
"epoch": 1.3333333333333333,
"grad_norm": 4.195278167724609,
"learning_rate": 4.4444444444444447e-05,
"loss": 1.8941,
"step": 20
},
{
"epoch": 1.4666666666666668,
"grad_norm": 3.2356927394866943,
"learning_rate": 4.888888888888889e-05,
"loss": 1.8915,
"step": 22
},
{
"epoch": 1.6,
"grad_norm": 3.322704315185547,
"learning_rate": 5.333333333333333e-05,
"loss": 1.917,
"step": 24
},
{
"epoch": 1.7333333333333334,
"grad_norm": 3.293910026550293,
"learning_rate": 5.7777777777777776e-05,
"loss": 1.8943,
"step": 26
},
{
"epoch": 1.8666666666666667,
"grad_norm": 4.803905487060547,
"learning_rate": 6.222222222222222e-05,
"loss": 1.8841,
"step": 28
},
{
"epoch": 2.0,
"grad_norm": 4.006722927093506,
"learning_rate": 6.666666666666667e-05,
"loss": 1.869,
"step": 30
},
{
"epoch": 2.0,
"eval_accuracy": 0.26515151515151514,
"eval_f1_macro": 0.10998877665544333,
"eval_f1_micro": 0.26515151515151514,
"eval_f1_weighted": 0.14177124783185388,
"eval_loss": 1.864223599433899,
"eval_precision_macro": 0.075,
"eval_precision_micro": 0.26515151515151514,
"eval_precision_weighted": 0.09678030303030305,
"eval_recall_macro": 0.20634920634920634,
"eval_recall_micro": 0.26515151515151514,
"eval_recall_weighted": 0.26515151515151514,
"eval_runtime": 1.8754,
"eval_samples_per_second": 70.384,
"eval_steps_per_second": 2.666,
"step": 30
},
{
"epoch": 2.1333333333333333,
"grad_norm": 2.15875506401062,
"learning_rate": 7.111111111111112e-05,
"loss": 1.8686,
"step": 32
},
{
"epoch": 2.2666666666666666,
"grad_norm": 2.8864150047302246,
"learning_rate": 7.555555555555556e-05,
"loss": 1.8652,
"step": 34
},
{
"epoch": 2.4,
"grad_norm": 3.819974899291992,
"learning_rate": 8e-05,
"loss": 1.818,
"step": 36
},
{
"epoch": 2.533333333333333,
"grad_norm": 7.35491418838501,
"learning_rate": 8.444444444444444e-05,
"loss": 1.9347,
"step": 38
},
{
"epoch": 2.6666666666666665,
"grad_norm": 5.84605598449707,
"learning_rate": 8.888888888888889e-05,
"loss": 1.8508,
"step": 40
},
{
"epoch": 2.8,
"grad_norm": 2.4050137996673584,
"learning_rate": 9.333333333333334e-05,
"loss": 1.8884,
"step": 42
},
{
"epoch": 2.9333333333333336,
"grad_norm": 5.182938575744629,
"learning_rate": 9.777777777777778e-05,
"loss": 1.9218,
"step": 44
},
{
"epoch": 3.0,
"eval_accuracy": 0.25757575757575757,
"eval_f1_macro": 0.11628985865833667,
"eval_f1_micro": 0.25757575757575757,
"eval_f1_weighted": 0.1459780747505363,
"eval_loss": 1.8754385709762573,
"eval_precision_macro": 0.131615925058548,
"eval_precision_micro": 0.25757575757575757,
"eval_precision_weighted": 0.15663189269746647,
"eval_recall_macro": 0.19047619047619047,
"eval_recall_micro": 0.25757575757575757,
"eval_recall_weighted": 0.25757575757575757,
"eval_runtime": 1.9051,
"eval_samples_per_second": 69.286,
"eval_steps_per_second": 2.624,
"step": 45
},
{
"epoch": 3.066666666666667,
"grad_norm": 5.7963128089904785,
"learning_rate": 9.97530864197531e-05,
"loss": 1.9754,
"step": 46
},
{
"epoch": 3.2,
"grad_norm": 5.1605048179626465,
"learning_rate": 9.925925925925926e-05,
"loss": 1.8577,
"step": 48
},
{
"epoch": 3.3333333333333335,
"grad_norm": 3.756355047225952,
"learning_rate": 9.876543209876543e-05,
"loss": 1.8707,
"step": 50
},
{
"epoch": 3.466666666666667,
"grad_norm": 2.5353710651397705,
"learning_rate": 9.827160493827162e-05,
"loss": 1.7918,
"step": 52
},
{
"epoch": 3.6,
"grad_norm": 4.181753635406494,
"learning_rate": 9.777777777777778e-05,
"loss": 1.8251,
"step": 54
},
{
"epoch": 3.7333333333333334,
"grad_norm": 2.4634644985198975,
"learning_rate": 9.728395061728396e-05,
"loss": 1.7713,
"step": 56
},
{
"epoch": 3.8666666666666667,
"grad_norm": 8.700553894042969,
"learning_rate": 9.679012345679013e-05,
"loss": 1.8962,
"step": 58
},
{
"epoch": 4.0,
"grad_norm": 5.921916484832764,
"learning_rate": 9.62962962962963e-05,
"loss": 1.6733,
"step": 60
},
{
"epoch": 4.0,
"eval_accuracy": 0.38636363636363635,
"eval_f1_macro": 0.2445293836598184,
"eval_f1_micro": 0.38636363636363635,
"eval_f1_weighted": 0.3052538765582244,
"eval_loss": 1.6881486177444458,
"eval_precision_macro": 0.24274221103966703,
"eval_precision_micro": 0.38636363636363635,
"eval_precision_weighted": 0.2917426054412356,
"eval_recall_macro": 0.2992441421012849,
"eval_recall_micro": 0.38636363636363635,
"eval_recall_weighted": 0.38636363636363635,
"eval_runtime": 1.8719,
"eval_samples_per_second": 70.516,
"eval_steps_per_second": 2.671,
"step": 60
},
{
"epoch": 4.133333333333334,
"grad_norm": 3.3924758434295654,
"learning_rate": 9.580246913580247e-05,
"loss": 1.5941,
"step": 62
},
{
"epoch": 4.266666666666667,
"grad_norm": 6.785348415374756,
"learning_rate": 9.530864197530865e-05,
"loss": 1.582,
"step": 64
},
{
"epoch": 4.4,
"grad_norm": 4.813143730163574,
"learning_rate": 9.481481481481483e-05,
"loss": 1.649,
"step": 66
},
{
"epoch": 4.533333333333333,
"grad_norm": 5.351255893707275,
"learning_rate": 9.432098765432099e-05,
"loss": 1.6271,
"step": 68
},
{
"epoch": 4.666666666666667,
"grad_norm": 11.194862365722656,
"learning_rate": 9.382716049382717e-05,
"loss": 1.7395,
"step": 70
},
{
"epoch": 4.8,
"grad_norm": 6.263866424560547,
"learning_rate": 9.333333333333334e-05,
"loss": 1.4422,
"step": 72
},
{
"epoch": 4.933333333333334,
"grad_norm": 8.602386474609375,
"learning_rate": 9.28395061728395e-05,
"loss": 1.54,
"step": 74
},
{
"epoch": 5.0,
"eval_accuracy": 0.42424242424242425,
"eval_f1_macro": 0.32515713851372885,
"eval_f1_micro": 0.42424242424242425,
"eval_f1_weighted": 0.38558697740383735,
"eval_loss": 1.5528110265731812,
"eval_precision_macro": 0.34291374508765815,
"eval_precision_micro": 0.42424242424242425,
"eval_precision_weighted": 0.4100833883442579,
"eval_recall_macro": 0.35698412698412696,
"eval_recall_micro": 0.42424242424242425,
"eval_recall_weighted": 0.42424242424242425,
"eval_runtime": 1.9443,
"eval_samples_per_second": 67.891,
"eval_steps_per_second": 2.572,
"step": 75
},
{
"epoch": 5.066666666666666,
"grad_norm": 7.738183498382568,
"learning_rate": 9.234567901234568e-05,
"loss": 1.6152,
"step": 76
},
{
"epoch": 5.2,
"grad_norm": 7.564102649688721,
"learning_rate": 9.185185185185186e-05,
"loss": 1.4993,
"step": 78
},
{
"epoch": 5.333333333333333,
"grad_norm": 8.335043907165527,
"learning_rate": 9.135802469135802e-05,
"loss": 1.494,
"step": 80
},
{
"epoch": 5.466666666666667,
"grad_norm": 6.382967948913574,
"learning_rate": 9.08641975308642e-05,
"loss": 1.4944,
"step": 82
},
{
"epoch": 5.6,
"grad_norm": 7.259094715118408,
"learning_rate": 9.037037037037038e-05,
"loss": 1.3191,
"step": 84
},
{
"epoch": 5.733333333333333,
"grad_norm": 4.972009658813477,
"learning_rate": 8.987654320987655e-05,
"loss": 1.3894,
"step": 86
},
{
"epoch": 5.866666666666667,
"grad_norm": 9.250694274902344,
"learning_rate": 8.938271604938272e-05,
"loss": 1.3979,
"step": 88
},
{
"epoch": 6.0,
"grad_norm": 7.207069396972656,
"learning_rate": 8.888888888888889e-05,
"loss": 1.4418,
"step": 90
},
{
"epoch": 6.0,
"eval_accuracy": 0.38636363636363635,
"eval_f1_macro": 0.285838283865586,
"eval_f1_micro": 0.38636363636363635,
"eval_f1_weighted": 0.3212562379097532,
"eval_loss": 1.5736558437347412,
"eval_precision_macro": 0.284608858206039,
"eval_precision_micro": 0.38636363636363635,
"eval_precision_weighted": 0.3242631096693543,
"eval_recall_macro": 0.3398034769463341,
"eval_recall_micro": 0.38636363636363635,
"eval_recall_weighted": 0.38636363636363635,
"eval_runtime": 1.8593,
"eval_samples_per_second": 70.995,
"eval_steps_per_second": 2.689,
"step": 90
},
{
"epoch": 6.133333333333334,
"grad_norm": 4.753687381744385,
"learning_rate": 8.839506172839507e-05,
"loss": 1.3218,
"step": 92
},
{
"epoch": 6.266666666666667,
"grad_norm": 5.942229747772217,
"learning_rate": 8.790123456790123e-05,
"loss": 1.3995,
"step": 94
},
{
"epoch": 6.4,
"grad_norm": 4.026015281677246,
"learning_rate": 8.740740740740741e-05,
"loss": 1.3155,
"step": 96
},
{
"epoch": 6.533333333333333,
"grad_norm": 4.893887042999268,
"learning_rate": 8.691358024691359e-05,
"loss": 1.2009,
"step": 98
},
{
"epoch": 6.666666666666667,
"grad_norm": 3.904926061630249,
"learning_rate": 8.641975308641975e-05,
"loss": 0.98,
"step": 100
},
{
"epoch": 6.8,
"grad_norm": 4.266864776611328,
"learning_rate": 8.592592592592593e-05,
"loss": 1.0803,
"step": 102
},
{
"epoch": 6.933333333333334,
"grad_norm": 5.24403190612793,
"learning_rate": 8.54320987654321e-05,
"loss": 0.8592,
"step": 104
},
{
"epoch": 7.0,
"eval_accuracy": 0.4393939393939394,
"eval_f1_macro": 0.3443599467808913,
"eval_f1_micro": 0.4393939393939394,
"eval_f1_weighted": 0.39645908811500513,
"eval_loss": 1.5408130884170532,
"eval_precision_macro": 0.32083233878346656,
"eval_precision_micro": 0.4393939393939394,
"eval_precision_weighted": 0.36735850041771095,
"eval_recall_macro": 0.37913832199546477,
"eval_recall_micro": 0.4393939393939394,
"eval_recall_weighted": 0.4393939393939394,
"eval_runtime": 1.9485,
"eval_samples_per_second": 67.743,
"eval_steps_per_second": 2.566,
"step": 105
},
{
"epoch": 7.066666666666666,
"grad_norm": 5.595825672149658,
"learning_rate": 8.493827160493828e-05,
"loss": 1.0203,
"step": 106
},
{
"epoch": 7.2,
"grad_norm": 5.34617805480957,
"learning_rate": 8.444444444444444e-05,
"loss": 1.0819,
"step": 108
},
{
"epoch": 7.333333333333333,
"grad_norm": 6.987905025482178,
"learning_rate": 8.395061728395062e-05,
"loss": 1.1165,
"step": 110
},
{
"epoch": 7.466666666666667,
"grad_norm": 6.039572715759277,
"learning_rate": 8.34567901234568e-05,
"loss": 1.0403,
"step": 112
},
{
"epoch": 7.6,
"grad_norm": 6.031858444213867,
"learning_rate": 8.296296296296296e-05,
"loss": 0.9709,
"step": 114
},
{
"epoch": 7.733333333333333,
"grad_norm": 6.656283855438232,
"learning_rate": 8.246913580246915e-05,
"loss": 0.8358,
"step": 116
},
{
"epoch": 7.866666666666667,
"grad_norm": 6.286685943603516,
"learning_rate": 8.197530864197531e-05,
"loss": 1.146,
"step": 118
},
{
"epoch": 8.0,
"grad_norm": 9.892986297607422,
"learning_rate": 8.148148148148148e-05,
"loss": 1.1427,
"step": 120
},
{
"epoch": 8.0,
"eval_accuracy": 0.5606060606060606,
"eval_f1_macro": 0.46377203827822905,
"eval_f1_micro": 0.5606060606060606,
"eval_f1_weighted": 0.5317054176401353,
"eval_loss": 1.2803829908370972,
"eval_precision_macro": 0.469819473380193,
"eval_precision_micro": 0.5606060606060606,
"eval_precision_weighted": 0.5280005916463256,
"eval_recall_macro": 0.4830687830687831,
"eval_recall_micro": 0.5606060606060606,
"eval_recall_weighted": 0.5606060606060606,
"eval_runtime": 1.9474,
"eval_samples_per_second": 67.784,
"eval_steps_per_second": 2.568,
"step": 120
},
{
"epoch": 8.133333333333333,
"grad_norm": 4.904130458831787,
"learning_rate": 8.098765432098767e-05,
"loss": 0.8933,
"step": 122
},
{
"epoch": 8.266666666666667,
"grad_norm": 4.419686794281006,
"learning_rate": 8.049382716049383e-05,
"loss": 0.9245,
"step": 124
},
{
"epoch": 8.4,
"grad_norm": 8.33668041229248,
"learning_rate": 8e-05,
"loss": 0.8385,
"step": 126
},
{
"epoch": 8.533333333333333,
"grad_norm": 8.35203742980957,
"learning_rate": 7.950617283950618e-05,
"loss": 0.9428,
"step": 128
},
{
"epoch": 8.666666666666666,
"grad_norm": 5.724539279937744,
"learning_rate": 7.901234567901235e-05,
"loss": 0.7591,
"step": 130
},
{
"epoch": 8.8,
"grad_norm": 8.662413597106934,
"learning_rate": 7.851851851851852e-05,
"loss": 0.995,
"step": 132
},
{
"epoch": 8.933333333333334,
"grad_norm": 5.197690010070801,
"learning_rate": 7.802469135802469e-05,
"loss": 0.7849,
"step": 134
},
{
"epoch": 9.0,
"eval_accuracy": 0.553030303030303,
"eval_f1_macro": 0.46486536691732006,
"eval_f1_micro": 0.553030303030303,
"eval_f1_weighted": 0.529141811901771,
"eval_loss": 1.2879999876022339,
"eval_precision_macro": 0.48036078903674717,
"eval_precision_micro": 0.553030303030303,
"eval_precision_weighted": 0.540143107077697,
"eval_recall_macro": 0.4822675736961451,
"eval_recall_micro": 0.553030303030303,
"eval_recall_weighted": 0.553030303030303,
"eval_runtime": 1.9912,
"eval_samples_per_second": 66.292,
"eval_steps_per_second": 2.511,
"step": 135
},
{
"epoch": 9.066666666666666,
"grad_norm": 7.03670597076416,
"learning_rate": 7.753086419753088e-05,
"loss": 0.8049,
"step": 136
},
{
"epoch": 9.2,
"grad_norm": 5.591729640960693,
"learning_rate": 7.703703703703704e-05,
"loss": 0.9341,
"step": 138
},
{
"epoch": 9.333333333333334,
"grad_norm": 6.677962303161621,
"learning_rate": 7.65432098765432e-05,
"loss": 0.7679,
"step": 140
},
{
"epoch": 9.466666666666667,
"grad_norm": 5.4789934158325195,
"learning_rate": 7.60493827160494e-05,
"loss": 0.7773,
"step": 142
},
{
"epoch": 9.6,
"grad_norm": 5.957266330718994,
"learning_rate": 7.555555555555556e-05,
"loss": 0.638,
"step": 144
},
{
"epoch": 9.733333333333333,
"grad_norm": 5.691118240356445,
"learning_rate": 7.506172839506173e-05,
"loss": 0.7762,
"step": 146
},
{
"epoch": 9.866666666666667,
"grad_norm": 6.8899827003479,
"learning_rate": 7.456790123456791e-05,
"loss": 0.9012,
"step": 148
},
{
"epoch": 10.0,
"grad_norm": 7.408969402313232,
"learning_rate": 7.407407407407407e-05,
"loss": 0.6846,
"step": 150
},
{
"epoch": 10.0,
"eval_accuracy": 0.5151515151515151,
"eval_f1_macro": 0.42983280392444223,
"eval_f1_micro": 0.5151515151515151,
"eval_f1_weighted": 0.48105498068393227,
"eval_loss": 1.3130199909210205,
"eval_precision_macro": 0.4404005812415951,
"eval_precision_micro": 0.5151515151515151,
"eval_precision_weighted": 0.5005354338015628,
"eval_recall_macro": 0.4670748299319728,
"eval_recall_micro": 0.5151515151515151,
"eval_recall_weighted": 0.5151515151515151,
"eval_runtime": 1.9855,
"eval_samples_per_second": 66.482,
"eval_steps_per_second": 2.518,
"step": 150
},
{
"epoch": 10.133333333333333,
"grad_norm": 5.070552825927734,
"learning_rate": 7.358024691358025e-05,
"loss": 0.6116,
"step": 152
},
{
"epoch": 10.266666666666667,
"grad_norm": 4.844223499298096,
"learning_rate": 7.308641975308643e-05,
"loss": 0.6517,
"step": 154
},
{
"epoch": 10.4,
"grad_norm": 3.965522289276123,
"learning_rate": 7.25925925925926e-05,
"loss": 0.5573,
"step": 156
},
{
"epoch": 10.533333333333333,
"grad_norm": 7.53262996673584,
"learning_rate": 7.209876543209877e-05,
"loss": 0.7258,
"step": 158
},
{
"epoch": 10.666666666666666,
"grad_norm": 6.725161552429199,
"learning_rate": 7.160493827160494e-05,
"loss": 0.8109,
"step": 160
},
{
"epoch": 10.8,
"grad_norm": 8.250865936279297,
"learning_rate": 7.111111111111112e-05,
"loss": 0.8596,
"step": 162
},
{
"epoch": 10.933333333333334,
"grad_norm": 4.163515567779541,
"learning_rate": 7.061728395061728e-05,
"loss": 0.4006,
"step": 164
},
{
"epoch": 11.0,
"eval_accuracy": 0.5833333333333334,
"eval_f1_macro": 0.49308835780529725,
"eval_f1_micro": 0.5833333333333334,
"eval_f1_weighted": 0.5597960736751899,
"eval_loss": 1.295769214630127,
"eval_precision_macro": 0.498317425896604,
"eval_precision_micro": 0.5833333333333334,
"eval_precision_weighted": 0.5756076561299337,
"eval_recall_macro": 0.5229024943310657,
"eval_recall_micro": 0.5833333333333334,
"eval_recall_weighted": 0.5833333333333334,
"eval_runtime": 1.9133,
"eval_samples_per_second": 68.991,
"eval_steps_per_second": 2.613,
"step": 165
},
{
"epoch": 11.066666666666666,
"grad_norm": 4.829576015472412,
"learning_rate": 7.012345679012346e-05,
"loss": 0.6355,
"step": 166
},
{
"epoch": 11.2,
"grad_norm": 5.353898525238037,
"learning_rate": 6.962962962962964e-05,
"loss": 0.4955,
"step": 168
},
{
"epoch": 11.333333333333334,
"grad_norm": 5.44912052154541,
"learning_rate": 6.91358024691358e-05,
"loss": 0.4833,
"step": 170
},
{
"epoch": 11.466666666666667,
"grad_norm": 5.900742530822754,
"learning_rate": 6.864197530864198e-05,
"loss": 0.5752,
"step": 172
},
{
"epoch": 11.6,
"grad_norm": 6.004303455352783,
"learning_rate": 6.814814814814815e-05,
"loss": 0.5738,
"step": 174
},
{
"epoch": 11.733333333333333,
"grad_norm": 3.937319040298462,
"learning_rate": 6.765432098765433e-05,
"loss": 0.4661,
"step": 176
},
{
"epoch": 11.866666666666667,
"grad_norm": 4.814683437347412,
"learning_rate": 6.716049382716049e-05,
"loss": 0.5694,
"step": 178
},
{
"epoch": 12.0,
"grad_norm": 6.7769880294799805,
"learning_rate": 6.666666666666667e-05,
"loss": 0.4329,
"step": 180
},
{
"epoch": 12.0,
"eval_accuracy": 0.553030303030303,
"eval_f1_macro": 0.506246746427407,
"eval_f1_micro": 0.553030303030303,
"eval_f1_weighted": 0.5561970515744254,
"eval_loss": 1.299007773399353,
"eval_precision_macro": 0.5314684490530354,
"eval_precision_micro": 0.553030303030303,
"eval_precision_weighted": 0.5874290165244113,
"eval_recall_macro": 0.5133106575963718,
"eval_recall_micro": 0.553030303030303,
"eval_recall_weighted": 0.553030303030303,
"eval_runtime": 2.0372,
"eval_samples_per_second": 64.793,
"eval_steps_per_second": 2.454,
"step": 180
},
{
"epoch": 12.133333333333333,
"grad_norm": 5.787886619567871,
"learning_rate": 6.617283950617285e-05,
"loss": 0.5719,
"step": 182
},
{
"epoch": 12.266666666666667,
"grad_norm": 2.843268632888794,
"learning_rate": 6.567901234567901e-05,
"loss": 0.4646,
"step": 184
},
{
"epoch": 12.4,
"grad_norm": 4.530274391174316,
"learning_rate": 6.51851851851852e-05,
"loss": 0.3544,
"step": 186
},
{
"epoch": 12.533333333333333,
"grad_norm": 5.348933696746826,
"learning_rate": 6.469135802469136e-05,
"loss": 0.3957,
"step": 188
},
{
"epoch": 12.666666666666666,
"grad_norm": 7.746328830718994,
"learning_rate": 6.419753086419753e-05,
"loss": 0.4989,
"step": 190
},
{
"epoch": 12.8,
"grad_norm": 6.134746074676514,
"learning_rate": 6.37037037037037e-05,
"loss": 0.7035,
"step": 192
},
{
"epoch": 12.933333333333334,
"grad_norm": 5.567310810089111,
"learning_rate": 6.320987654320988e-05,
"loss": 0.482,
"step": 194
},
{
"epoch": 13.0,
"eval_accuracy": 0.5151515151515151,
"eval_f1_macro": 0.4842067834885892,
"eval_f1_micro": 0.5151515151515151,
"eval_f1_weighted": 0.5233183119383529,
"eval_loss": 1.3830989599227905,
"eval_precision_macro": 0.5517290249433106,
"eval_precision_micro": 0.5151515151515151,
"eval_precision_weighted": 0.5803270803270804,
"eval_recall_macro": 0.48390778533635675,
"eval_recall_micro": 0.5151515151515151,
"eval_recall_weighted": 0.5151515151515151,
"eval_runtime": 2.882,
"eval_samples_per_second": 45.801,
"eval_steps_per_second": 1.735,
"step": 195
},
{
"epoch": 13.066666666666666,
"grad_norm": 6.7704386711120605,
"learning_rate": 6.271604938271606e-05,
"loss": 0.5136,
"step": 196
},
{
"epoch": 13.2,
"grad_norm": 5.41668701171875,
"learning_rate": 6.222222222222222e-05,
"loss": 0.4843,
"step": 198
},
{
"epoch": 13.333333333333334,
"grad_norm": 4.7562150955200195,
"learning_rate": 6.17283950617284e-05,
"loss": 0.3338,
"step": 200
},
{
"epoch": 13.466666666666667,
"grad_norm": 4.077147960662842,
"learning_rate": 6.123456790123457e-05,
"loss": 0.2694,
"step": 202
},
{
"epoch": 13.6,
"grad_norm": 4.678223609924316,
"learning_rate": 6.074074074074074e-05,
"loss": 0.2965,
"step": 204
},
{
"epoch": 13.733333333333333,
"grad_norm": 6.246657371520996,
"learning_rate": 6.024691358024692e-05,
"loss": 0.489,
"step": 206
},
{
"epoch": 13.866666666666667,
"grad_norm": 4.0403971672058105,
"learning_rate": 5.975308641975309e-05,
"loss": 0.3524,
"step": 208
},
{
"epoch": 14.0,
"grad_norm": 11.723469734191895,
"learning_rate": 5.925925925925926e-05,
"loss": 0.6409,
"step": 210
},
{
"epoch": 14.0,
"eval_accuracy": 0.5984848484848485,
"eval_f1_macro": 0.5080833548412379,
"eval_f1_micro": 0.5984848484848485,
"eval_f1_weighted": 0.576454835403795,
"eval_loss": 1.4066194295883179,
"eval_precision_macro": 0.5193577256077255,
"eval_precision_micro": 0.5984848484848485,
"eval_precision_weighted": 0.5819911307127215,
"eval_recall_macro": 0.5231594860166289,
"eval_recall_micro": 0.5984848484848485,
"eval_recall_weighted": 0.5984848484848485,
"eval_runtime": 4.8101,
"eval_samples_per_second": 27.442,
"eval_steps_per_second": 1.039,
"step": 210
},
{
"epoch": 14.133333333333333,
"grad_norm": 4.278630256652832,
"learning_rate": 5.8765432098765437e-05,
"loss": 0.1963,
"step": 212
},
{
"epoch": 14.266666666666667,
"grad_norm": 5.803009510040283,
"learning_rate": 5.8271604938271607e-05,
"loss": 0.4284,
"step": 214
},
{
"epoch": 14.4,
"grad_norm": 4.886916160583496,
"learning_rate": 5.7777777777777776e-05,
"loss": 0.3091,
"step": 216
},
{
"epoch": 14.533333333333333,
"grad_norm": 6.119672775268555,
"learning_rate": 5.728395061728395e-05,
"loss": 0.3287,
"step": 218
},
{
"epoch": 14.666666666666666,
"grad_norm": 7.14682149887085,
"learning_rate": 5.679012345679012e-05,
"loss": 0.2819,
"step": 220
},
{
"epoch": 14.8,
"grad_norm": 5.075103282928467,
"learning_rate": 5.62962962962963e-05,
"loss": 0.2101,
"step": 222
},
{
"epoch": 14.933333333333334,
"grad_norm": 4.5539045333862305,
"learning_rate": 5.580246913580247e-05,
"loss": 0.3206,
"step": 224
},
{
"epoch": 15.0,
"eval_accuracy": 0.5606060606060606,
"eval_f1_macro": 0.5154896879386676,
"eval_f1_micro": 0.5606060606060606,
"eval_f1_weighted": 0.5520090359376074,
"eval_loss": 1.3689966201782227,
"eval_precision_macro": 0.6158199643493761,
"eval_precision_micro": 0.5606060606060606,
"eval_precision_weighted": 0.5889932074758278,
"eval_recall_macro": 0.5170219198790628,
"eval_recall_micro": 0.5606060606060606,
"eval_recall_weighted": 0.5606060606060606,
"eval_runtime": 2.0949,
"eval_samples_per_second": 63.009,
"eval_steps_per_second": 2.387,
"step": 225
},
{
"epoch": 15.066666666666666,
"grad_norm": 4.093947887420654,
"learning_rate": 5.530864197530864e-05,
"loss": 0.3352,
"step": 226
},
{
"epoch": 15.2,
"grad_norm": 5.242745876312256,
"learning_rate": 5.4814814814814817e-05,
"loss": 0.2066,
"step": 228
},
{
"epoch": 15.333333333333334,
"grad_norm": 5.613947868347168,
"learning_rate": 5.4320987654320986e-05,
"loss": 0.3504,
"step": 230
},
{
"epoch": 15.466666666666667,
"grad_norm": 3.4319839477539062,
"learning_rate": 5.382716049382717e-05,
"loss": 0.2294,
"step": 232
},
{
"epoch": 15.6,
"grad_norm": 6.01231575012207,
"learning_rate": 5.333333333333333e-05,
"loss": 0.2498,
"step": 234
},
{
"epoch": 15.733333333333333,
"grad_norm": 3.9071357250213623,
"learning_rate": 5.28395061728395e-05,
"loss": 0.2092,
"step": 236
},
{
"epoch": 15.866666666666667,
"grad_norm": 5.718769550323486,
"learning_rate": 5.234567901234568e-05,
"loss": 0.2223,
"step": 238
},
{
"epoch": 16.0,
"grad_norm": 4.071746349334717,
"learning_rate": 5.185185185185185e-05,
"loss": 0.1773,
"step": 240
},
{
"epoch": 16.0,
"eval_accuracy": 0.6515151515151515,
"eval_f1_macro": 0.592019301793738,
"eval_f1_micro": 0.6515151515151515,
"eval_f1_weighted": 0.6407837434153223,
"eval_loss": 1.2568100690841675,
"eval_precision_macro": 0.6893528941196284,
"eval_precision_micro": 0.6515151515151515,
"eval_precision_weighted": 0.6623135907365115,
"eval_recall_macro": 0.5842857142857143,
"eval_recall_micro": 0.6515151515151515,
"eval_recall_weighted": 0.6515151515151515,
"eval_runtime": 1.9927,
"eval_samples_per_second": 66.24,
"eval_steps_per_second": 2.509,
"step": 240
},
{
"epoch": 16.133333333333333,
"grad_norm": 3.561516761779785,
"learning_rate": 5.135802469135803e-05,
"loss": 0.1696,
"step": 242
},
{
"epoch": 16.266666666666666,
"grad_norm": 1.3526779413223267,
"learning_rate": 5.0864197530864197e-05,
"loss": 0.0665,
"step": 244
},
{
"epoch": 16.4,
"grad_norm": 4.29080057144165,
"learning_rate": 5.0370370370370366e-05,
"loss": 0.195,
"step": 246
},
{
"epoch": 16.533333333333335,
"grad_norm": 6.229769706726074,
"learning_rate": 4.987654320987655e-05,
"loss": 0.2993,
"step": 248
},
{
"epoch": 16.666666666666668,
"grad_norm": 4.949665546417236,
"learning_rate": 4.938271604938271e-05,
"loss": 0.2081,
"step": 250
},
{
"epoch": 16.8,
"grad_norm": 6.123852252960205,
"learning_rate": 4.888888888888889e-05,
"loss": 0.212,
"step": 252
},
{
"epoch": 16.933333333333334,
"grad_norm": 4.0239105224609375,
"learning_rate": 4.8395061728395067e-05,
"loss": 0.3259,
"step": 254
},
{
"epoch": 17.0,
"eval_accuracy": 0.6060606060606061,
"eval_f1_macro": 0.5467242234296787,
"eval_f1_micro": 0.6060606060606061,
"eval_f1_weighted": 0.5961390083174005,
"eval_loss": 1.3405537605285645,
"eval_precision_macro": 0.5614736217067472,
"eval_precision_micro": 0.6060606060606061,
"eval_precision_weighted": 0.6033042542530208,
"eval_recall_macro": 0.5466817838246409,
"eval_recall_micro": 0.6060606060606061,
"eval_recall_weighted": 0.6060606060606061,
"eval_runtime": 2.0502,
"eval_samples_per_second": 64.382,
"eval_steps_per_second": 2.439,
"step": 255
},
{
"epoch": 17.066666666666666,
"grad_norm": 1.4321271181106567,
"learning_rate": 4.7901234567901237e-05,
"loss": 0.055,
"step": 256
},
{
"epoch": 17.2,
"grad_norm": 2.13454008102417,
"learning_rate": 4.740740740740741e-05,
"loss": 0.1221,
"step": 258
},
{
"epoch": 17.333333333333332,
"grad_norm": 5.276524066925049,
"learning_rate": 4.691358024691358e-05,
"loss": 0.1417,
"step": 260
},
{
"epoch": 17.466666666666665,
"grad_norm": 3.8555052280426025,
"learning_rate": 4.641975308641975e-05,
"loss": 0.2943,
"step": 262
},
{
"epoch": 17.6,
"grad_norm": 4.094534873962402,
"learning_rate": 4.592592592592593e-05,
"loss": 0.2206,
"step": 264
},
{
"epoch": 17.733333333333334,
"grad_norm": 4.184159278869629,
"learning_rate": 4.54320987654321e-05,
"loss": 0.1565,
"step": 266
},
{
"epoch": 17.866666666666667,
"grad_norm": 5.283144474029541,
"learning_rate": 4.493827160493828e-05,
"loss": 0.1427,
"step": 268
},
{
"epoch": 18.0,
"grad_norm": 3.6470813751220703,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.1123,
"step": 270
},
{
"epoch": 18.0,
"eval_accuracy": 0.6363636363636364,
"eval_f1_macro": 0.5867719657675725,
"eval_f1_micro": 0.6363636363636364,
"eval_f1_weighted": 0.6305501232595613,
"eval_loss": 1.376707911491394,
"eval_precision_macro": 0.6257631257631259,
"eval_precision_micro": 0.6363636363636364,
"eval_precision_weighted": 0.6413447663447664,
"eval_recall_macro": 0.5785109599395314,
"eval_recall_micro": 0.6363636363636364,
"eval_recall_weighted": 0.6363636363636364,
"eval_runtime": 1.992,
"eval_samples_per_second": 66.266,
"eval_steps_per_second": 2.51,
"step": 270
},
{
"epoch": 18.133333333333333,
"grad_norm": 3.1710643768310547,
"learning_rate": 4.3950617283950617e-05,
"loss": 0.1219,
"step": 272
},
{
"epoch": 18.266666666666666,
"grad_norm": 7.098196506500244,
"learning_rate": 4.345679012345679e-05,
"loss": 0.1588,
"step": 274
},
{
"epoch": 18.4,
"grad_norm": 1.8567241430282593,
"learning_rate": 4.296296296296296e-05,
"loss": 0.1043,
"step": 276
},
{
"epoch": 18.533333333333335,
"grad_norm": 2.1221156120300293,
"learning_rate": 4.246913580246914e-05,
"loss": 0.0748,
"step": 278
},
{
"epoch": 18.666666666666668,
"grad_norm": 3.03196120262146,
"learning_rate": 4.197530864197531e-05,
"loss": 0.1148,
"step": 280
},
{
"epoch": 18.8,
"grad_norm": 1.7942876815795898,
"learning_rate": 4.148148148148148e-05,
"loss": 0.0679,
"step": 282
},
{
"epoch": 18.933333333333334,
"grad_norm": 4.499013900756836,
"learning_rate": 4.0987654320987657e-05,
"loss": 0.1129,
"step": 284
},
{
"epoch": 19.0,
"eval_accuracy": 0.6439393939393939,
"eval_f1_macro": 0.587916778045086,
"eval_f1_micro": 0.6439393939393939,
"eval_f1_weighted": 0.6305576751206262,
"eval_loss": 1.4679865837097168,
"eval_precision_macro": 0.6809288563910413,
"eval_precision_micro": 0.6439393939393939,
"eval_precision_weighted": 0.6932697872537444,
"eval_recall_macro": 0.5806046863189721,
"eval_recall_micro": 0.6439393939393939,
"eval_recall_weighted": 0.6439393939393939,
"eval_runtime": 1.9847,
"eval_samples_per_second": 66.508,
"eval_steps_per_second": 2.519,
"step": 285
},
{
"epoch": 19.066666666666666,
"grad_norm": 2.631176233291626,
"learning_rate": 4.049382716049383e-05,
"loss": 0.1028,
"step": 286
},
{
"epoch": 19.2,
"grad_norm": 4.930914402008057,
"learning_rate": 4e-05,
"loss": 0.2555,
"step": 288
},
{
"epoch": 19.333333333333332,
"grad_norm": 3.355149745941162,
"learning_rate": 3.950617283950617e-05,
"loss": 0.0792,
"step": 290
},
{
"epoch": 19.466666666666665,
"grad_norm": 2.2780933380126953,
"learning_rate": 3.901234567901234e-05,
"loss": 0.0595,
"step": 292
},
{
"epoch": 19.6,
"grad_norm": 4.880768299102783,
"learning_rate": 3.851851851851852e-05,
"loss": 0.0756,
"step": 294
},
{
"epoch": 19.733333333333334,
"grad_norm": 2.175165891647339,
"learning_rate": 3.80246913580247e-05,
"loss": 0.1077,
"step": 296
},
{
"epoch": 19.866666666666667,
"grad_norm": 2.6557981967926025,
"learning_rate": 3.7530864197530867e-05,
"loss": 0.1094,
"step": 298
},
{
"epoch": 20.0,
"grad_norm": 1.2508912086486816,
"learning_rate": 3.7037037037037037e-05,
"loss": 0.0651,
"step": 300
},
{
"epoch": 20.0,
"eval_accuracy": 0.6893939393939394,
"eval_f1_macro": 0.6655257312106627,
"eval_f1_micro": 0.6893939393939394,
"eval_f1_weighted": 0.687595503348928,
"eval_loss": 1.4981398582458496,
"eval_precision_macro": 0.7114991648833447,
"eval_precision_micro": 0.6893939393939394,
"eval_precision_weighted": 0.7224498247915767,
"eval_recall_macro": 0.6510808767951625,
"eval_recall_micro": 0.6893939393939394,
"eval_recall_weighted": 0.6893939393939394,
"eval_runtime": 1.9861,
"eval_samples_per_second": 66.462,
"eval_steps_per_second": 2.517,
"step": 300
},
{
"epoch": 20.133333333333333,
"grad_norm": 5.263727188110352,
"learning_rate": 3.654320987654321e-05,
"loss": 0.075,
"step": 302
},
{
"epoch": 20.266666666666666,
"grad_norm": 4.619281768798828,
"learning_rate": 3.604938271604938e-05,
"loss": 0.1319,
"step": 304
},
{
"epoch": 20.4,
"grad_norm": 1.0995675325393677,
"learning_rate": 3.555555555555556e-05,
"loss": 0.0366,
"step": 306
},
{
"epoch": 20.533333333333335,
"grad_norm": 4.2385663986206055,
"learning_rate": 3.506172839506173e-05,
"loss": 0.1331,
"step": 308
},
{
"epoch": 20.666666666666668,
"grad_norm": 2.6913745403289795,
"learning_rate": 3.45679012345679e-05,
"loss": 0.0894,
"step": 310
},
{
"epoch": 20.8,
"grad_norm": 4.785970687866211,
"learning_rate": 3.4074074074074077e-05,
"loss": 0.0756,
"step": 312
},
{
"epoch": 20.933333333333334,
"grad_norm": 1.5702877044677734,
"learning_rate": 3.3580246913580247e-05,
"loss": 0.0685,
"step": 314
},
{
"epoch": 21.0,
"eval_accuracy": 0.6515151515151515,
"eval_f1_macro": 0.6091138915880551,
"eval_f1_micro": 0.6515151515151515,
"eval_f1_weighted": 0.6494256262321655,
"eval_loss": 1.4620611667633057,
"eval_precision_macro": 0.630280884283538,
"eval_precision_micro": 0.6515151515151515,
"eval_precision_weighted": 0.664075183502428,
"eval_recall_macro": 0.6039682539682539,
"eval_recall_micro": 0.6515151515151515,
"eval_recall_weighted": 0.6515151515151515,
"eval_runtime": 2.0276,
"eval_samples_per_second": 65.103,
"eval_steps_per_second": 2.466,
"step": 315
},
{
"epoch": 21.066666666666666,
"grad_norm": 0.6037698984146118,
"learning_rate": 3.308641975308642e-05,
"loss": 0.0537,
"step": 316
},
{
"epoch": 21.2,
"grad_norm": 0.877955436706543,
"learning_rate": 3.25925925925926e-05,
"loss": 0.0283,
"step": 318
},
{
"epoch": 21.333333333333332,
"grad_norm": 4.185865879058838,
"learning_rate": 3.209876543209876e-05,
"loss": 0.1153,
"step": 320
},
{
"epoch": 21.466666666666665,
"grad_norm": 0.7465834021568298,
"learning_rate": 3.160493827160494e-05,
"loss": 0.0311,
"step": 322
},
{
"epoch": 21.6,
"grad_norm": 1.4049850702285767,
"learning_rate": 3.111111111111111e-05,
"loss": 0.0641,
"step": 324
},
{
"epoch": 21.733333333333334,
"grad_norm": 1.6191234588623047,
"learning_rate": 3.061728395061729e-05,
"loss": 0.0417,
"step": 326
},
{
"epoch": 21.866666666666667,
"grad_norm": 1.2088876962661743,
"learning_rate": 3.012345679012346e-05,
"loss": 0.0314,
"step": 328
},
{
"epoch": 22.0,
"grad_norm": 0.7652052640914917,
"learning_rate": 2.962962962962963e-05,
"loss": 0.1469,
"step": 330
},
{
"epoch": 22.0,
"eval_accuracy": 0.6212121212121212,
"eval_f1_macro": 0.5330299221627766,
"eval_f1_micro": 0.6212121212121212,
"eval_f1_weighted": 0.604041002442862,
"eval_loss": 1.534732699394226,
"eval_precision_macro": 0.5476940619507992,
"eval_precision_micro": 0.6212121212121212,
"eval_precision_weighted": 0.6148931558944467,
"eval_recall_macro": 0.5439984882842026,
"eval_recall_micro": 0.6212121212121212,
"eval_recall_weighted": 0.6212121212121212,
"eval_runtime": 1.9747,
"eval_samples_per_second": 66.844,
"eval_steps_per_second": 2.532,
"step": 330
},
{
"epoch": 22.133333333333333,
"grad_norm": 3.304185152053833,
"learning_rate": 2.9135802469135803e-05,
"loss": 0.0456,
"step": 332
},
{
"epoch": 22.266666666666666,
"grad_norm": 2.3118255138397217,
"learning_rate": 2.8641975308641977e-05,
"loss": 0.0377,
"step": 334
},
{
"epoch": 22.4,
"grad_norm": 2.3639698028564453,
"learning_rate": 2.814814814814815e-05,
"loss": 0.0708,
"step": 336
},
{
"epoch": 22.533333333333335,
"grad_norm": 1.741746187210083,
"learning_rate": 2.765432098765432e-05,
"loss": 0.0353,
"step": 338
},
{
"epoch": 22.666666666666668,
"grad_norm": 0.6108101010322571,
"learning_rate": 2.7160493827160493e-05,
"loss": 0.0531,
"step": 340
},
{
"epoch": 22.8,
"grad_norm": 2.961045503616333,
"learning_rate": 2.6666666666666667e-05,
"loss": 0.0394,
"step": 342
},
{
"epoch": 22.933333333333334,
"grad_norm": 1.7298003435134888,
"learning_rate": 2.617283950617284e-05,
"loss": 0.0289,
"step": 344
},
{
"epoch": 23.0,
"eval_accuracy": 0.6287878787878788,
"eval_f1_macro": 0.5465784005632545,
"eval_f1_micro": 0.6287878787878788,
"eval_f1_weighted": 0.6179920372130975,
"eval_loss": 1.5417176485061646,
"eval_precision_macro": 0.5409361471861472,
"eval_precision_micro": 0.6287878787878788,
"eval_precision_weighted": 0.610816976584022,
"eval_recall_macro": 0.5549206349206349,
"eval_recall_micro": 0.6287878787878788,
"eval_recall_weighted": 0.6287878787878788,
"eval_runtime": 1.9902,
"eval_samples_per_second": 66.326,
"eval_steps_per_second": 2.512,
"step": 345
},
{
"epoch": 23.066666666666666,
"grad_norm": 0.7690654397010803,
"learning_rate": 2.5679012345679017e-05,
"loss": 0.0458,
"step": 346
},
{
"epoch": 23.2,
"grad_norm": 3.320651054382324,
"learning_rate": 2.5185185185185183e-05,
"loss": 0.0804,
"step": 348
},
{
"epoch": 23.333333333333332,
"grad_norm": 2.0301012992858887,
"learning_rate": 2.4691358024691357e-05,
"loss": 0.0279,
"step": 350
},
{
"epoch": 23.466666666666665,
"grad_norm": 0.4531901478767395,
"learning_rate": 2.4197530864197533e-05,
"loss": 0.0139,
"step": 352
},
{
"epoch": 23.6,
"grad_norm": 2.56703519821167,
"learning_rate": 2.3703703703703707e-05,
"loss": 0.0783,
"step": 354
},
{
"epoch": 23.733333333333334,
"grad_norm": 0.20635652542114258,
"learning_rate": 2.3209876543209877e-05,
"loss": 0.012,
"step": 356
},
{
"epoch": 23.866666666666667,
"grad_norm": 0.5930025577545166,
"learning_rate": 2.271604938271605e-05,
"loss": 0.0145,
"step": 358
},
{
"epoch": 24.0,
"grad_norm": 0.24041523039340973,
"learning_rate": 2.2222222222222223e-05,
"loss": 0.01,
"step": 360
},
{
"epoch": 24.0,
"eval_accuracy": 0.6363636363636364,
"eval_f1_macro": 0.5474889044983636,
"eval_f1_micro": 0.6363636363636364,
"eval_f1_weighted": 0.6187343775995573,
"eval_loss": 1.5670151710510254,
"eval_precision_macro": 0.5434552419168567,
"eval_precision_micro": 0.6363636363636364,
"eval_precision_weighted": 0.6103857259761386,
"eval_recall_macro": 0.5594179894179894,
"eval_recall_micro": 0.6363636363636364,
"eval_recall_weighted": 0.6363636363636364,
"eval_runtime": 1.9405,
"eval_samples_per_second": 68.023,
"eval_steps_per_second": 2.577,
"step": 360
},
{
"epoch": 24.133333333333333,
"grad_norm": 0.21558411419391632,
"learning_rate": 2.1728395061728397e-05,
"loss": 0.0102,
"step": 362
},
{
"epoch": 24.266666666666666,
"grad_norm": 3.2394814491271973,
"learning_rate": 2.123456790123457e-05,
"loss": 0.0218,
"step": 364
},
{
"epoch": 24.4,
"grad_norm": 3.6115405559539795,
"learning_rate": 2.074074074074074e-05,
"loss": 0.1149,
"step": 366
},
{
"epoch": 24.533333333333335,
"grad_norm": 0.1589735597372055,
"learning_rate": 2.0246913580246917e-05,
"loss": 0.0082,
"step": 368
},
{
"epoch": 24.666666666666668,
"grad_norm": 1.3840848207473755,
"learning_rate": 1.9753086419753087e-05,
"loss": 0.0174,
"step": 370
},
{
"epoch": 24.8,
"grad_norm": 3.772754192352295,
"learning_rate": 1.925925925925926e-05,
"loss": 0.043,
"step": 372
},
{
"epoch": 24.933333333333334,
"grad_norm": 0.41601723432540894,
"learning_rate": 1.8765432098765433e-05,
"loss": 0.035,
"step": 374
},
{
"epoch": 25.0,
"eval_accuracy": 0.6363636363636364,
"eval_f1_macro": 0.5529395694676043,
"eval_f1_micro": 0.6363636363636364,
"eval_f1_weighted": 0.6209326623035122,
"eval_loss": 1.6037245988845825,
"eval_precision_macro": 0.5470247238680418,
"eval_precision_micro": 0.6363636363636364,
"eval_precision_weighted": 0.6156263091746962,
"eval_recall_macro": 0.5679213907785335,
"eval_recall_micro": 0.6363636363636364,
"eval_recall_weighted": 0.6363636363636364,
"eval_runtime": 1.9551,
"eval_samples_per_second": 67.515,
"eval_steps_per_second": 2.557,
"step": 375
},
{
"epoch": 25.066666666666666,
"grad_norm": 0.4554837942123413,
"learning_rate": 1.8271604938271607e-05,
"loss": 0.0636,
"step": 376
},
{
"epoch": 25.2,
"grad_norm": 4.667645454406738,
"learning_rate": 1.777777777777778e-05,
"loss": 0.0685,
"step": 378
},
{
"epoch": 25.333333333333332,
"grad_norm": 6.68064022064209,
"learning_rate": 1.728395061728395e-05,
"loss": 0.0919,
"step": 380
},
{
"epoch": 25.466666666666665,
"grad_norm": 0.2510056793689728,
"learning_rate": 1.6790123456790123e-05,
"loss": 0.0111,
"step": 382
},
{
"epoch": 25.6,
"grad_norm": 0.6245520114898682,
"learning_rate": 1.62962962962963e-05,
"loss": 0.0134,
"step": 384
},
{
"epoch": 25.733333333333334,
"grad_norm": 2.165201187133789,
"learning_rate": 1.580246913580247e-05,
"loss": 0.0271,
"step": 386
},
{
"epoch": 25.866666666666667,
"grad_norm": 0.24112091958522797,
"learning_rate": 1.5308641975308643e-05,
"loss": 0.0105,
"step": 388
},
{
"epoch": 26.0,
"grad_norm": 0.377363383769989,
"learning_rate": 1.4814814814814815e-05,
"loss": 0.0109,
"step": 390
},
{
"epoch": 26.0,
"eval_accuracy": 0.6212121212121212,
"eval_f1_macro": 0.5896814040471776,
"eval_f1_micro": 0.6212121212121212,
"eval_f1_weighted": 0.6203213160225189,
"eval_loss": 1.6751586198806763,
"eval_precision_macro": 0.6144605795534588,
"eval_precision_micro": 0.6212121212121212,
"eval_precision_weighted": 0.6527441598649029,
"eval_recall_macro": 0.5999697656840514,
"eval_recall_micro": 0.6212121212121212,
"eval_recall_weighted": 0.6212121212121212,
"eval_runtime": 1.9656,
"eval_samples_per_second": 67.154,
"eval_steps_per_second": 2.544,
"step": 390
},
{
"epoch": 26.133333333333333,
"grad_norm": 0.3774866461753845,
"learning_rate": 1.4320987654320988e-05,
"loss": 0.0097,
"step": 392
},
{
"epoch": 26.266666666666666,
"grad_norm": 3.956695079803467,
"learning_rate": 1.382716049382716e-05,
"loss": 0.0233,
"step": 394
},
{
"epoch": 26.4,
"grad_norm": 0.5877533555030823,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.0157,
"step": 396
},
{
"epoch": 26.533333333333335,
"grad_norm": 1.2962318658828735,
"learning_rate": 1.2839506172839508e-05,
"loss": 0.0249,
"step": 398
},
{
"epoch": 26.666666666666668,
"grad_norm": 2.2431485652923584,
"learning_rate": 1.2345679012345678e-05,
"loss": 0.0224,
"step": 400
},
{
"epoch": 26.8,
"grad_norm": 0.21492817997932434,
"learning_rate": 1.1851851851851853e-05,
"loss": 0.0117,
"step": 402
},
{
"epoch": 26.933333333333334,
"grad_norm": 0.4237399697303772,
"learning_rate": 1.1358024691358025e-05,
"loss": 0.038,
"step": 404
},
{
"epoch": 27.0,
"eval_accuracy": 0.6136363636363636,
"eval_f1_macro": 0.5343822919199936,
"eval_f1_micro": 0.6136363636363636,
"eval_f1_weighted": 0.6008425380028616,
"eval_loss": 1.672375202178955,
"eval_precision_macro": 0.5331553830282576,
"eval_precision_micro": 0.6136363636363636,
"eval_precision_weighted": 0.6004965634415023,
"eval_recall_macro": 0.546832955404384,
"eval_recall_micro": 0.6136363636363636,
"eval_recall_weighted": 0.6136363636363636,
"eval_runtime": 1.9801,
"eval_samples_per_second": 66.662,
"eval_steps_per_second": 2.525,
"step": 405
},
{
"epoch": 27.066666666666666,
"grad_norm": 1.5725435018539429,
"learning_rate": 1.0864197530864198e-05,
"loss": 0.0149,
"step": 406
},
{
"epoch": 27.2,
"grad_norm": 0.13784648478031158,
"learning_rate": 1.037037037037037e-05,
"loss": 0.0092,
"step": 408
},
{
"epoch": 27.333333333333332,
"grad_norm": 0.09840863198041916,
"learning_rate": 9.876543209876543e-06,
"loss": 0.008,
"step": 410
},
{
"epoch": 27.466666666666665,
"grad_norm": 0.8349915146827698,
"learning_rate": 9.382716049382717e-06,
"loss": 0.0206,
"step": 412
},
{
"epoch": 27.6,
"grad_norm": 0.33149102330207825,
"learning_rate": 8.88888888888889e-06,
"loss": 0.0173,
"step": 414
},
{
"epoch": 27.733333333333334,
"grad_norm": 0.3867279589176178,
"learning_rate": 8.395061728395062e-06,
"loss": 0.0093,
"step": 416
},
{
"epoch": 27.866666666666667,
"grad_norm": 1.726897120475769,
"learning_rate": 7.901234567901235e-06,
"loss": 0.0214,
"step": 418
},
{
"epoch": 28.0,
"grad_norm": 0.19306233525276184,
"learning_rate": 7.4074074074074075e-06,
"loss": 0.0116,
"step": 420
},
{
"epoch": 28.0,
"eval_accuracy": 0.6212121212121212,
"eval_f1_macro": 0.5383730158730159,
"eval_f1_micro": 0.6212121212121212,
"eval_f1_weighted": 0.609029280904281,
"eval_loss": 1.6251877546310425,
"eval_precision_macro": 0.533696432596027,
"eval_precision_micro": 0.6212121212121212,
"eval_precision_weighted": 0.6033010121498966,
"eval_recall_macro": 0.5490778533635676,
"eval_recall_micro": 0.6212121212121212,
"eval_recall_weighted": 0.6212121212121212,
"eval_runtime": 1.9285,
"eval_samples_per_second": 68.448,
"eval_steps_per_second": 2.593,
"step": 420
},
{
"epoch": 28.133333333333333,
"grad_norm": 1.669783115386963,
"learning_rate": 6.91358024691358e-06,
"loss": 0.0318,
"step": 422
},
{
"epoch": 28.266666666666666,
"grad_norm": 0.6250646114349365,
"learning_rate": 6.419753086419754e-06,
"loss": 0.0195,
"step": 424
},
{
"epoch": 28.4,
"grad_norm": 0.4752732813358307,
"learning_rate": 5.925925925925927e-06,
"loss": 0.0124,
"step": 426
},
{
"epoch": 28.533333333333335,
"grad_norm": 0.16341274976730347,
"learning_rate": 5.432098765432099e-06,
"loss": 0.0065,
"step": 428
},
{
"epoch": 28.666666666666668,
"grad_norm": 0.08904340863227844,
"learning_rate": 4.938271604938272e-06,
"loss": 0.0062,
"step": 430
},
{
"epoch": 28.8,
"grad_norm": 0.24332502484321594,
"learning_rate": 4.444444444444445e-06,
"loss": 0.0055,
"step": 432
},
{
"epoch": 28.933333333333334,
"grad_norm": 0.47205692529678345,
"learning_rate": 3.9506172839506175e-06,
"loss": 0.006,
"step": 434
},
{
"epoch": 29.0,
"eval_accuracy": 0.6363636363636364,
"eval_f1_macro": 0.557191887992969,
"eval_f1_micro": 0.6363636363636364,
"eval_f1_weighted": 0.6294141170899599,
"eval_loss": 1.597952961921692,
"eval_precision_macro": 0.5529214559386972,
"eval_precision_micro": 0.6363636363636364,
"eval_precision_weighted": 0.6245954516428655,
"eval_recall_macro": 0.563363567649282,
"eval_recall_micro": 0.6363636363636364,
"eval_recall_weighted": 0.6363636363636364,
"eval_runtime": 1.9083,
"eval_samples_per_second": 69.172,
"eval_steps_per_second": 2.62,
"step": 435
},
{
"epoch": 29.066666666666666,
"grad_norm": 0.27642032504081726,
"learning_rate": 3.45679012345679e-06,
"loss": 0.0162,
"step": 436
},
{
"epoch": 29.2,
"grad_norm": 0.9449041485786438,
"learning_rate": 2.9629629629629633e-06,
"loss": 0.0088,
"step": 438
},
{
"epoch": 29.333333333333332,
"grad_norm": 0.14337310194969177,
"learning_rate": 2.469135802469136e-06,
"loss": 0.0193,
"step": 440
},
{
"epoch": 29.466666666666665,
"grad_norm": 0.17881515622138977,
"learning_rate": 1.9753086419753087e-06,
"loss": 0.0191,
"step": 442
},
{
"epoch": 29.6,
"grad_norm": 0.15386801958084106,
"learning_rate": 1.4814814814814817e-06,
"loss": 0.005,
"step": 444
},
{
"epoch": 29.733333333333334,
"grad_norm": 0.32567164301872253,
"learning_rate": 9.876543209876544e-07,
"loss": 0.0077,
"step": 446
},
{
"epoch": 29.866666666666667,
"grad_norm": 0.6249086260795593,
"learning_rate": 4.938271604938272e-07,
"loss": 0.0158,
"step": 448
},
{
"epoch": 30.0,
"grad_norm": 0.24103443324565887,
"learning_rate": 0.0,
"loss": 0.0046,
"step": 450
},
{
"epoch": 30.0,
"eval_accuracy": 0.6439393939393939,
"eval_f1_macro": 0.5605037390491809,
"eval_f1_micro": 0.6439393939393939,
"eval_f1_weighted": 0.634156085647718,
"eval_loss": 1.593876838684082,
"eval_precision_macro": 0.5545634920634921,
"eval_precision_micro": 0.6439393939393939,
"eval_precision_weighted": 0.6269465488215488,
"eval_recall_macro": 0.5686545729402873,
"eval_recall_micro": 0.6439393939393939,
"eval_recall_weighted": 0.6439393939393939,
"eval_runtime": 2.0538,
"eval_samples_per_second": 64.27,
"eval_steps_per_second": 2.434,
"step": 450
},
{
"epoch": 30.0,
"step": 450,
"total_flos": 1.0740871074163507e+18,
"train_loss": 0.6009381743893027,
"train_runtime": 318.0836,
"train_samples_per_second": 43.573,
"train_steps_per_second": 1.415
}
],
"logging_steps": 2,
"max_steps": 450,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.0740871074163507e+18,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}