{ "best_metric": 1.2568100690841675, "best_model_checkpoint": "square_run_32_batch/checkpoint-240", "epoch": 30.0, "eval_steps": 500, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.13333333333333333, "grad_norm": 6.848705291748047, "learning_rate": 4.444444444444445e-06, "loss": 2.0193, "step": 2 }, { "epoch": 0.26666666666666666, "grad_norm": 5.4228925704956055, "learning_rate": 8.88888888888889e-06, "loss": 2.1013, "step": 4 }, { "epoch": 0.4, "grad_norm": 4.7422590255737305, "learning_rate": 1.3333333333333333e-05, "loss": 1.9569, "step": 6 }, { "epoch": 0.5333333333333333, "grad_norm": 5.0756025314331055, "learning_rate": 1.777777777777778e-05, "loss": 1.9677, "step": 8 }, { "epoch": 0.6666666666666666, "grad_norm": 7.1845269203186035, "learning_rate": 2.2222222222222223e-05, "loss": 1.884, "step": 10 }, { "epoch": 0.8, "grad_norm": 4.051375865936279, "learning_rate": 2.6666666666666667e-05, "loss": 1.9017, "step": 12 }, { "epoch": 0.9333333333333333, "grad_norm": 5.293440341949463, "learning_rate": 3.111111111111111e-05, "loss": 1.9373, "step": 14 }, { "epoch": 1.0, "eval_accuracy": 0.1893939393939394, "eval_f1_macro": 0.04638218923933209, "eval_f1_micro": 0.1893939393939394, "eval_f1_weighted": 0.06149153876426603, "eval_loss": 1.8818118572235107, "eval_precision_macro": 0.027685492801771874, "eval_precision_micro": 0.1893939393939394, "eval_precision_weighted": 0.03670425182053089, "eval_recall_macro": 0.14285714285714285, "eval_recall_micro": 0.1893939393939394, "eval_recall_weighted": 0.1893939393939394, "eval_runtime": 2.2684, "eval_samples_per_second": 58.192, "eval_steps_per_second": 2.204, "step": 15 }, { "epoch": 1.0666666666666667, "grad_norm": 3.4230170249938965, "learning_rate": 3.555555555555556e-05, "loss": 1.9179, "step": 16 }, { "epoch": 1.2, "grad_norm": 3.4030396938323975, "learning_rate": 4e-05, "loss": 1.8139, "step": 18 }, { "epoch": 1.3333333333333333, "grad_norm": 4.195278167724609, "learning_rate": 4.4444444444444447e-05, "loss": 1.8941, "step": 20 }, { "epoch": 1.4666666666666668, "grad_norm": 3.2356927394866943, "learning_rate": 4.888888888888889e-05, "loss": 1.8915, "step": 22 }, { "epoch": 1.6, "grad_norm": 3.322704315185547, "learning_rate": 5.333333333333333e-05, "loss": 1.917, "step": 24 }, { "epoch": 1.7333333333333334, "grad_norm": 3.293910026550293, "learning_rate": 5.7777777777777776e-05, "loss": 1.8943, "step": 26 }, { "epoch": 1.8666666666666667, "grad_norm": 4.803905487060547, "learning_rate": 6.222222222222222e-05, "loss": 1.8841, "step": 28 }, { "epoch": 2.0, "grad_norm": 4.006722927093506, "learning_rate": 6.666666666666667e-05, "loss": 1.869, "step": 30 }, { "epoch": 2.0, "eval_accuracy": 0.26515151515151514, "eval_f1_macro": 0.10998877665544333, "eval_f1_micro": 0.26515151515151514, "eval_f1_weighted": 0.14177124783185388, "eval_loss": 1.864223599433899, "eval_precision_macro": 0.075, "eval_precision_micro": 0.26515151515151514, "eval_precision_weighted": 0.09678030303030305, "eval_recall_macro": 0.20634920634920634, "eval_recall_micro": 0.26515151515151514, "eval_recall_weighted": 0.26515151515151514, "eval_runtime": 1.8754, "eval_samples_per_second": 70.384, "eval_steps_per_second": 2.666, "step": 30 }, { "epoch": 2.1333333333333333, "grad_norm": 2.15875506401062, "learning_rate": 7.111111111111112e-05, "loss": 1.8686, "step": 32 }, { "epoch": 2.2666666666666666, "grad_norm": 2.8864150047302246, "learning_rate": 7.555555555555556e-05, "loss": 1.8652, "step": 34 }, { "epoch": 2.4, "grad_norm": 3.819974899291992, "learning_rate": 8e-05, "loss": 1.818, "step": 36 }, { "epoch": 2.533333333333333, "grad_norm": 7.35491418838501, "learning_rate": 8.444444444444444e-05, "loss": 1.9347, "step": 38 }, { "epoch": 2.6666666666666665, "grad_norm": 5.84605598449707, "learning_rate": 8.888888888888889e-05, "loss": 1.8508, "step": 40 }, { "epoch": 2.8, "grad_norm": 2.4050137996673584, "learning_rate": 9.333333333333334e-05, "loss": 1.8884, "step": 42 }, { "epoch": 2.9333333333333336, "grad_norm": 5.182938575744629, "learning_rate": 9.777777777777778e-05, "loss": 1.9218, "step": 44 }, { "epoch": 3.0, "eval_accuracy": 0.25757575757575757, "eval_f1_macro": 0.11628985865833667, "eval_f1_micro": 0.25757575757575757, "eval_f1_weighted": 0.1459780747505363, "eval_loss": 1.8754385709762573, "eval_precision_macro": 0.131615925058548, "eval_precision_micro": 0.25757575757575757, "eval_precision_weighted": 0.15663189269746647, "eval_recall_macro": 0.19047619047619047, "eval_recall_micro": 0.25757575757575757, "eval_recall_weighted": 0.25757575757575757, "eval_runtime": 1.9051, "eval_samples_per_second": 69.286, "eval_steps_per_second": 2.624, "step": 45 }, { "epoch": 3.066666666666667, "grad_norm": 5.7963128089904785, "learning_rate": 9.97530864197531e-05, "loss": 1.9754, "step": 46 }, { "epoch": 3.2, "grad_norm": 5.1605048179626465, "learning_rate": 9.925925925925926e-05, "loss": 1.8577, "step": 48 }, { "epoch": 3.3333333333333335, "grad_norm": 3.756355047225952, "learning_rate": 9.876543209876543e-05, "loss": 1.8707, "step": 50 }, { "epoch": 3.466666666666667, "grad_norm": 2.5353710651397705, "learning_rate": 9.827160493827162e-05, "loss": 1.7918, "step": 52 }, { "epoch": 3.6, "grad_norm": 4.181753635406494, "learning_rate": 9.777777777777778e-05, "loss": 1.8251, "step": 54 }, { "epoch": 3.7333333333333334, "grad_norm": 2.4634644985198975, "learning_rate": 9.728395061728396e-05, "loss": 1.7713, "step": 56 }, { "epoch": 3.8666666666666667, "grad_norm": 8.700553894042969, "learning_rate": 9.679012345679013e-05, "loss": 1.8962, "step": 58 }, { "epoch": 4.0, "grad_norm": 5.921916484832764, "learning_rate": 9.62962962962963e-05, "loss": 1.6733, "step": 60 }, { "epoch": 4.0, "eval_accuracy": 0.38636363636363635, "eval_f1_macro": 0.2445293836598184, "eval_f1_micro": 0.38636363636363635, "eval_f1_weighted": 0.3052538765582244, "eval_loss": 1.6881486177444458, "eval_precision_macro": 0.24274221103966703, "eval_precision_micro": 0.38636363636363635, "eval_precision_weighted": 0.2917426054412356, "eval_recall_macro": 0.2992441421012849, "eval_recall_micro": 0.38636363636363635, "eval_recall_weighted": 0.38636363636363635, "eval_runtime": 1.8719, "eval_samples_per_second": 70.516, "eval_steps_per_second": 2.671, "step": 60 }, { "epoch": 4.133333333333334, "grad_norm": 3.3924758434295654, "learning_rate": 9.580246913580247e-05, "loss": 1.5941, "step": 62 }, { "epoch": 4.266666666666667, "grad_norm": 6.785348415374756, "learning_rate": 9.530864197530865e-05, "loss": 1.582, "step": 64 }, { "epoch": 4.4, "grad_norm": 4.813143730163574, "learning_rate": 9.481481481481483e-05, "loss": 1.649, "step": 66 }, { "epoch": 4.533333333333333, "grad_norm": 5.351255893707275, "learning_rate": 9.432098765432099e-05, "loss": 1.6271, "step": 68 }, { "epoch": 4.666666666666667, "grad_norm": 11.194862365722656, "learning_rate": 9.382716049382717e-05, "loss": 1.7395, "step": 70 }, { "epoch": 4.8, "grad_norm": 6.263866424560547, "learning_rate": 9.333333333333334e-05, "loss": 1.4422, "step": 72 }, { "epoch": 4.933333333333334, "grad_norm": 8.602386474609375, "learning_rate": 9.28395061728395e-05, "loss": 1.54, "step": 74 }, { "epoch": 5.0, "eval_accuracy": 0.42424242424242425, "eval_f1_macro": 0.32515713851372885, "eval_f1_micro": 0.42424242424242425, "eval_f1_weighted": 0.38558697740383735, "eval_loss": 1.5528110265731812, "eval_precision_macro": 0.34291374508765815, "eval_precision_micro": 0.42424242424242425, "eval_precision_weighted": 0.4100833883442579, "eval_recall_macro": 0.35698412698412696, "eval_recall_micro": 0.42424242424242425, "eval_recall_weighted": 0.42424242424242425, "eval_runtime": 1.9443, "eval_samples_per_second": 67.891, "eval_steps_per_second": 2.572, "step": 75 }, { "epoch": 5.066666666666666, "grad_norm": 7.738183498382568, "learning_rate": 9.234567901234568e-05, "loss": 1.6152, "step": 76 }, { "epoch": 5.2, "grad_norm": 7.564102649688721, "learning_rate": 9.185185185185186e-05, "loss": 1.4993, "step": 78 }, { "epoch": 5.333333333333333, "grad_norm": 8.335043907165527, "learning_rate": 9.135802469135802e-05, "loss": 1.494, "step": 80 }, { "epoch": 5.466666666666667, "grad_norm": 6.382967948913574, "learning_rate": 9.08641975308642e-05, "loss": 1.4944, "step": 82 }, { "epoch": 5.6, "grad_norm": 7.259094715118408, "learning_rate": 9.037037037037038e-05, "loss": 1.3191, "step": 84 }, { "epoch": 5.733333333333333, "grad_norm": 4.972009658813477, "learning_rate": 8.987654320987655e-05, "loss": 1.3894, "step": 86 }, { "epoch": 5.866666666666667, "grad_norm": 9.250694274902344, "learning_rate": 8.938271604938272e-05, "loss": 1.3979, "step": 88 }, { "epoch": 6.0, "grad_norm": 7.207069396972656, "learning_rate": 8.888888888888889e-05, "loss": 1.4418, "step": 90 }, { "epoch": 6.0, "eval_accuracy": 0.38636363636363635, "eval_f1_macro": 0.285838283865586, "eval_f1_micro": 0.38636363636363635, "eval_f1_weighted": 0.3212562379097532, "eval_loss": 1.5736558437347412, "eval_precision_macro": 0.284608858206039, "eval_precision_micro": 0.38636363636363635, "eval_precision_weighted": 0.3242631096693543, "eval_recall_macro": 0.3398034769463341, "eval_recall_micro": 0.38636363636363635, "eval_recall_weighted": 0.38636363636363635, "eval_runtime": 1.8593, "eval_samples_per_second": 70.995, "eval_steps_per_second": 2.689, "step": 90 }, { "epoch": 6.133333333333334, "grad_norm": 4.753687381744385, "learning_rate": 8.839506172839507e-05, "loss": 1.3218, "step": 92 }, { "epoch": 6.266666666666667, "grad_norm": 5.942229747772217, "learning_rate": 8.790123456790123e-05, "loss": 1.3995, "step": 94 }, { "epoch": 6.4, "grad_norm": 4.026015281677246, "learning_rate": 8.740740740740741e-05, "loss": 1.3155, "step": 96 }, { "epoch": 6.533333333333333, "grad_norm": 4.893887042999268, "learning_rate": 8.691358024691359e-05, "loss": 1.2009, "step": 98 }, { "epoch": 6.666666666666667, "grad_norm": 3.904926061630249, "learning_rate": 8.641975308641975e-05, "loss": 0.98, "step": 100 }, { "epoch": 6.8, "grad_norm": 4.266864776611328, "learning_rate": 8.592592592592593e-05, "loss": 1.0803, "step": 102 }, { "epoch": 6.933333333333334, "grad_norm": 5.24403190612793, "learning_rate": 8.54320987654321e-05, "loss": 0.8592, "step": 104 }, { "epoch": 7.0, "eval_accuracy": 0.4393939393939394, "eval_f1_macro": 0.3443599467808913, "eval_f1_micro": 0.4393939393939394, "eval_f1_weighted": 0.39645908811500513, "eval_loss": 1.5408130884170532, "eval_precision_macro": 0.32083233878346656, "eval_precision_micro": 0.4393939393939394, "eval_precision_weighted": 0.36735850041771095, "eval_recall_macro": 0.37913832199546477, "eval_recall_micro": 0.4393939393939394, "eval_recall_weighted": 0.4393939393939394, "eval_runtime": 1.9485, "eval_samples_per_second": 67.743, "eval_steps_per_second": 2.566, "step": 105 }, { "epoch": 7.066666666666666, "grad_norm": 5.595825672149658, "learning_rate": 8.493827160493828e-05, "loss": 1.0203, "step": 106 }, { "epoch": 7.2, "grad_norm": 5.34617805480957, "learning_rate": 8.444444444444444e-05, "loss": 1.0819, "step": 108 }, { "epoch": 7.333333333333333, "grad_norm": 6.987905025482178, "learning_rate": 8.395061728395062e-05, "loss": 1.1165, "step": 110 }, { "epoch": 7.466666666666667, "grad_norm": 6.039572715759277, "learning_rate": 8.34567901234568e-05, "loss": 1.0403, "step": 112 }, { "epoch": 7.6, "grad_norm": 6.031858444213867, "learning_rate": 8.296296296296296e-05, "loss": 0.9709, "step": 114 }, { "epoch": 7.733333333333333, "grad_norm": 6.656283855438232, "learning_rate": 8.246913580246915e-05, "loss": 0.8358, "step": 116 }, { "epoch": 7.866666666666667, "grad_norm": 6.286685943603516, "learning_rate": 8.197530864197531e-05, "loss": 1.146, "step": 118 }, { "epoch": 8.0, "grad_norm": 9.892986297607422, "learning_rate": 8.148148148148148e-05, "loss": 1.1427, "step": 120 }, { "epoch": 8.0, "eval_accuracy": 0.5606060606060606, "eval_f1_macro": 0.46377203827822905, "eval_f1_micro": 0.5606060606060606, "eval_f1_weighted": 0.5317054176401353, "eval_loss": 1.2803829908370972, "eval_precision_macro": 0.469819473380193, "eval_precision_micro": 0.5606060606060606, "eval_precision_weighted": 0.5280005916463256, "eval_recall_macro": 0.4830687830687831, "eval_recall_micro": 0.5606060606060606, "eval_recall_weighted": 0.5606060606060606, "eval_runtime": 1.9474, "eval_samples_per_second": 67.784, "eval_steps_per_second": 2.568, "step": 120 }, { "epoch": 8.133333333333333, "grad_norm": 4.904130458831787, "learning_rate": 8.098765432098767e-05, "loss": 0.8933, "step": 122 }, { "epoch": 8.266666666666667, "grad_norm": 4.419686794281006, "learning_rate": 8.049382716049383e-05, "loss": 0.9245, "step": 124 }, { "epoch": 8.4, "grad_norm": 8.33668041229248, "learning_rate": 8e-05, "loss": 0.8385, "step": 126 }, { "epoch": 8.533333333333333, "grad_norm": 8.35203742980957, "learning_rate": 7.950617283950618e-05, "loss": 0.9428, "step": 128 }, { "epoch": 8.666666666666666, "grad_norm": 5.724539279937744, "learning_rate": 7.901234567901235e-05, "loss": 0.7591, "step": 130 }, { "epoch": 8.8, "grad_norm": 8.662413597106934, "learning_rate": 7.851851851851852e-05, "loss": 0.995, "step": 132 }, { "epoch": 8.933333333333334, "grad_norm": 5.197690010070801, "learning_rate": 7.802469135802469e-05, "loss": 0.7849, "step": 134 }, { "epoch": 9.0, "eval_accuracy": 0.553030303030303, "eval_f1_macro": 0.46486536691732006, "eval_f1_micro": 0.553030303030303, "eval_f1_weighted": 0.529141811901771, "eval_loss": 1.2879999876022339, "eval_precision_macro": 0.48036078903674717, "eval_precision_micro": 0.553030303030303, "eval_precision_weighted": 0.540143107077697, "eval_recall_macro": 0.4822675736961451, "eval_recall_micro": 0.553030303030303, "eval_recall_weighted": 0.553030303030303, "eval_runtime": 1.9912, "eval_samples_per_second": 66.292, "eval_steps_per_second": 2.511, "step": 135 }, { "epoch": 9.066666666666666, "grad_norm": 7.03670597076416, "learning_rate": 7.753086419753088e-05, "loss": 0.8049, "step": 136 }, { "epoch": 9.2, "grad_norm": 5.591729640960693, "learning_rate": 7.703703703703704e-05, "loss": 0.9341, "step": 138 }, { "epoch": 9.333333333333334, "grad_norm": 6.677962303161621, "learning_rate": 7.65432098765432e-05, "loss": 0.7679, "step": 140 }, { "epoch": 9.466666666666667, "grad_norm": 5.4789934158325195, "learning_rate": 7.60493827160494e-05, "loss": 0.7773, "step": 142 }, { "epoch": 9.6, "grad_norm": 5.957266330718994, "learning_rate": 7.555555555555556e-05, "loss": 0.638, "step": 144 }, { "epoch": 9.733333333333333, "grad_norm": 5.691118240356445, "learning_rate": 7.506172839506173e-05, "loss": 0.7762, "step": 146 }, { "epoch": 9.866666666666667, "grad_norm": 6.8899827003479, "learning_rate": 7.456790123456791e-05, "loss": 0.9012, "step": 148 }, { "epoch": 10.0, "grad_norm": 7.408969402313232, "learning_rate": 7.407407407407407e-05, "loss": 0.6846, "step": 150 }, { "epoch": 10.0, "eval_accuracy": 0.5151515151515151, "eval_f1_macro": 0.42983280392444223, "eval_f1_micro": 0.5151515151515151, "eval_f1_weighted": 0.48105498068393227, "eval_loss": 1.3130199909210205, "eval_precision_macro": 0.4404005812415951, "eval_precision_micro": 0.5151515151515151, "eval_precision_weighted": 0.5005354338015628, "eval_recall_macro": 0.4670748299319728, "eval_recall_micro": 0.5151515151515151, "eval_recall_weighted": 0.5151515151515151, "eval_runtime": 1.9855, "eval_samples_per_second": 66.482, "eval_steps_per_second": 2.518, "step": 150 }, { "epoch": 10.133333333333333, "grad_norm": 5.070552825927734, "learning_rate": 7.358024691358025e-05, "loss": 0.6116, "step": 152 }, { "epoch": 10.266666666666667, "grad_norm": 4.844223499298096, "learning_rate": 7.308641975308643e-05, "loss": 0.6517, "step": 154 }, { "epoch": 10.4, "grad_norm": 3.965522289276123, "learning_rate": 7.25925925925926e-05, "loss": 0.5573, "step": 156 }, { "epoch": 10.533333333333333, "grad_norm": 7.53262996673584, "learning_rate": 7.209876543209877e-05, "loss": 0.7258, "step": 158 }, { "epoch": 10.666666666666666, "grad_norm": 6.725161552429199, "learning_rate": 7.160493827160494e-05, "loss": 0.8109, "step": 160 }, { "epoch": 10.8, "grad_norm": 8.250865936279297, "learning_rate": 7.111111111111112e-05, "loss": 0.8596, "step": 162 }, { "epoch": 10.933333333333334, "grad_norm": 4.163515567779541, "learning_rate": 7.061728395061728e-05, "loss": 0.4006, "step": 164 }, { "epoch": 11.0, "eval_accuracy": 0.5833333333333334, "eval_f1_macro": 0.49308835780529725, "eval_f1_micro": 0.5833333333333334, "eval_f1_weighted": 0.5597960736751899, "eval_loss": 1.295769214630127, "eval_precision_macro": 0.498317425896604, "eval_precision_micro": 0.5833333333333334, "eval_precision_weighted": 0.5756076561299337, "eval_recall_macro": 0.5229024943310657, "eval_recall_micro": 0.5833333333333334, "eval_recall_weighted": 0.5833333333333334, "eval_runtime": 1.9133, "eval_samples_per_second": 68.991, "eval_steps_per_second": 2.613, "step": 165 }, { "epoch": 11.066666666666666, "grad_norm": 4.829576015472412, "learning_rate": 7.012345679012346e-05, "loss": 0.6355, "step": 166 }, { "epoch": 11.2, "grad_norm": 5.353898525238037, "learning_rate": 6.962962962962964e-05, "loss": 0.4955, "step": 168 }, { "epoch": 11.333333333333334, "grad_norm": 5.44912052154541, "learning_rate": 6.91358024691358e-05, "loss": 0.4833, "step": 170 }, { "epoch": 11.466666666666667, "grad_norm": 5.900742530822754, "learning_rate": 6.864197530864198e-05, "loss": 0.5752, "step": 172 }, { "epoch": 11.6, "grad_norm": 6.004303455352783, "learning_rate": 6.814814814814815e-05, "loss": 0.5738, "step": 174 }, { "epoch": 11.733333333333333, "grad_norm": 3.937319040298462, "learning_rate": 6.765432098765433e-05, "loss": 0.4661, "step": 176 }, { "epoch": 11.866666666666667, "grad_norm": 4.814683437347412, "learning_rate": 6.716049382716049e-05, "loss": 0.5694, "step": 178 }, { "epoch": 12.0, "grad_norm": 6.7769880294799805, "learning_rate": 6.666666666666667e-05, "loss": 0.4329, "step": 180 }, { "epoch": 12.0, "eval_accuracy": 0.553030303030303, "eval_f1_macro": 0.506246746427407, "eval_f1_micro": 0.553030303030303, "eval_f1_weighted": 0.5561970515744254, "eval_loss": 1.299007773399353, "eval_precision_macro": 0.5314684490530354, "eval_precision_micro": 0.553030303030303, "eval_precision_weighted": 0.5874290165244113, "eval_recall_macro": 0.5133106575963718, "eval_recall_micro": 0.553030303030303, "eval_recall_weighted": 0.553030303030303, "eval_runtime": 2.0372, "eval_samples_per_second": 64.793, "eval_steps_per_second": 2.454, "step": 180 }, { "epoch": 12.133333333333333, "grad_norm": 5.787886619567871, "learning_rate": 6.617283950617285e-05, "loss": 0.5719, "step": 182 }, { "epoch": 12.266666666666667, "grad_norm": 2.843268632888794, "learning_rate": 6.567901234567901e-05, "loss": 0.4646, "step": 184 }, { "epoch": 12.4, "grad_norm": 4.530274391174316, "learning_rate": 6.51851851851852e-05, "loss": 0.3544, "step": 186 }, { "epoch": 12.533333333333333, "grad_norm": 5.348933696746826, "learning_rate": 6.469135802469136e-05, "loss": 0.3957, "step": 188 }, { "epoch": 12.666666666666666, "grad_norm": 7.746328830718994, "learning_rate": 6.419753086419753e-05, "loss": 0.4989, "step": 190 }, { "epoch": 12.8, "grad_norm": 6.134746074676514, "learning_rate": 6.37037037037037e-05, "loss": 0.7035, "step": 192 }, { "epoch": 12.933333333333334, "grad_norm": 5.567310810089111, "learning_rate": 6.320987654320988e-05, "loss": 0.482, "step": 194 }, { "epoch": 13.0, "eval_accuracy": 0.5151515151515151, "eval_f1_macro": 0.4842067834885892, "eval_f1_micro": 0.5151515151515151, "eval_f1_weighted": 0.5233183119383529, "eval_loss": 1.3830989599227905, "eval_precision_macro": 0.5517290249433106, "eval_precision_micro": 0.5151515151515151, "eval_precision_weighted": 0.5803270803270804, "eval_recall_macro": 0.48390778533635675, "eval_recall_micro": 0.5151515151515151, "eval_recall_weighted": 0.5151515151515151, "eval_runtime": 2.882, "eval_samples_per_second": 45.801, "eval_steps_per_second": 1.735, "step": 195 }, { "epoch": 13.066666666666666, "grad_norm": 6.7704386711120605, "learning_rate": 6.271604938271606e-05, "loss": 0.5136, "step": 196 }, { "epoch": 13.2, "grad_norm": 5.41668701171875, "learning_rate": 6.222222222222222e-05, "loss": 0.4843, "step": 198 }, { "epoch": 13.333333333333334, "grad_norm": 4.7562150955200195, "learning_rate": 6.17283950617284e-05, "loss": 0.3338, "step": 200 }, { "epoch": 13.466666666666667, "grad_norm": 4.077147960662842, "learning_rate": 6.123456790123457e-05, "loss": 0.2694, "step": 202 }, { "epoch": 13.6, "grad_norm": 4.678223609924316, "learning_rate": 6.074074074074074e-05, "loss": 0.2965, "step": 204 }, { "epoch": 13.733333333333333, "grad_norm": 6.246657371520996, "learning_rate": 6.024691358024692e-05, "loss": 0.489, "step": 206 }, { "epoch": 13.866666666666667, "grad_norm": 4.0403971672058105, "learning_rate": 5.975308641975309e-05, "loss": 0.3524, "step": 208 }, { "epoch": 14.0, "grad_norm": 11.723469734191895, "learning_rate": 5.925925925925926e-05, "loss": 0.6409, "step": 210 }, { "epoch": 14.0, "eval_accuracy": 0.5984848484848485, "eval_f1_macro": 0.5080833548412379, "eval_f1_micro": 0.5984848484848485, "eval_f1_weighted": 0.576454835403795, "eval_loss": 1.4066194295883179, "eval_precision_macro": 0.5193577256077255, "eval_precision_micro": 0.5984848484848485, "eval_precision_weighted": 0.5819911307127215, "eval_recall_macro": 0.5231594860166289, "eval_recall_micro": 0.5984848484848485, "eval_recall_weighted": 0.5984848484848485, "eval_runtime": 4.8101, "eval_samples_per_second": 27.442, "eval_steps_per_second": 1.039, "step": 210 }, { "epoch": 14.133333333333333, "grad_norm": 4.278630256652832, "learning_rate": 5.8765432098765437e-05, "loss": 0.1963, "step": 212 }, { "epoch": 14.266666666666667, "grad_norm": 5.803009510040283, "learning_rate": 5.8271604938271607e-05, "loss": 0.4284, "step": 214 }, { "epoch": 14.4, "grad_norm": 4.886916160583496, "learning_rate": 5.7777777777777776e-05, "loss": 0.3091, "step": 216 }, { "epoch": 14.533333333333333, "grad_norm": 6.119672775268555, "learning_rate": 5.728395061728395e-05, "loss": 0.3287, "step": 218 }, { "epoch": 14.666666666666666, "grad_norm": 7.14682149887085, "learning_rate": 5.679012345679012e-05, "loss": 0.2819, "step": 220 }, { "epoch": 14.8, "grad_norm": 5.075103282928467, "learning_rate": 5.62962962962963e-05, "loss": 0.2101, "step": 222 }, { "epoch": 14.933333333333334, "grad_norm": 4.5539045333862305, "learning_rate": 5.580246913580247e-05, "loss": 0.3206, "step": 224 }, { "epoch": 15.0, "eval_accuracy": 0.5606060606060606, "eval_f1_macro": 0.5154896879386676, "eval_f1_micro": 0.5606060606060606, "eval_f1_weighted": 0.5520090359376074, "eval_loss": 1.3689966201782227, "eval_precision_macro": 0.6158199643493761, "eval_precision_micro": 0.5606060606060606, "eval_precision_weighted": 0.5889932074758278, "eval_recall_macro": 0.5170219198790628, "eval_recall_micro": 0.5606060606060606, "eval_recall_weighted": 0.5606060606060606, "eval_runtime": 2.0949, "eval_samples_per_second": 63.009, "eval_steps_per_second": 2.387, "step": 225 }, { "epoch": 15.066666666666666, "grad_norm": 4.093947887420654, "learning_rate": 5.530864197530864e-05, "loss": 0.3352, "step": 226 }, { "epoch": 15.2, "grad_norm": 5.242745876312256, "learning_rate": 5.4814814814814817e-05, "loss": 0.2066, "step": 228 }, { "epoch": 15.333333333333334, "grad_norm": 5.613947868347168, "learning_rate": 5.4320987654320986e-05, "loss": 0.3504, "step": 230 }, { "epoch": 15.466666666666667, "grad_norm": 3.4319839477539062, "learning_rate": 5.382716049382717e-05, "loss": 0.2294, "step": 232 }, { "epoch": 15.6, "grad_norm": 6.01231575012207, "learning_rate": 5.333333333333333e-05, "loss": 0.2498, "step": 234 }, { "epoch": 15.733333333333333, "grad_norm": 3.9071357250213623, "learning_rate": 5.28395061728395e-05, "loss": 0.2092, "step": 236 }, { "epoch": 15.866666666666667, "grad_norm": 5.718769550323486, "learning_rate": 5.234567901234568e-05, "loss": 0.2223, "step": 238 }, { "epoch": 16.0, "grad_norm": 4.071746349334717, "learning_rate": 5.185185185185185e-05, "loss": 0.1773, "step": 240 }, { "epoch": 16.0, "eval_accuracy": 0.6515151515151515, "eval_f1_macro": 0.592019301793738, "eval_f1_micro": 0.6515151515151515, "eval_f1_weighted": 0.6407837434153223, "eval_loss": 1.2568100690841675, "eval_precision_macro": 0.6893528941196284, "eval_precision_micro": 0.6515151515151515, "eval_precision_weighted": 0.6623135907365115, "eval_recall_macro": 0.5842857142857143, "eval_recall_micro": 0.6515151515151515, "eval_recall_weighted": 0.6515151515151515, "eval_runtime": 1.9927, "eval_samples_per_second": 66.24, "eval_steps_per_second": 2.509, "step": 240 }, { "epoch": 16.133333333333333, "grad_norm": 3.561516761779785, "learning_rate": 5.135802469135803e-05, "loss": 0.1696, "step": 242 }, { "epoch": 16.266666666666666, "grad_norm": 1.3526779413223267, "learning_rate": 5.0864197530864197e-05, "loss": 0.0665, "step": 244 }, { "epoch": 16.4, "grad_norm": 4.29080057144165, "learning_rate": 5.0370370370370366e-05, "loss": 0.195, "step": 246 }, { "epoch": 16.533333333333335, "grad_norm": 6.229769706726074, "learning_rate": 4.987654320987655e-05, "loss": 0.2993, "step": 248 }, { "epoch": 16.666666666666668, "grad_norm": 4.949665546417236, "learning_rate": 4.938271604938271e-05, "loss": 0.2081, "step": 250 }, { "epoch": 16.8, "grad_norm": 6.123852252960205, "learning_rate": 4.888888888888889e-05, "loss": 0.212, "step": 252 }, { "epoch": 16.933333333333334, "grad_norm": 4.0239105224609375, "learning_rate": 4.8395061728395067e-05, "loss": 0.3259, "step": 254 }, { "epoch": 17.0, "eval_accuracy": 0.6060606060606061, "eval_f1_macro": 0.5467242234296787, "eval_f1_micro": 0.6060606060606061, "eval_f1_weighted": 0.5961390083174005, "eval_loss": 1.3405537605285645, "eval_precision_macro": 0.5614736217067472, "eval_precision_micro": 0.6060606060606061, "eval_precision_weighted": 0.6033042542530208, "eval_recall_macro": 0.5466817838246409, "eval_recall_micro": 0.6060606060606061, "eval_recall_weighted": 0.6060606060606061, "eval_runtime": 2.0502, "eval_samples_per_second": 64.382, "eval_steps_per_second": 2.439, "step": 255 }, { "epoch": 17.066666666666666, "grad_norm": 1.4321271181106567, "learning_rate": 4.7901234567901237e-05, "loss": 0.055, "step": 256 }, { "epoch": 17.2, "grad_norm": 2.13454008102417, "learning_rate": 4.740740740740741e-05, "loss": 0.1221, "step": 258 }, { "epoch": 17.333333333333332, "grad_norm": 5.276524066925049, "learning_rate": 4.691358024691358e-05, "loss": 0.1417, "step": 260 }, { "epoch": 17.466666666666665, "grad_norm": 3.8555052280426025, "learning_rate": 4.641975308641975e-05, "loss": 0.2943, "step": 262 }, { "epoch": 17.6, "grad_norm": 4.094534873962402, "learning_rate": 4.592592592592593e-05, "loss": 0.2206, "step": 264 }, { "epoch": 17.733333333333334, "grad_norm": 4.184159278869629, "learning_rate": 4.54320987654321e-05, "loss": 0.1565, "step": 266 }, { "epoch": 17.866666666666667, "grad_norm": 5.283144474029541, "learning_rate": 4.493827160493828e-05, "loss": 0.1427, "step": 268 }, { "epoch": 18.0, "grad_norm": 3.6470813751220703, "learning_rate": 4.4444444444444447e-05, "loss": 0.1123, "step": 270 }, { "epoch": 18.0, "eval_accuracy": 0.6363636363636364, "eval_f1_macro": 0.5867719657675725, "eval_f1_micro": 0.6363636363636364, "eval_f1_weighted": 0.6305501232595613, "eval_loss": 1.376707911491394, "eval_precision_macro": 0.6257631257631259, "eval_precision_micro": 0.6363636363636364, "eval_precision_weighted": 0.6413447663447664, "eval_recall_macro": 0.5785109599395314, "eval_recall_micro": 0.6363636363636364, "eval_recall_weighted": 0.6363636363636364, "eval_runtime": 1.992, "eval_samples_per_second": 66.266, "eval_steps_per_second": 2.51, "step": 270 }, { "epoch": 18.133333333333333, "grad_norm": 3.1710643768310547, "learning_rate": 4.3950617283950617e-05, "loss": 0.1219, "step": 272 }, { "epoch": 18.266666666666666, "grad_norm": 7.098196506500244, "learning_rate": 4.345679012345679e-05, "loss": 0.1588, "step": 274 }, { "epoch": 18.4, "grad_norm": 1.8567241430282593, "learning_rate": 4.296296296296296e-05, "loss": 0.1043, "step": 276 }, { "epoch": 18.533333333333335, "grad_norm": 2.1221156120300293, "learning_rate": 4.246913580246914e-05, "loss": 0.0748, "step": 278 }, { "epoch": 18.666666666666668, "grad_norm": 3.03196120262146, "learning_rate": 4.197530864197531e-05, "loss": 0.1148, "step": 280 }, { "epoch": 18.8, "grad_norm": 1.7942876815795898, "learning_rate": 4.148148148148148e-05, "loss": 0.0679, "step": 282 }, { "epoch": 18.933333333333334, "grad_norm": 4.499013900756836, "learning_rate": 4.0987654320987657e-05, "loss": 0.1129, "step": 284 }, { "epoch": 19.0, "eval_accuracy": 0.6439393939393939, "eval_f1_macro": 0.587916778045086, "eval_f1_micro": 0.6439393939393939, "eval_f1_weighted": 0.6305576751206262, "eval_loss": 1.4679865837097168, "eval_precision_macro": 0.6809288563910413, "eval_precision_micro": 0.6439393939393939, "eval_precision_weighted": 0.6932697872537444, "eval_recall_macro": 0.5806046863189721, "eval_recall_micro": 0.6439393939393939, "eval_recall_weighted": 0.6439393939393939, "eval_runtime": 1.9847, "eval_samples_per_second": 66.508, "eval_steps_per_second": 2.519, "step": 285 }, { "epoch": 19.066666666666666, "grad_norm": 2.631176233291626, "learning_rate": 4.049382716049383e-05, "loss": 0.1028, "step": 286 }, { "epoch": 19.2, "grad_norm": 4.930914402008057, "learning_rate": 4e-05, "loss": 0.2555, "step": 288 }, { "epoch": 19.333333333333332, "grad_norm": 3.355149745941162, "learning_rate": 3.950617283950617e-05, "loss": 0.0792, "step": 290 }, { "epoch": 19.466666666666665, "grad_norm": 2.2780933380126953, "learning_rate": 3.901234567901234e-05, "loss": 0.0595, "step": 292 }, { "epoch": 19.6, "grad_norm": 4.880768299102783, "learning_rate": 3.851851851851852e-05, "loss": 0.0756, "step": 294 }, { "epoch": 19.733333333333334, "grad_norm": 2.175165891647339, "learning_rate": 3.80246913580247e-05, "loss": 0.1077, "step": 296 }, { "epoch": 19.866666666666667, "grad_norm": 2.6557981967926025, "learning_rate": 3.7530864197530867e-05, "loss": 0.1094, "step": 298 }, { "epoch": 20.0, "grad_norm": 1.2508912086486816, "learning_rate": 3.7037037037037037e-05, "loss": 0.0651, "step": 300 }, { "epoch": 20.0, "eval_accuracy": 0.6893939393939394, "eval_f1_macro": 0.6655257312106627, "eval_f1_micro": 0.6893939393939394, "eval_f1_weighted": 0.687595503348928, "eval_loss": 1.4981398582458496, "eval_precision_macro": 0.7114991648833447, "eval_precision_micro": 0.6893939393939394, "eval_precision_weighted": 0.7224498247915767, "eval_recall_macro": 0.6510808767951625, "eval_recall_micro": 0.6893939393939394, "eval_recall_weighted": 0.6893939393939394, "eval_runtime": 1.9861, "eval_samples_per_second": 66.462, "eval_steps_per_second": 2.517, "step": 300 }, { "epoch": 20.133333333333333, "grad_norm": 5.263727188110352, "learning_rate": 3.654320987654321e-05, "loss": 0.075, "step": 302 }, { "epoch": 20.266666666666666, "grad_norm": 4.619281768798828, "learning_rate": 3.604938271604938e-05, "loss": 0.1319, "step": 304 }, { "epoch": 20.4, "grad_norm": 1.0995675325393677, "learning_rate": 3.555555555555556e-05, "loss": 0.0366, "step": 306 }, { "epoch": 20.533333333333335, "grad_norm": 4.2385663986206055, "learning_rate": 3.506172839506173e-05, "loss": 0.1331, "step": 308 }, { "epoch": 20.666666666666668, "grad_norm": 2.6913745403289795, "learning_rate": 3.45679012345679e-05, "loss": 0.0894, "step": 310 }, { "epoch": 20.8, "grad_norm": 4.785970687866211, "learning_rate": 3.4074074074074077e-05, "loss": 0.0756, "step": 312 }, { "epoch": 20.933333333333334, "grad_norm": 1.5702877044677734, "learning_rate": 3.3580246913580247e-05, "loss": 0.0685, "step": 314 }, { "epoch": 21.0, "eval_accuracy": 0.6515151515151515, "eval_f1_macro": 0.6091138915880551, "eval_f1_micro": 0.6515151515151515, "eval_f1_weighted": 0.6494256262321655, "eval_loss": 1.4620611667633057, "eval_precision_macro": 0.630280884283538, "eval_precision_micro": 0.6515151515151515, "eval_precision_weighted": 0.664075183502428, "eval_recall_macro": 0.6039682539682539, "eval_recall_micro": 0.6515151515151515, "eval_recall_weighted": 0.6515151515151515, "eval_runtime": 2.0276, "eval_samples_per_second": 65.103, "eval_steps_per_second": 2.466, "step": 315 }, { "epoch": 21.066666666666666, "grad_norm": 0.6037698984146118, "learning_rate": 3.308641975308642e-05, "loss": 0.0537, "step": 316 }, { "epoch": 21.2, "grad_norm": 0.877955436706543, "learning_rate": 3.25925925925926e-05, "loss": 0.0283, "step": 318 }, { "epoch": 21.333333333333332, "grad_norm": 4.185865879058838, "learning_rate": 3.209876543209876e-05, "loss": 0.1153, "step": 320 }, { "epoch": 21.466666666666665, "grad_norm": 0.7465834021568298, "learning_rate": 3.160493827160494e-05, "loss": 0.0311, "step": 322 }, { "epoch": 21.6, "grad_norm": 1.4049850702285767, "learning_rate": 3.111111111111111e-05, "loss": 0.0641, "step": 324 }, { "epoch": 21.733333333333334, "grad_norm": 1.6191234588623047, "learning_rate": 3.061728395061729e-05, "loss": 0.0417, "step": 326 }, { "epoch": 21.866666666666667, "grad_norm": 1.2088876962661743, "learning_rate": 3.012345679012346e-05, "loss": 0.0314, "step": 328 }, { "epoch": 22.0, "grad_norm": 0.7652052640914917, "learning_rate": 2.962962962962963e-05, "loss": 0.1469, "step": 330 }, { "epoch": 22.0, "eval_accuracy": 0.6212121212121212, "eval_f1_macro": 0.5330299221627766, "eval_f1_micro": 0.6212121212121212, "eval_f1_weighted": 0.604041002442862, "eval_loss": 1.534732699394226, "eval_precision_macro": 0.5476940619507992, "eval_precision_micro": 0.6212121212121212, "eval_precision_weighted": 0.6148931558944467, "eval_recall_macro": 0.5439984882842026, "eval_recall_micro": 0.6212121212121212, "eval_recall_weighted": 0.6212121212121212, "eval_runtime": 1.9747, "eval_samples_per_second": 66.844, "eval_steps_per_second": 2.532, "step": 330 }, { "epoch": 22.133333333333333, "grad_norm": 3.304185152053833, "learning_rate": 2.9135802469135803e-05, "loss": 0.0456, "step": 332 }, { "epoch": 22.266666666666666, "grad_norm": 2.3118255138397217, "learning_rate": 2.8641975308641977e-05, "loss": 0.0377, "step": 334 }, { "epoch": 22.4, "grad_norm": 2.3639698028564453, "learning_rate": 2.814814814814815e-05, "loss": 0.0708, "step": 336 }, { "epoch": 22.533333333333335, "grad_norm": 1.741746187210083, "learning_rate": 2.765432098765432e-05, "loss": 0.0353, "step": 338 }, { "epoch": 22.666666666666668, "grad_norm": 0.6108101010322571, "learning_rate": 2.7160493827160493e-05, "loss": 0.0531, "step": 340 }, { "epoch": 22.8, "grad_norm": 2.961045503616333, "learning_rate": 2.6666666666666667e-05, "loss": 0.0394, "step": 342 }, { "epoch": 22.933333333333334, "grad_norm": 1.7298003435134888, "learning_rate": 2.617283950617284e-05, "loss": 0.0289, "step": 344 }, { "epoch": 23.0, "eval_accuracy": 0.6287878787878788, "eval_f1_macro": 0.5465784005632545, "eval_f1_micro": 0.6287878787878788, "eval_f1_weighted": 0.6179920372130975, "eval_loss": 1.5417176485061646, "eval_precision_macro": 0.5409361471861472, "eval_precision_micro": 0.6287878787878788, "eval_precision_weighted": 0.610816976584022, "eval_recall_macro": 0.5549206349206349, "eval_recall_micro": 0.6287878787878788, "eval_recall_weighted": 0.6287878787878788, "eval_runtime": 1.9902, "eval_samples_per_second": 66.326, "eval_steps_per_second": 2.512, "step": 345 }, { "epoch": 23.066666666666666, "grad_norm": 0.7690654397010803, "learning_rate": 2.5679012345679017e-05, "loss": 0.0458, "step": 346 }, { "epoch": 23.2, "grad_norm": 3.320651054382324, "learning_rate": 2.5185185185185183e-05, "loss": 0.0804, "step": 348 }, { "epoch": 23.333333333333332, "grad_norm": 2.0301012992858887, "learning_rate": 2.4691358024691357e-05, "loss": 0.0279, "step": 350 }, { "epoch": 23.466666666666665, "grad_norm": 0.4531901478767395, "learning_rate": 2.4197530864197533e-05, "loss": 0.0139, "step": 352 }, { "epoch": 23.6, "grad_norm": 2.56703519821167, "learning_rate": 2.3703703703703707e-05, "loss": 0.0783, "step": 354 }, { "epoch": 23.733333333333334, "grad_norm": 0.20635652542114258, "learning_rate": 2.3209876543209877e-05, "loss": 0.012, "step": 356 }, { "epoch": 23.866666666666667, "grad_norm": 0.5930025577545166, "learning_rate": 2.271604938271605e-05, "loss": 0.0145, "step": 358 }, { "epoch": 24.0, "grad_norm": 0.24041523039340973, "learning_rate": 2.2222222222222223e-05, "loss": 0.01, "step": 360 }, { "epoch": 24.0, "eval_accuracy": 0.6363636363636364, "eval_f1_macro": 0.5474889044983636, "eval_f1_micro": 0.6363636363636364, "eval_f1_weighted": 0.6187343775995573, "eval_loss": 1.5670151710510254, "eval_precision_macro": 0.5434552419168567, "eval_precision_micro": 0.6363636363636364, "eval_precision_weighted": 0.6103857259761386, "eval_recall_macro": 0.5594179894179894, "eval_recall_micro": 0.6363636363636364, "eval_recall_weighted": 0.6363636363636364, "eval_runtime": 1.9405, "eval_samples_per_second": 68.023, "eval_steps_per_second": 2.577, "step": 360 }, { "epoch": 24.133333333333333, "grad_norm": 0.21558411419391632, "learning_rate": 2.1728395061728397e-05, "loss": 0.0102, "step": 362 }, { "epoch": 24.266666666666666, "grad_norm": 3.2394814491271973, "learning_rate": 2.123456790123457e-05, "loss": 0.0218, "step": 364 }, { "epoch": 24.4, "grad_norm": 3.6115405559539795, "learning_rate": 2.074074074074074e-05, "loss": 0.1149, "step": 366 }, { "epoch": 24.533333333333335, "grad_norm": 0.1589735597372055, "learning_rate": 2.0246913580246917e-05, "loss": 0.0082, "step": 368 }, { "epoch": 24.666666666666668, "grad_norm": 1.3840848207473755, "learning_rate": 1.9753086419753087e-05, "loss": 0.0174, "step": 370 }, { "epoch": 24.8, "grad_norm": 3.772754192352295, "learning_rate": 1.925925925925926e-05, "loss": 0.043, "step": 372 }, { "epoch": 24.933333333333334, "grad_norm": 0.41601723432540894, "learning_rate": 1.8765432098765433e-05, "loss": 0.035, "step": 374 }, { "epoch": 25.0, "eval_accuracy": 0.6363636363636364, "eval_f1_macro": 0.5529395694676043, "eval_f1_micro": 0.6363636363636364, "eval_f1_weighted": 0.6209326623035122, "eval_loss": 1.6037245988845825, "eval_precision_macro": 0.5470247238680418, "eval_precision_micro": 0.6363636363636364, "eval_precision_weighted": 0.6156263091746962, "eval_recall_macro": 0.5679213907785335, "eval_recall_micro": 0.6363636363636364, "eval_recall_weighted": 0.6363636363636364, "eval_runtime": 1.9551, "eval_samples_per_second": 67.515, "eval_steps_per_second": 2.557, "step": 375 }, { "epoch": 25.066666666666666, "grad_norm": 0.4554837942123413, "learning_rate": 1.8271604938271607e-05, "loss": 0.0636, "step": 376 }, { "epoch": 25.2, "grad_norm": 4.667645454406738, "learning_rate": 1.777777777777778e-05, "loss": 0.0685, "step": 378 }, { "epoch": 25.333333333333332, "grad_norm": 6.68064022064209, "learning_rate": 1.728395061728395e-05, "loss": 0.0919, "step": 380 }, { "epoch": 25.466666666666665, "grad_norm": 0.2510056793689728, "learning_rate": 1.6790123456790123e-05, "loss": 0.0111, "step": 382 }, { "epoch": 25.6, "grad_norm": 0.6245520114898682, "learning_rate": 1.62962962962963e-05, "loss": 0.0134, "step": 384 }, { "epoch": 25.733333333333334, "grad_norm": 2.165201187133789, "learning_rate": 1.580246913580247e-05, "loss": 0.0271, "step": 386 }, { "epoch": 25.866666666666667, "grad_norm": 0.24112091958522797, "learning_rate": 1.5308641975308643e-05, "loss": 0.0105, "step": 388 }, { "epoch": 26.0, "grad_norm": 0.377363383769989, "learning_rate": 1.4814814814814815e-05, "loss": 0.0109, "step": 390 }, { "epoch": 26.0, "eval_accuracy": 0.6212121212121212, "eval_f1_macro": 0.5896814040471776, "eval_f1_micro": 0.6212121212121212, "eval_f1_weighted": 0.6203213160225189, "eval_loss": 1.6751586198806763, "eval_precision_macro": 0.6144605795534588, "eval_precision_micro": 0.6212121212121212, "eval_precision_weighted": 0.6527441598649029, "eval_recall_macro": 0.5999697656840514, "eval_recall_micro": 0.6212121212121212, "eval_recall_weighted": 0.6212121212121212, "eval_runtime": 1.9656, "eval_samples_per_second": 67.154, "eval_steps_per_second": 2.544, "step": 390 }, { "epoch": 26.133333333333333, "grad_norm": 0.3774866461753845, "learning_rate": 1.4320987654320988e-05, "loss": 0.0097, "step": 392 }, { "epoch": 26.266666666666666, "grad_norm": 3.956695079803467, "learning_rate": 1.382716049382716e-05, "loss": 0.0233, "step": 394 }, { "epoch": 26.4, "grad_norm": 0.5877533555030823, "learning_rate": 1.3333333333333333e-05, "loss": 0.0157, "step": 396 }, { "epoch": 26.533333333333335, "grad_norm": 1.2962318658828735, "learning_rate": 1.2839506172839508e-05, "loss": 0.0249, "step": 398 }, { "epoch": 26.666666666666668, "grad_norm": 2.2431485652923584, "learning_rate": 1.2345679012345678e-05, "loss": 0.0224, "step": 400 }, { "epoch": 26.8, "grad_norm": 0.21492817997932434, "learning_rate": 1.1851851851851853e-05, "loss": 0.0117, "step": 402 }, { "epoch": 26.933333333333334, "grad_norm": 0.4237399697303772, "learning_rate": 1.1358024691358025e-05, "loss": 0.038, "step": 404 }, { "epoch": 27.0, "eval_accuracy": 0.6136363636363636, "eval_f1_macro": 0.5343822919199936, "eval_f1_micro": 0.6136363636363636, "eval_f1_weighted": 0.6008425380028616, "eval_loss": 1.672375202178955, "eval_precision_macro": 0.5331553830282576, "eval_precision_micro": 0.6136363636363636, "eval_precision_weighted": 0.6004965634415023, "eval_recall_macro": 0.546832955404384, "eval_recall_micro": 0.6136363636363636, "eval_recall_weighted": 0.6136363636363636, "eval_runtime": 1.9801, "eval_samples_per_second": 66.662, "eval_steps_per_second": 2.525, "step": 405 }, { "epoch": 27.066666666666666, "grad_norm": 1.5725435018539429, "learning_rate": 1.0864197530864198e-05, "loss": 0.0149, "step": 406 }, { "epoch": 27.2, "grad_norm": 0.13784648478031158, "learning_rate": 1.037037037037037e-05, "loss": 0.0092, "step": 408 }, { "epoch": 27.333333333333332, "grad_norm": 0.09840863198041916, "learning_rate": 9.876543209876543e-06, "loss": 0.008, "step": 410 }, { "epoch": 27.466666666666665, "grad_norm": 0.8349915146827698, "learning_rate": 9.382716049382717e-06, "loss": 0.0206, "step": 412 }, { "epoch": 27.6, "grad_norm": 0.33149102330207825, "learning_rate": 8.88888888888889e-06, "loss": 0.0173, "step": 414 }, { "epoch": 27.733333333333334, "grad_norm": 0.3867279589176178, "learning_rate": 8.395061728395062e-06, "loss": 0.0093, "step": 416 }, { "epoch": 27.866666666666667, "grad_norm": 1.726897120475769, "learning_rate": 7.901234567901235e-06, "loss": 0.0214, "step": 418 }, { "epoch": 28.0, "grad_norm": 0.19306233525276184, "learning_rate": 7.4074074074074075e-06, "loss": 0.0116, "step": 420 }, { "epoch": 28.0, "eval_accuracy": 0.6212121212121212, "eval_f1_macro": 0.5383730158730159, "eval_f1_micro": 0.6212121212121212, "eval_f1_weighted": 0.609029280904281, "eval_loss": 1.6251877546310425, "eval_precision_macro": 0.533696432596027, "eval_precision_micro": 0.6212121212121212, "eval_precision_weighted": 0.6033010121498966, "eval_recall_macro": 0.5490778533635676, "eval_recall_micro": 0.6212121212121212, "eval_recall_weighted": 0.6212121212121212, "eval_runtime": 1.9285, "eval_samples_per_second": 68.448, "eval_steps_per_second": 2.593, "step": 420 }, { "epoch": 28.133333333333333, "grad_norm": 1.669783115386963, "learning_rate": 6.91358024691358e-06, "loss": 0.0318, "step": 422 }, { "epoch": 28.266666666666666, "grad_norm": 0.6250646114349365, "learning_rate": 6.419753086419754e-06, "loss": 0.0195, "step": 424 }, { "epoch": 28.4, "grad_norm": 0.4752732813358307, "learning_rate": 5.925925925925927e-06, "loss": 0.0124, "step": 426 }, { "epoch": 28.533333333333335, "grad_norm": 0.16341274976730347, "learning_rate": 5.432098765432099e-06, "loss": 0.0065, "step": 428 }, { "epoch": 28.666666666666668, "grad_norm": 0.08904340863227844, "learning_rate": 4.938271604938272e-06, "loss": 0.0062, "step": 430 }, { "epoch": 28.8, "grad_norm": 0.24332502484321594, "learning_rate": 4.444444444444445e-06, "loss": 0.0055, "step": 432 }, { "epoch": 28.933333333333334, "grad_norm": 0.47205692529678345, "learning_rate": 3.9506172839506175e-06, "loss": 0.006, "step": 434 }, { "epoch": 29.0, "eval_accuracy": 0.6363636363636364, "eval_f1_macro": 0.557191887992969, "eval_f1_micro": 0.6363636363636364, "eval_f1_weighted": 0.6294141170899599, "eval_loss": 1.597952961921692, "eval_precision_macro": 0.5529214559386972, "eval_precision_micro": 0.6363636363636364, "eval_precision_weighted": 0.6245954516428655, "eval_recall_macro": 0.563363567649282, "eval_recall_micro": 0.6363636363636364, "eval_recall_weighted": 0.6363636363636364, "eval_runtime": 1.9083, "eval_samples_per_second": 69.172, "eval_steps_per_second": 2.62, "step": 435 }, { "epoch": 29.066666666666666, "grad_norm": 0.27642032504081726, "learning_rate": 3.45679012345679e-06, "loss": 0.0162, "step": 436 }, { "epoch": 29.2, "grad_norm": 0.9449041485786438, "learning_rate": 2.9629629629629633e-06, "loss": 0.0088, "step": 438 }, { "epoch": 29.333333333333332, "grad_norm": 0.14337310194969177, "learning_rate": 2.469135802469136e-06, "loss": 0.0193, "step": 440 }, { "epoch": 29.466666666666665, "grad_norm": 0.17881515622138977, "learning_rate": 1.9753086419753087e-06, "loss": 0.0191, "step": 442 }, { "epoch": 29.6, "grad_norm": 0.15386801958084106, "learning_rate": 1.4814814814814817e-06, "loss": 0.005, "step": 444 }, { "epoch": 29.733333333333334, "grad_norm": 0.32567164301872253, "learning_rate": 9.876543209876544e-07, "loss": 0.0077, "step": 446 }, { "epoch": 29.866666666666667, "grad_norm": 0.6249086260795593, "learning_rate": 4.938271604938272e-07, "loss": 0.0158, "step": 448 }, { "epoch": 30.0, "grad_norm": 0.24103443324565887, "learning_rate": 0.0, "loss": 0.0046, "step": 450 }, { "epoch": 30.0, "eval_accuracy": 0.6439393939393939, "eval_f1_macro": 0.5605037390491809, "eval_f1_micro": 0.6439393939393939, "eval_f1_weighted": 0.634156085647718, "eval_loss": 1.593876838684082, "eval_precision_macro": 0.5545634920634921, "eval_precision_micro": 0.6439393939393939, "eval_precision_weighted": 0.6269465488215488, "eval_recall_macro": 0.5686545729402873, "eval_recall_micro": 0.6439393939393939, "eval_recall_weighted": 0.6439393939393939, "eval_runtime": 2.0538, "eval_samples_per_second": 64.27, "eval_steps_per_second": 2.434, "step": 450 }, { "epoch": 30.0, "step": 450, "total_flos": 1.0740871074163507e+18, "train_loss": 0.6009381743893027, "train_runtime": 318.0836, "train_samples_per_second": 43.573, "train_steps_per_second": 1.415 } ], "logging_steps": 2, "max_steps": 450, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0740871074163507e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }