{ "best_global_step": 1188, "best_metric": 0.9063876651982378, "best_model_checkpoint": "./albert_multilabel_large\\checkpoint-1188", "epoch": 3.0, "eval_steps": 500, "global_step": 1188, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025252525252525255, "grad_norm": 11.870144844055176, "learning_rate": 1.9983164983164986e-05, "loss": 0.758, "step": 1 }, { "epoch": 0.005050505050505051, "grad_norm": 3.960209369659424, "learning_rate": 1.9966329966329967e-05, "loss": 0.6626, "step": 2 }, { "epoch": 0.007575757575757576, "grad_norm": 4.163188457489014, "learning_rate": 1.994949494949495e-05, "loss": 0.6126, "step": 3 }, { "epoch": 0.010101010101010102, "grad_norm": 3.630805253982544, "learning_rate": 1.9932659932659936e-05, "loss": 0.6105, "step": 4 }, { "epoch": 0.012626262626262626, "grad_norm": 3.7358059883117676, "learning_rate": 1.9915824915824917e-05, "loss": 0.5486, "step": 5 }, { "epoch": 0.015151515151515152, "grad_norm": 4.360195636749268, "learning_rate": 1.98989898989899e-05, "loss": 0.4834, "step": 6 }, { "epoch": 0.017676767676767676, "grad_norm": 4.5092549324035645, "learning_rate": 1.9882154882154885e-05, "loss": 0.4723, "step": 7 }, { "epoch": 0.020202020202020204, "grad_norm": 3.9408679008483887, "learning_rate": 1.9865319865319866e-05, "loss": 0.4853, "step": 8 }, { "epoch": 0.022727272727272728, "grad_norm": 253.27503967285156, "learning_rate": 1.984848484848485e-05, "loss": 0.5783, "step": 9 }, { "epoch": 0.025252525252525252, "grad_norm": 17.573854446411133, "learning_rate": 1.9831649831649832e-05, "loss": 0.3991, "step": 10 }, { "epoch": 0.027777777777777776, "grad_norm": 4.111778259277344, "learning_rate": 1.9814814814814816e-05, "loss": 0.4054, "step": 11 }, { "epoch": 0.030303030303030304, "grad_norm": 2.9756879806518555, "learning_rate": 1.97979797979798e-05, "loss": 0.4636, "step": 12 }, { "epoch": 0.03282828282828283, "grad_norm": 1.9542008638381958, "learning_rate": 1.978114478114478e-05, "loss": 0.3732, "step": 13 }, { "epoch": 0.03535353535353535, "grad_norm": 2.1436798572540283, "learning_rate": 1.9764309764309766e-05, "loss": 0.3341, "step": 14 }, { "epoch": 0.03787878787878788, "grad_norm": 2.5457680225372314, "learning_rate": 1.9747474747474747e-05, "loss": 0.358, "step": 15 }, { "epoch": 0.04040404040404041, "grad_norm": 2.3681640625, "learning_rate": 1.973063973063973e-05, "loss": 0.3813, "step": 16 }, { "epoch": 0.04292929292929293, "grad_norm": 3.3765199184417725, "learning_rate": 1.9713804713804716e-05, "loss": 0.3288, "step": 17 }, { "epoch": 0.045454545454545456, "grad_norm": 2.5906496047973633, "learning_rate": 1.96969696969697e-05, "loss": 0.3025, "step": 18 }, { "epoch": 0.047979797979797977, "grad_norm": 1.6513965129852295, "learning_rate": 1.968013468013468e-05, "loss": 0.287, "step": 19 }, { "epoch": 0.050505050505050504, "grad_norm": 2.1033501625061035, "learning_rate": 1.9663299663299665e-05, "loss": 0.4552, "step": 20 }, { "epoch": 0.05303030303030303, "grad_norm": 2.6947014331817627, "learning_rate": 1.964646464646465e-05, "loss": 0.3561, "step": 21 }, { "epoch": 0.05555555555555555, "grad_norm": 1.4776068925857544, "learning_rate": 1.962962962962963e-05, "loss": 0.3564, "step": 22 }, { "epoch": 0.05808080808080808, "grad_norm": 1.8511464595794678, "learning_rate": 1.9612794612794615e-05, "loss": 0.3851, "step": 23 }, { "epoch": 0.06060606060606061, "grad_norm": 1.9145028591156006, "learning_rate": 1.9595959595959596e-05, "loss": 0.3591, "step": 24 }, { "epoch": 0.06313131313131314, "grad_norm": 3.7978272438049316, "learning_rate": 1.957912457912458e-05, "loss": 0.2914, "step": 25 }, { "epoch": 0.06565656565656566, "grad_norm": 1.9927159547805786, "learning_rate": 1.9562289562289565e-05, "loss": 0.276, "step": 26 }, { "epoch": 0.06818181818181818, "grad_norm": 1.7165324687957764, "learning_rate": 1.9545454545454546e-05, "loss": 0.3855, "step": 27 }, { "epoch": 0.0707070707070707, "grad_norm": 3.547311544418335, "learning_rate": 1.952861952861953e-05, "loss": 0.3146, "step": 28 }, { "epoch": 0.07323232323232323, "grad_norm": 2.205611228942871, "learning_rate": 1.951178451178451e-05, "loss": 0.2515, "step": 29 }, { "epoch": 0.07575757575757576, "grad_norm": 131.6199951171875, "learning_rate": 1.9494949494949496e-05, "loss": 0.4546, "step": 30 }, { "epoch": 0.07828282828282829, "grad_norm": 330.8481140136719, "learning_rate": 1.947811447811448e-05, "loss": 0.425, "step": 31 }, { "epoch": 0.08080808080808081, "grad_norm": 24.042455673217773, "learning_rate": 1.9461279461279464e-05, "loss": 0.3878, "step": 32 }, { "epoch": 0.08333333333333333, "grad_norm": 9.382911682128906, "learning_rate": 1.9444444444444445e-05, "loss": 0.3594, "step": 33 }, { "epoch": 0.08585858585858586, "grad_norm": 2.793823003768921, "learning_rate": 1.942760942760943e-05, "loss": 0.3946, "step": 34 }, { "epoch": 0.08838383838383838, "grad_norm": 2.098381280899048, "learning_rate": 1.9410774410774414e-05, "loss": 0.2873, "step": 35 }, { "epoch": 0.09090909090909091, "grad_norm": 3.7400736808776855, "learning_rate": 1.9393939393939395e-05, "loss": 0.1903, "step": 36 }, { "epoch": 0.09343434343434344, "grad_norm": 2.984248638153076, "learning_rate": 1.937710437710438e-05, "loss": 0.3758, "step": 37 }, { "epoch": 0.09595959595959595, "grad_norm": 2.084982395172119, "learning_rate": 1.936026936026936e-05, "loss": 0.2711, "step": 38 }, { "epoch": 0.09848484848484848, "grad_norm": 2.7394371032714844, "learning_rate": 1.9343434343434345e-05, "loss": 0.3333, "step": 39 }, { "epoch": 0.10101010101010101, "grad_norm": 3.4980244636535645, "learning_rate": 1.932659932659933e-05, "loss": 0.253, "step": 40 }, { "epoch": 0.10353535353535354, "grad_norm": 3.121978521347046, "learning_rate": 1.930976430976431e-05, "loss": 0.2335, "step": 41 }, { "epoch": 0.10606060606060606, "grad_norm": 2.696462631225586, "learning_rate": 1.9292929292929295e-05, "loss": 0.3397, "step": 42 }, { "epoch": 0.10858585858585859, "grad_norm": 3.063912868499756, "learning_rate": 1.9276094276094276e-05, "loss": 0.3242, "step": 43 }, { "epoch": 0.1111111111111111, "grad_norm": 8.048778533935547, "learning_rate": 1.925925925925926e-05, "loss": 0.3208, "step": 44 }, { "epoch": 0.11363636363636363, "grad_norm": 10.508525848388672, "learning_rate": 1.9242424242424244e-05, "loss": 0.3138, "step": 45 }, { "epoch": 0.11616161616161616, "grad_norm": 2.972494125366211, "learning_rate": 1.922558922558923e-05, "loss": 0.2749, "step": 46 }, { "epoch": 0.11868686868686869, "grad_norm": 2.6326518058776855, "learning_rate": 1.920875420875421e-05, "loss": 0.2419, "step": 47 }, { "epoch": 0.12121212121212122, "grad_norm": 3.0405683517456055, "learning_rate": 1.9191919191919194e-05, "loss": 0.3422, "step": 48 }, { "epoch": 0.12373737373737374, "grad_norm": 5.278780460357666, "learning_rate": 1.917508417508418e-05, "loss": 0.2458, "step": 49 }, { "epoch": 0.12626262626262627, "grad_norm": 4.309386730194092, "learning_rate": 1.915824915824916e-05, "loss": 0.2252, "step": 50 }, { "epoch": 0.12878787878787878, "grad_norm": 2.3794400691986084, "learning_rate": 1.9141414141414144e-05, "loss": 0.1916, "step": 51 }, { "epoch": 0.13131313131313133, "grad_norm": 3.2079036235809326, "learning_rate": 1.9124579124579125e-05, "loss": 0.2763, "step": 52 }, { "epoch": 0.13383838383838384, "grad_norm": 6.404500961303711, "learning_rate": 1.910774410774411e-05, "loss": 0.3073, "step": 53 }, { "epoch": 0.13636363636363635, "grad_norm": 3.2996926307678223, "learning_rate": 1.9090909090909094e-05, "loss": 0.2442, "step": 54 }, { "epoch": 0.1388888888888889, "grad_norm": 3.197521924972534, "learning_rate": 1.9074074074074075e-05, "loss": 0.2451, "step": 55 }, { "epoch": 0.1414141414141414, "grad_norm": 2.8418166637420654, "learning_rate": 1.905723905723906e-05, "loss": 0.2383, "step": 56 }, { "epoch": 0.14393939393939395, "grad_norm": 2.393613338470459, "learning_rate": 1.904040404040404e-05, "loss": 0.1834, "step": 57 }, { "epoch": 0.14646464646464646, "grad_norm": 2.1811683177948, "learning_rate": 1.9023569023569024e-05, "loss": 0.2564, "step": 58 }, { "epoch": 0.14898989898989898, "grad_norm": 2.4366374015808105, "learning_rate": 1.900673400673401e-05, "loss": 0.2541, "step": 59 }, { "epoch": 0.15151515151515152, "grad_norm": 5.706679344177246, "learning_rate": 1.8989898989898993e-05, "loss": 0.3285, "step": 60 }, { "epoch": 0.15404040404040403, "grad_norm": 1.8341200351715088, "learning_rate": 1.8973063973063974e-05, "loss": 0.2692, "step": 61 }, { "epoch": 0.15656565656565657, "grad_norm": 3.0101611614227295, "learning_rate": 1.895622895622896e-05, "loss": 0.2135, "step": 62 }, { "epoch": 0.1590909090909091, "grad_norm": 3.3140006065368652, "learning_rate": 1.8939393939393943e-05, "loss": 0.2027, "step": 63 }, { "epoch": 0.16161616161616163, "grad_norm": 2.118210554122925, "learning_rate": 1.8922558922558924e-05, "loss": 0.2213, "step": 64 }, { "epoch": 0.16414141414141414, "grad_norm": 3.6016147136688232, "learning_rate": 1.8905723905723908e-05, "loss": 0.1682, "step": 65 }, { "epoch": 0.16666666666666666, "grad_norm": 7.3510823249816895, "learning_rate": 1.888888888888889e-05, "loss": 0.369, "step": 66 }, { "epoch": 0.1691919191919192, "grad_norm": 3.6129775047302246, "learning_rate": 1.8872053872053873e-05, "loss": 0.2257, "step": 67 }, { "epoch": 0.1717171717171717, "grad_norm": 4.521103858947754, "learning_rate": 1.8855218855218858e-05, "loss": 0.1425, "step": 68 }, { "epoch": 0.17424242424242425, "grad_norm": 1.941278100013733, "learning_rate": 1.883838383838384e-05, "loss": 0.2962, "step": 69 }, { "epoch": 0.17676767676767677, "grad_norm": 4.856161594390869, "learning_rate": 1.8821548821548823e-05, "loss": 0.2071, "step": 70 }, { "epoch": 0.17929292929292928, "grad_norm": 4.528213024139404, "learning_rate": 1.8804713804713804e-05, "loss": 0.2222, "step": 71 }, { "epoch": 0.18181818181818182, "grad_norm": 6.646481037139893, "learning_rate": 1.8787878787878792e-05, "loss": 0.2942, "step": 72 }, { "epoch": 0.18434343434343434, "grad_norm": 2.1316299438476562, "learning_rate": 1.8771043771043773e-05, "loss": 0.2697, "step": 73 }, { "epoch": 0.18686868686868688, "grad_norm": 3.7682583332061768, "learning_rate": 1.8754208754208757e-05, "loss": 0.2575, "step": 74 }, { "epoch": 0.1893939393939394, "grad_norm": 2.1818718910217285, "learning_rate": 1.873737373737374e-05, "loss": 0.1755, "step": 75 }, { "epoch": 0.1919191919191919, "grad_norm": 5.337326526641846, "learning_rate": 1.8720538720538723e-05, "loss": 0.2238, "step": 76 }, { "epoch": 0.19444444444444445, "grad_norm": 5.185172080993652, "learning_rate": 1.8703703703703707e-05, "loss": 0.14, "step": 77 }, { "epoch": 0.19696969696969696, "grad_norm": 5.610733509063721, "learning_rate": 1.8686868686868688e-05, "loss": 0.2109, "step": 78 }, { "epoch": 0.1994949494949495, "grad_norm": 3.34989333152771, "learning_rate": 1.8670033670033672e-05, "loss": 0.2358, "step": 79 }, { "epoch": 0.20202020202020202, "grad_norm": 4.732699394226074, "learning_rate": 1.8653198653198653e-05, "loss": 0.2582, "step": 80 }, { "epoch": 0.20454545454545456, "grad_norm": 3.595618963241577, "learning_rate": 1.8636363636363638e-05, "loss": 0.2499, "step": 81 }, { "epoch": 0.20707070707070707, "grad_norm": 4.39829158782959, "learning_rate": 1.8619528619528622e-05, "loss": 0.1776, "step": 82 }, { "epoch": 0.20959595959595959, "grad_norm": 5.79127836227417, "learning_rate": 1.8602693602693603e-05, "loss": 0.1329, "step": 83 }, { "epoch": 0.21212121212121213, "grad_norm": 3.827282428741455, "learning_rate": 1.8585858585858588e-05, "loss": 0.2073, "step": 84 }, { "epoch": 0.21464646464646464, "grad_norm": 6.159754753112793, "learning_rate": 1.856902356902357e-05, "loss": 0.1658, "step": 85 }, { "epoch": 0.21717171717171718, "grad_norm": 9.290190696716309, "learning_rate": 1.8552188552188556e-05, "loss": 0.286, "step": 86 }, { "epoch": 0.2196969696969697, "grad_norm": 5.264730930328369, "learning_rate": 1.8535353535353537e-05, "loss": 0.2504, "step": 87 }, { "epoch": 0.2222222222222222, "grad_norm": 3.915583848953247, "learning_rate": 1.851851851851852e-05, "loss": 0.2535, "step": 88 }, { "epoch": 0.22474747474747475, "grad_norm": 3.885434627532959, "learning_rate": 1.8501683501683503e-05, "loss": 0.1451, "step": 89 }, { "epoch": 0.22727272727272727, "grad_norm": 3.5729010105133057, "learning_rate": 1.8484848484848487e-05, "loss": 0.1989, "step": 90 }, { "epoch": 0.2297979797979798, "grad_norm": 2.3339507579803467, "learning_rate": 1.846801346801347e-05, "loss": 0.3191, "step": 91 }, { "epoch": 0.23232323232323232, "grad_norm": 3.946099281311035, "learning_rate": 1.8451178451178452e-05, "loss": 0.2271, "step": 92 }, { "epoch": 0.23484848484848486, "grad_norm": 5.328370571136475, "learning_rate": 1.8434343434343437e-05, "loss": 0.3326, "step": 93 }, { "epoch": 0.23737373737373738, "grad_norm": 4.987793445587158, "learning_rate": 1.8417508417508418e-05, "loss": 0.2377, "step": 94 }, { "epoch": 0.2398989898989899, "grad_norm": 3.6775288581848145, "learning_rate": 1.8400673400673402e-05, "loss": 0.2323, "step": 95 }, { "epoch": 0.24242424242424243, "grad_norm": 3.444467782974243, "learning_rate": 1.8383838383838387e-05, "loss": 0.2712, "step": 96 }, { "epoch": 0.24494949494949494, "grad_norm": 7.329760551452637, "learning_rate": 1.8367003367003367e-05, "loss": 0.3223, "step": 97 }, { "epoch": 0.2474747474747475, "grad_norm": 3.329362154006958, "learning_rate": 1.8350168350168352e-05, "loss": 0.1859, "step": 98 }, { "epoch": 0.25, "grad_norm": 2.950449228286743, "learning_rate": 1.8333333333333333e-05, "loss": 0.2032, "step": 99 }, { "epoch": 0.25252525252525254, "grad_norm": 3.4235892295837402, "learning_rate": 1.831649831649832e-05, "loss": 0.1982, "step": 100 }, { "epoch": 0.255050505050505, "grad_norm": 4.13006067276001, "learning_rate": 1.82996632996633e-05, "loss": 0.1787, "step": 101 }, { "epoch": 0.25757575757575757, "grad_norm": 2.0153565406799316, "learning_rate": 1.8282828282828286e-05, "loss": 0.1408, "step": 102 }, { "epoch": 0.2601010101010101, "grad_norm": 3.2294890880584717, "learning_rate": 1.8265993265993267e-05, "loss": 0.2113, "step": 103 }, { "epoch": 0.26262626262626265, "grad_norm": 3.2181968688964844, "learning_rate": 1.824915824915825e-05, "loss": 0.1296, "step": 104 }, { "epoch": 0.26515151515151514, "grad_norm": 1.6924734115600586, "learning_rate": 1.8232323232323236e-05, "loss": 0.1773, "step": 105 }, { "epoch": 0.2676767676767677, "grad_norm": 5.491613864898682, "learning_rate": 1.8215488215488217e-05, "loss": 0.1511, "step": 106 }, { "epoch": 0.2702020202020202, "grad_norm": 4.4867143630981445, "learning_rate": 1.81986531986532e-05, "loss": 0.1978, "step": 107 }, { "epoch": 0.2727272727272727, "grad_norm": 1.801491379737854, "learning_rate": 1.8181818181818182e-05, "loss": 0.2535, "step": 108 }, { "epoch": 0.27525252525252525, "grad_norm": 2.2414021492004395, "learning_rate": 1.8164983164983166e-05, "loss": 0.2129, "step": 109 }, { "epoch": 0.2777777777777778, "grad_norm": 1.8164544105529785, "learning_rate": 1.814814814814815e-05, "loss": 0.1744, "step": 110 }, { "epoch": 0.2803030303030303, "grad_norm": 1.4675378799438477, "learning_rate": 1.8131313131313132e-05, "loss": 0.2502, "step": 111 }, { "epoch": 0.2828282828282828, "grad_norm": 2.9425742626190186, "learning_rate": 1.8114478114478116e-05, "loss": 0.1312, "step": 112 }, { "epoch": 0.28535353535353536, "grad_norm": 2.8444998264312744, "learning_rate": 1.8097643097643097e-05, "loss": 0.233, "step": 113 }, { "epoch": 0.2878787878787879, "grad_norm": 1.8977577686309814, "learning_rate": 1.8080808080808085e-05, "loss": 0.0896, "step": 114 }, { "epoch": 0.2904040404040404, "grad_norm": 4.595700740814209, "learning_rate": 1.8063973063973066e-05, "loss": 0.3304, "step": 115 }, { "epoch": 0.29292929292929293, "grad_norm": 2.2750136852264404, "learning_rate": 1.804713804713805e-05, "loss": 0.209, "step": 116 }, { "epoch": 0.29545454545454547, "grad_norm": 2.0217509269714355, "learning_rate": 1.803030303030303e-05, "loss": 0.2202, "step": 117 }, { "epoch": 0.29797979797979796, "grad_norm": 2.943140745162964, "learning_rate": 1.8013468013468016e-05, "loss": 0.2903, "step": 118 }, { "epoch": 0.3005050505050505, "grad_norm": 2.4190146923065186, "learning_rate": 1.7996632996633e-05, "loss": 0.1384, "step": 119 }, { "epoch": 0.30303030303030304, "grad_norm": 3.664355993270874, "learning_rate": 1.797979797979798e-05, "loss": 0.1866, "step": 120 }, { "epoch": 0.3055555555555556, "grad_norm": 3.616316795349121, "learning_rate": 1.7962962962962965e-05, "loss": 0.2016, "step": 121 }, { "epoch": 0.30808080808080807, "grad_norm": 6.439982891082764, "learning_rate": 1.7946127946127946e-05, "loss": 0.2699, "step": 122 }, { "epoch": 0.3106060606060606, "grad_norm": 3.2625112533569336, "learning_rate": 1.792929292929293e-05, "loss": 0.242, "step": 123 }, { "epoch": 0.31313131313131315, "grad_norm": 4.760579586029053, "learning_rate": 1.7912457912457915e-05, "loss": 0.1812, "step": 124 }, { "epoch": 0.31565656565656564, "grad_norm": 5.375882625579834, "learning_rate": 1.7895622895622896e-05, "loss": 0.0892, "step": 125 }, { "epoch": 0.3181818181818182, "grad_norm": 1.5627996921539307, "learning_rate": 1.787878787878788e-05, "loss": 0.1608, "step": 126 }, { "epoch": 0.3207070707070707, "grad_norm": 2.0782926082611084, "learning_rate": 1.786195286195286e-05, "loss": 0.1384, "step": 127 }, { "epoch": 0.32323232323232326, "grad_norm": 3.5221481323242188, "learning_rate": 1.7845117845117846e-05, "loss": 0.2595, "step": 128 }, { "epoch": 0.32575757575757575, "grad_norm": 1.7717233896255493, "learning_rate": 1.782828282828283e-05, "loss": 0.1401, "step": 129 }, { "epoch": 0.3282828282828283, "grad_norm": 3.81760311126709, "learning_rate": 1.781144781144781e-05, "loss": 0.1153, "step": 130 }, { "epoch": 0.33080808080808083, "grad_norm": 4.479602813720703, "learning_rate": 1.7794612794612796e-05, "loss": 0.2117, "step": 131 }, { "epoch": 0.3333333333333333, "grad_norm": 1.1932178735733032, "learning_rate": 1.7777777777777777e-05, "loss": 0.1669, "step": 132 }, { "epoch": 0.33585858585858586, "grad_norm": 3.330796003341675, "learning_rate": 1.7760942760942764e-05, "loss": 0.152, "step": 133 }, { "epoch": 0.3383838383838384, "grad_norm": 3.5781233310699463, "learning_rate": 1.7744107744107745e-05, "loss": 0.269, "step": 134 }, { "epoch": 0.3409090909090909, "grad_norm": 2.489184617996216, "learning_rate": 1.772727272727273e-05, "loss": 0.1631, "step": 135 }, { "epoch": 0.3434343434343434, "grad_norm": 5.023707389831543, "learning_rate": 1.771043771043771e-05, "loss": 0.1623, "step": 136 }, { "epoch": 0.34595959595959597, "grad_norm": 1.8095295429229736, "learning_rate": 1.7693602693602695e-05, "loss": 0.1941, "step": 137 }, { "epoch": 0.3484848484848485, "grad_norm": 5.773559093475342, "learning_rate": 1.767676767676768e-05, "loss": 0.2198, "step": 138 }, { "epoch": 0.351010101010101, "grad_norm": 2.3348917961120605, "learning_rate": 1.765993265993266e-05, "loss": 0.0749, "step": 139 }, { "epoch": 0.35353535353535354, "grad_norm": 4.8729023933410645, "learning_rate": 1.7643097643097645e-05, "loss": 0.2523, "step": 140 }, { "epoch": 0.3560606060606061, "grad_norm": 2.1227433681488037, "learning_rate": 1.7626262626262626e-05, "loss": 0.1616, "step": 141 }, { "epoch": 0.35858585858585856, "grad_norm": 4.208232402801514, "learning_rate": 1.760942760942761e-05, "loss": 0.1206, "step": 142 }, { "epoch": 0.3611111111111111, "grad_norm": 2.2808191776275635, "learning_rate": 1.7592592592592595e-05, "loss": 0.1413, "step": 143 }, { "epoch": 0.36363636363636365, "grad_norm": 2.797044515609741, "learning_rate": 1.7575757575757576e-05, "loss": 0.1553, "step": 144 }, { "epoch": 0.3661616161616162, "grad_norm": 2.0235748291015625, "learning_rate": 1.755892255892256e-05, "loss": 0.1748, "step": 145 }, { "epoch": 0.3686868686868687, "grad_norm": 1.668614149093628, "learning_rate": 1.754208754208754e-05, "loss": 0.1397, "step": 146 }, { "epoch": 0.3712121212121212, "grad_norm": 2.048588991165161, "learning_rate": 1.752525252525253e-05, "loss": 0.1636, "step": 147 }, { "epoch": 0.37373737373737376, "grad_norm": 3.2544357776641846, "learning_rate": 1.750841750841751e-05, "loss": 0.1906, "step": 148 }, { "epoch": 0.37626262626262624, "grad_norm": 2.5983431339263916, "learning_rate": 1.7491582491582494e-05, "loss": 0.1833, "step": 149 }, { "epoch": 0.3787878787878788, "grad_norm": 3.579721689224243, "learning_rate": 1.7474747474747475e-05, "loss": 0.2445, "step": 150 }, { "epoch": 0.3813131313131313, "grad_norm": 3.889470338821411, "learning_rate": 1.745791245791246e-05, "loss": 0.2457, "step": 151 }, { "epoch": 0.3838383838383838, "grad_norm": 1.612406611442566, "learning_rate": 1.7441077441077444e-05, "loss": 0.1468, "step": 152 }, { "epoch": 0.38636363636363635, "grad_norm": 3.572401285171509, "learning_rate": 1.7424242424242425e-05, "loss": 0.1659, "step": 153 }, { "epoch": 0.3888888888888889, "grad_norm": 2.7137911319732666, "learning_rate": 1.740740740740741e-05, "loss": 0.2389, "step": 154 }, { "epoch": 0.39141414141414144, "grad_norm": 2.293943166732788, "learning_rate": 1.739057239057239e-05, "loss": 0.2188, "step": 155 }, { "epoch": 0.3939393939393939, "grad_norm": 5.641902923583984, "learning_rate": 1.7373737373737375e-05, "loss": 0.1843, "step": 156 }, { "epoch": 0.39646464646464646, "grad_norm": 4.039111137390137, "learning_rate": 1.735690235690236e-05, "loss": 0.2627, "step": 157 }, { "epoch": 0.398989898989899, "grad_norm": 2.942754030227661, "learning_rate": 1.734006734006734e-05, "loss": 0.2299, "step": 158 }, { "epoch": 0.4015151515151515, "grad_norm": 3.7655181884765625, "learning_rate": 1.7323232323232324e-05, "loss": 0.2167, "step": 159 }, { "epoch": 0.40404040404040403, "grad_norm": 2.5062334537506104, "learning_rate": 1.7306397306397305e-05, "loss": 0.164, "step": 160 }, { "epoch": 0.4065656565656566, "grad_norm": 2.07106614112854, "learning_rate": 1.7289562289562293e-05, "loss": 0.1956, "step": 161 }, { "epoch": 0.4090909090909091, "grad_norm": 6.250090599060059, "learning_rate": 1.7272727272727274e-05, "loss": 0.1399, "step": 162 }, { "epoch": 0.4116161616161616, "grad_norm": 2.017141819000244, "learning_rate": 1.725589225589226e-05, "loss": 0.2036, "step": 163 }, { "epoch": 0.41414141414141414, "grad_norm": 3.3339602947235107, "learning_rate": 1.723905723905724e-05, "loss": 0.1596, "step": 164 }, { "epoch": 0.4166666666666667, "grad_norm": 3.8334908485412598, "learning_rate": 1.7222222222222224e-05, "loss": 0.1267, "step": 165 }, { "epoch": 0.41919191919191917, "grad_norm": 2.0751090049743652, "learning_rate": 1.7205387205387208e-05, "loss": 0.2058, "step": 166 }, { "epoch": 0.4217171717171717, "grad_norm": 3.0513789653778076, "learning_rate": 1.718855218855219e-05, "loss": 0.091, "step": 167 }, { "epoch": 0.42424242424242425, "grad_norm": 5.1696648597717285, "learning_rate": 1.7171717171717173e-05, "loss": 0.2313, "step": 168 }, { "epoch": 0.42676767676767674, "grad_norm": 3.9072530269622803, "learning_rate": 1.7154882154882154e-05, "loss": 0.2648, "step": 169 }, { "epoch": 0.4292929292929293, "grad_norm": 4.278628349304199, "learning_rate": 1.713804713804714e-05, "loss": 0.1539, "step": 170 }, { "epoch": 0.4318181818181818, "grad_norm": 1.6870406866073608, "learning_rate": 1.7121212121212123e-05, "loss": 0.1714, "step": 171 }, { "epoch": 0.43434343434343436, "grad_norm": 1.6782217025756836, "learning_rate": 1.7104377104377104e-05, "loss": 0.1269, "step": 172 }, { "epoch": 0.43686868686868685, "grad_norm": 5.854135513305664, "learning_rate": 1.708754208754209e-05, "loss": 0.1568, "step": 173 }, { "epoch": 0.4393939393939394, "grad_norm": 3.947122097015381, "learning_rate": 1.707070707070707e-05, "loss": 0.2051, "step": 174 }, { "epoch": 0.44191919191919193, "grad_norm": 2.085911273956299, "learning_rate": 1.7053872053872057e-05, "loss": 0.1101, "step": 175 }, { "epoch": 0.4444444444444444, "grad_norm": 4.145143985748291, "learning_rate": 1.7037037037037038e-05, "loss": 0.1773, "step": 176 }, { "epoch": 0.44696969696969696, "grad_norm": 4.920554161071777, "learning_rate": 1.7020202020202023e-05, "loss": 0.2379, "step": 177 }, { "epoch": 0.4494949494949495, "grad_norm": 2.8730502128601074, "learning_rate": 1.7003367003367004e-05, "loss": 0.1019, "step": 178 }, { "epoch": 0.45202020202020204, "grad_norm": 1.0413464307785034, "learning_rate": 1.6986531986531988e-05, "loss": 0.1205, "step": 179 }, { "epoch": 0.45454545454545453, "grad_norm": 2.591437816619873, "learning_rate": 1.6969696969696972e-05, "loss": 0.1173, "step": 180 }, { "epoch": 0.45707070707070707, "grad_norm": 4.737552165985107, "learning_rate": 1.6952861952861953e-05, "loss": 0.4692, "step": 181 }, { "epoch": 0.4595959595959596, "grad_norm": 5.872066974639893, "learning_rate": 1.6936026936026938e-05, "loss": 0.178, "step": 182 }, { "epoch": 0.4621212121212121, "grad_norm": 4.527502536773682, "learning_rate": 1.691919191919192e-05, "loss": 0.2855, "step": 183 }, { "epoch": 0.46464646464646464, "grad_norm": 3.166898488998413, "learning_rate": 1.6902356902356903e-05, "loss": 0.201, "step": 184 }, { "epoch": 0.4671717171717172, "grad_norm": 8.388322830200195, "learning_rate": 1.6885521885521888e-05, "loss": 0.2455, "step": 185 }, { "epoch": 0.4696969696969697, "grad_norm": 3.2028310298919678, "learning_rate": 1.686868686868687e-05, "loss": 0.2577, "step": 186 }, { "epoch": 0.4722222222222222, "grad_norm": 3.2072689533233643, "learning_rate": 1.6851851851851853e-05, "loss": 0.1123, "step": 187 }, { "epoch": 0.47474747474747475, "grad_norm": 2.532289743423462, "learning_rate": 1.6835016835016837e-05, "loss": 0.2389, "step": 188 }, { "epoch": 0.4772727272727273, "grad_norm": 3.049967050552368, "learning_rate": 1.681818181818182e-05, "loss": 0.1156, "step": 189 }, { "epoch": 0.4797979797979798, "grad_norm": 2.940448760986328, "learning_rate": 1.6801346801346803e-05, "loss": 0.149, "step": 190 }, { "epoch": 0.4823232323232323, "grad_norm": 2.2545042037963867, "learning_rate": 1.6784511784511787e-05, "loss": 0.1751, "step": 191 }, { "epoch": 0.48484848484848486, "grad_norm": 2.66123628616333, "learning_rate": 1.6767676767676768e-05, "loss": 0.1685, "step": 192 }, { "epoch": 0.48737373737373735, "grad_norm": 2.0476951599121094, "learning_rate": 1.6750841750841752e-05, "loss": 0.1705, "step": 193 }, { "epoch": 0.4898989898989899, "grad_norm": 2.9459142684936523, "learning_rate": 1.6734006734006737e-05, "loss": 0.1873, "step": 194 }, { "epoch": 0.49242424242424243, "grad_norm": 3.9844117164611816, "learning_rate": 1.6717171717171718e-05, "loss": 0.1531, "step": 195 }, { "epoch": 0.494949494949495, "grad_norm": 6.765873908996582, "learning_rate": 1.6700336700336702e-05, "loss": 0.164, "step": 196 }, { "epoch": 0.49747474747474746, "grad_norm": 2.809617757797241, "learning_rate": 1.6683501683501683e-05, "loss": 0.144, "step": 197 }, { "epoch": 0.5, "grad_norm": 6.575211524963379, "learning_rate": 1.6666666666666667e-05, "loss": 0.2556, "step": 198 }, { "epoch": 0.5025252525252525, "grad_norm": 4.31246280670166, "learning_rate": 1.6649831649831652e-05, "loss": 0.1998, "step": 199 }, { "epoch": 0.5050505050505051, "grad_norm": 3.3406026363372803, "learning_rate": 1.6632996632996633e-05, "loss": 0.1341, "step": 200 }, { "epoch": 0.5075757575757576, "grad_norm": 2.613698720932007, "learning_rate": 1.6616161616161617e-05, "loss": 0.1245, "step": 201 }, { "epoch": 0.51010101010101, "grad_norm": 4.394161224365234, "learning_rate": 1.65993265993266e-05, "loss": 0.1683, "step": 202 }, { "epoch": 0.5126262626262627, "grad_norm": 1.0652117729187012, "learning_rate": 1.6582491582491586e-05, "loss": 0.1133, "step": 203 }, { "epoch": 0.5151515151515151, "grad_norm": 2.15743350982666, "learning_rate": 1.6565656565656567e-05, "loss": 0.1525, "step": 204 }, { "epoch": 0.5176767676767676, "grad_norm": 1.6530711650848389, "learning_rate": 1.654882154882155e-05, "loss": 0.1417, "step": 205 }, { "epoch": 0.5202020202020202, "grad_norm": 6.711721897125244, "learning_rate": 1.6531986531986532e-05, "loss": 0.2334, "step": 206 }, { "epoch": 0.5227272727272727, "grad_norm": 1.627074122428894, "learning_rate": 1.6515151515151517e-05, "loss": 0.1847, "step": 207 }, { "epoch": 0.5252525252525253, "grad_norm": 1.3665039539337158, "learning_rate": 1.64983164983165e-05, "loss": 0.0733, "step": 208 }, { "epoch": 0.5277777777777778, "grad_norm": 1.800305724143982, "learning_rate": 1.6481481481481482e-05, "loss": 0.0821, "step": 209 }, { "epoch": 0.5303030303030303, "grad_norm": 2.238971710205078, "learning_rate": 1.6464646464646466e-05, "loss": 0.162, "step": 210 }, { "epoch": 0.5328282828282829, "grad_norm": 3.941727638244629, "learning_rate": 1.6447811447811447e-05, "loss": 0.171, "step": 211 }, { "epoch": 0.5353535353535354, "grad_norm": 2.0416862964630127, "learning_rate": 1.6430976430976432e-05, "loss": 0.121, "step": 212 }, { "epoch": 0.5378787878787878, "grad_norm": 2.75635027885437, "learning_rate": 1.6414141414141416e-05, "loss": 0.1973, "step": 213 }, { "epoch": 0.5404040404040404, "grad_norm": 5.226922512054443, "learning_rate": 1.6397306397306397e-05, "loss": 0.111, "step": 214 }, { "epoch": 0.5429292929292929, "grad_norm": 6.741361618041992, "learning_rate": 1.638047138047138e-05, "loss": 0.0877, "step": 215 }, { "epoch": 0.5454545454545454, "grad_norm": 2.957056999206543, "learning_rate": 1.6363636363636366e-05, "loss": 0.0627, "step": 216 }, { "epoch": 0.547979797979798, "grad_norm": 3.5542659759521484, "learning_rate": 1.634680134680135e-05, "loss": 0.2518, "step": 217 }, { "epoch": 0.5505050505050505, "grad_norm": 8.325895309448242, "learning_rate": 1.632996632996633e-05, "loss": 0.1792, "step": 218 }, { "epoch": 0.553030303030303, "grad_norm": 4.116200923919678, "learning_rate": 1.6313131313131316e-05, "loss": 0.0821, "step": 219 }, { "epoch": 0.5555555555555556, "grad_norm": 2.4049417972564697, "learning_rate": 1.6296296296296297e-05, "loss": 0.1445, "step": 220 }, { "epoch": 0.5580808080808081, "grad_norm": 2.702348470687866, "learning_rate": 1.627946127946128e-05, "loss": 0.2077, "step": 221 }, { "epoch": 0.5606060606060606, "grad_norm": 4.276516437530518, "learning_rate": 1.6262626262626265e-05, "loss": 0.189, "step": 222 }, { "epoch": 0.5631313131313131, "grad_norm": 2.212054491043091, "learning_rate": 1.6245791245791246e-05, "loss": 0.2111, "step": 223 }, { "epoch": 0.5656565656565656, "grad_norm": 2.9544410705566406, "learning_rate": 1.622895622895623e-05, "loss": 0.1649, "step": 224 }, { "epoch": 0.5681818181818182, "grad_norm": 3.0044991970062256, "learning_rate": 1.6212121212121212e-05, "loss": 0.1728, "step": 225 }, { "epoch": 0.5707070707070707, "grad_norm": 3.5259811878204346, "learning_rate": 1.6195286195286196e-05, "loss": 0.1621, "step": 226 }, { "epoch": 0.5732323232323232, "grad_norm": 3.774447441101074, "learning_rate": 1.617845117845118e-05, "loss": 0.2288, "step": 227 }, { "epoch": 0.5757575757575758, "grad_norm": 2.975698232650757, "learning_rate": 1.616161616161616e-05, "loss": 0.1691, "step": 228 }, { "epoch": 0.5782828282828283, "grad_norm": 4.2801713943481445, "learning_rate": 1.6144781144781146e-05, "loss": 0.1332, "step": 229 }, { "epoch": 0.5808080808080808, "grad_norm": 4.899673938751221, "learning_rate": 1.612794612794613e-05, "loss": 0.1472, "step": 230 }, { "epoch": 0.5833333333333334, "grad_norm": 5.345510482788086, "learning_rate": 1.6111111111111115e-05, "loss": 0.2117, "step": 231 }, { "epoch": 0.5858585858585859, "grad_norm": 3.8797693252563477, "learning_rate": 1.6094276094276096e-05, "loss": 0.2047, "step": 232 }, { "epoch": 0.5883838383838383, "grad_norm": 6.221108913421631, "learning_rate": 1.607744107744108e-05, "loss": 0.1221, "step": 233 }, { "epoch": 0.5909090909090909, "grad_norm": 3.437472343444824, "learning_rate": 1.606060606060606e-05, "loss": 0.1748, "step": 234 }, { "epoch": 0.5934343434343434, "grad_norm": 6.737703323364258, "learning_rate": 1.6043771043771045e-05, "loss": 0.1095, "step": 235 }, { "epoch": 0.5959595959595959, "grad_norm": 1.2895629405975342, "learning_rate": 1.602693602693603e-05, "loss": 0.0798, "step": 236 }, { "epoch": 0.5984848484848485, "grad_norm": 3.281799554824829, "learning_rate": 1.601010101010101e-05, "loss": 0.2223, "step": 237 }, { "epoch": 0.601010101010101, "grad_norm": 3.6054065227508545, "learning_rate": 1.5993265993265995e-05, "loss": 0.1802, "step": 238 }, { "epoch": 0.6035353535353535, "grad_norm": 2.032210350036621, "learning_rate": 1.597643097643098e-05, "loss": 0.1613, "step": 239 }, { "epoch": 0.6060606060606061, "grad_norm": 3.515641212463379, "learning_rate": 1.595959595959596e-05, "loss": 0.1694, "step": 240 }, { "epoch": 0.6085858585858586, "grad_norm": 3.1133809089660645, "learning_rate": 1.5942760942760945e-05, "loss": 0.1114, "step": 241 }, { "epoch": 0.6111111111111112, "grad_norm": 3.401221752166748, "learning_rate": 1.5925925925925926e-05, "loss": 0.1692, "step": 242 }, { "epoch": 0.6136363636363636, "grad_norm": 1.9235018491744995, "learning_rate": 1.590909090909091e-05, "loss": 0.2513, "step": 243 }, { "epoch": 0.6161616161616161, "grad_norm": 2.6812822818756104, "learning_rate": 1.5892255892255895e-05, "loss": 0.1857, "step": 244 }, { "epoch": 0.6186868686868687, "grad_norm": 3.470087766647339, "learning_rate": 1.5875420875420876e-05, "loss": 0.1377, "step": 245 }, { "epoch": 0.6212121212121212, "grad_norm": 2.309100866317749, "learning_rate": 1.585858585858586e-05, "loss": 0.22, "step": 246 }, { "epoch": 0.6237373737373737, "grad_norm": 5.392738342285156, "learning_rate": 1.584175084175084e-05, "loss": 0.1767, "step": 247 }, { "epoch": 0.6262626262626263, "grad_norm": 3.751511573791504, "learning_rate": 1.5824915824915825e-05, "loss": 0.1504, "step": 248 }, { "epoch": 0.6287878787878788, "grad_norm": 1.9343714714050293, "learning_rate": 1.580808080808081e-05, "loss": 0.2363, "step": 249 }, { "epoch": 0.6313131313131313, "grad_norm": 3.65728759765625, "learning_rate": 1.5791245791245794e-05, "loss": 0.3138, "step": 250 }, { "epoch": 0.6338383838383839, "grad_norm": 4.637652397155762, "learning_rate": 1.5774410774410775e-05, "loss": 0.1518, "step": 251 }, { "epoch": 0.6363636363636364, "grad_norm": 2.6128430366516113, "learning_rate": 1.575757575757576e-05, "loss": 0.137, "step": 252 }, { "epoch": 0.6388888888888888, "grad_norm": 2.5993456840515137, "learning_rate": 1.5740740740740744e-05, "loss": 0.2126, "step": 253 }, { "epoch": 0.6414141414141414, "grad_norm": 2.630402088165283, "learning_rate": 1.5723905723905725e-05, "loss": 0.1712, "step": 254 }, { "epoch": 0.6439393939393939, "grad_norm": 3.6941192150115967, "learning_rate": 1.570707070707071e-05, "loss": 0.251, "step": 255 }, { "epoch": 0.6464646464646465, "grad_norm": 3.1765594482421875, "learning_rate": 1.569023569023569e-05, "loss": 0.2738, "step": 256 }, { "epoch": 0.648989898989899, "grad_norm": 5.44793701171875, "learning_rate": 1.5673400673400674e-05, "loss": 0.1806, "step": 257 }, { "epoch": 0.6515151515151515, "grad_norm": 2.671917676925659, "learning_rate": 1.565656565656566e-05, "loss": 0.2302, "step": 258 }, { "epoch": 0.6540404040404041, "grad_norm": 3.816720485687256, "learning_rate": 1.563973063973064e-05, "loss": 0.2166, "step": 259 }, { "epoch": 0.6565656565656566, "grad_norm": 4.604842662811279, "learning_rate": 1.5622895622895624e-05, "loss": 0.1936, "step": 260 }, { "epoch": 0.6590909090909091, "grad_norm": 3.8842062950134277, "learning_rate": 1.5606060606060605e-05, "loss": 0.1321, "step": 261 }, { "epoch": 0.6616161616161617, "grad_norm": 4.19383430480957, "learning_rate": 1.558922558922559e-05, "loss": 0.2093, "step": 262 }, { "epoch": 0.6641414141414141, "grad_norm": 6.01501989364624, "learning_rate": 1.5572390572390574e-05, "loss": 0.281, "step": 263 }, { "epoch": 0.6666666666666666, "grad_norm": 3.173448324203491, "learning_rate": 1.555555555555556e-05, "loss": 0.2289, "step": 264 }, { "epoch": 0.6691919191919192, "grad_norm": 3.035527229309082, "learning_rate": 1.553872053872054e-05, "loss": 0.2046, "step": 265 }, { "epoch": 0.6717171717171717, "grad_norm": 4.2569684982299805, "learning_rate": 1.5521885521885524e-05, "loss": 0.3241, "step": 266 }, { "epoch": 0.6742424242424242, "grad_norm": 4.195226669311523, "learning_rate": 1.5505050505050508e-05, "loss": 0.1396, "step": 267 }, { "epoch": 0.6767676767676768, "grad_norm": 1.8019922971725464, "learning_rate": 1.548821548821549e-05, "loss": 0.1341, "step": 268 }, { "epoch": 0.6792929292929293, "grad_norm": 2.006047248840332, "learning_rate": 1.5471380471380473e-05, "loss": 0.2256, "step": 269 }, { "epoch": 0.6818181818181818, "grad_norm": 2.592977523803711, "learning_rate": 1.5454545454545454e-05, "loss": 0.1794, "step": 270 }, { "epoch": 0.6843434343434344, "grad_norm": 1.8588799238204956, "learning_rate": 1.543771043771044e-05, "loss": 0.1279, "step": 271 }, { "epoch": 0.6868686868686869, "grad_norm": 2.6189968585968018, "learning_rate": 1.5420875420875423e-05, "loss": 0.0616, "step": 272 }, { "epoch": 0.6893939393939394, "grad_norm": 1.683362603187561, "learning_rate": 1.5404040404040404e-05, "loss": 0.0476, "step": 273 }, { "epoch": 0.6919191919191919, "grad_norm": 2.88405179977417, "learning_rate": 1.538720538720539e-05, "loss": 0.1016, "step": 274 }, { "epoch": 0.6944444444444444, "grad_norm": 1.6002604961395264, "learning_rate": 1.537037037037037e-05, "loss": 0.1343, "step": 275 }, { "epoch": 0.696969696969697, "grad_norm": 1.0753880739212036, "learning_rate": 1.5353535353535354e-05, "loss": 0.1, "step": 276 }, { "epoch": 0.6994949494949495, "grad_norm": 3.1269478797912598, "learning_rate": 1.5336700336700338e-05, "loss": 0.136, "step": 277 }, { "epoch": 0.702020202020202, "grad_norm": 2.584567070007324, "learning_rate": 1.5319865319865323e-05, "loss": 0.2408, "step": 278 }, { "epoch": 0.7045454545454546, "grad_norm": 3.7829692363739014, "learning_rate": 1.5303030303030304e-05, "loss": 0.1797, "step": 279 }, { "epoch": 0.7070707070707071, "grad_norm": 1.9160706996917725, "learning_rate": 1.5286195286195288e-05, "loss": 0.1379, "step": 280 }, { "epoch": 0.7095959595959596, "grad_norm": 1.7192413806915283, "learning_rate": 1.5269360269360272e-05, "loss": 0.0992, "step": 281 }, { "epoch": 0.7121212121212122, "grad_norm": 1.8255947828292847, "learning_rate": 1.5252525252525255e-05, "loss": 0.1683, "step": 282 }, { "epoch": 0.7146464646464646, "grad_norm": 1.4913876056671143, "learning_rate": 1.5235690235690238e-05, "loss": 0.1559, "step": 283 }, { "epoch": 0.7171717171717171, "grad_norm": 1.4210553169250488, "learning_rate": 1.521885521885522e-05, "loss": 0.0947, "step": 284 }, { "epoch": 0.7196969696969697, "grad_norm": 2.0691561698913574, "learning_rate": 1.5202020202020203e-05, "loss": 0.1111, "step": 285 }, { "epoch": 0.7222222222222222, "grad_norm": 1.425347089767456, "learning_rate": 1.5185185185185187e-05, "loss": 0.1305, "step": 286 }, { "epoch": 0.7247474747474747, "grad_norm": 2.620968818664551, "learning_rate": 1.516835016835017e-05, "loss": 0.1324, "step": 287 }, { "epoch": 0.7272727272727273, "grad_norm": 1.2153469324111938, "learning_rate": 1.5151515151515153e-05, "loss": 0.1875, "step": 288 }, { "epoch": 0.7297979797979798, "grad_norm": 2.5912091732025146, "learning_rate": 1.5134680134680136e-05, "loss": 0.1125, "step": 289 }, { "epoch": 0.7323232323232324, "grad_norm": 3.2707126140594482, "learning_rate": 1.5117845117845118e-05, "loss": 0.175, "step": 290 }, { "epoch": 0.7348484848484849, "grad_norm": 2.4020352363586426, "learning_rate": 1.5101010101010103e-05, "loss": 0.1203, "step": 291 }, { "epoch": 0.7373737373737373, "grad_norm": 4.660423278808594, "learning_rate": 1.5084175084175085e-05, "loss": 0.2577, "step": 292 }, { "epoch": 0.73989898989899, "grad_norm": 5.82301139831543, "learning_rate": 1.5067340067340068e-05, "loss": 0.1374, "step": 293 }, { "epoch": 0.7424242424242424, "grad_norm": 1.974256992340088, "learning_rate": 1.505050505050505e-05, "loss": 0.1145, "step": 294 }, { "epoch": 0.7449494949494949, "grad_norm": 2.0848591327667236, "learning_rate": 1.5033670033670035e-05, "loss": 0.1168, "step": 295 }, { "epoch": 0.7474747474747475, "grad_norm": 2.9144437313079834, "learning_rate": 1.5016835016835018e-05, "loss": 0.2312, "step": 296 }, { "epoch": 0.75, "grad_norm": 4.225992202758789, "learning_rate": 1.5000000000000002e-05, "loss": 0.1782, "step": 297 }, { "epoch": 0.7525252525252525, "grad_norm": 4.229215145111084, "learning_rate": 1.4983164983164985e-05, "loss": 0.1512, "step": 298 }, { "epoch": 0.7550505050505051, "grad_norm": 2.8152060508728027, "learning_rate": 1.4966329966329967e-05, "loss": 0.1511, "step": 299 }, { "epoch": 0.7575757575757576, "grad_norm": 3.588789224624634, "learning_rate": 1.4949494949494952e-05, "loss": 0.2879, "step": 300 }, { "epoch": 0.76010101010101, "grad_norm": 3.0448029041290283, "learning_rate": 1.4932659932659934e-05, "loss": 0.1764, "step": 301 }, { "epoch": 0.7626262626262627, "grad_norm": 1.7650105953216553, "learning_rate": 1.4915824915824917e-05, "loss": 0.1692, "step": 302 }, { "epoch": 0.7651515151515151, "grad_norm": 1.2958582639694214, "learning_rate": 1.48989898989899e-05, "loss": 0.1425, "step": 303 }, { "epoch": 0.7676767676767676, "grad_norm": 2.6900827884674072, "learning_rate": 1.4882154882154884e-05, "loss": 0.2965, "step": 304 }, { "epoch": 0.7702020202020202, "grad_norm": 5.048685550689697, "learning_rate": 1.4865319865319867e-05, "loss": 0.2404, "step": 305 }, { "epoch": 0.7727272727272727, "grad_norm": 3.7027716636657715, "learning_rate": 1.484848484848485e-05, "loss": 0.148, "step": 306 }, { "epoch": 0.7752525252525253, "grad_norm": 4.220457553863525, "learning_rate": 1.4831649831649832e-05, "loss": 0.1583, "step": 307 }, { "epoch": 0.7777777777777778, "grad_norm": 3.0033810138702393, "learning_rate": 1.4814814814814815e-05, "loss": 0.2244, "step": 308 }, { "epoch": 0.7803030303030303, "grad_norm": 4.2939043045043945, "learning_rate": 1.47979797979798e-05, "loss": 0.1701, "step": 309 }, { "epoch": 0.7828282828282829, "grad_norm": 2.8431057929992676, "learning_rate": 1.4781144781144782e-05, "loss": 0.1284, "step": 310 }, { "epoch": 0.7853535353535354, "grad_norm": 1.8190436363220215, "learning_rate": 1.4764309764309765e-05, "loss": 0.22, "step": 311 }, { "epoch": 0.7878787878787878, "grad_norm": 4.867546558380127, "learning_rate": 1.4747474747474747e-05, "loss": 0.308, "step": 312 }, { "epoch": 0.7904040404040404, "grad_norm": 2.632307767868042, "learning_rate": 1.473063973063973e-05, "loss": 0.1137, "step": 313 }, { "epoch": 0.7929292929292929, "grad_norm": 5.3593339920043945, "learning_rate": 1.4713804713804716e-05, "loss": 0.1462, "step": 314 }, { "epoch": 0.7954545454545454, "grad_norm": 1.6120672225952148, "learning_rate": 1.4696969696969699e-05, "loss": 0.1291, "step": 315 }, { "epoch": 0.797979797979798, "grad_norm": 2.3134396076202393, "learning_rate": 1.4680134680134681e-05, "loss": 0.2119, "step": 316 }, { "epoch": 0.8005050505050505, "grad_norm": 3.2344558238983154, "learning_rate": 1.4663299663299664e-05, "loss": 0.1588, "step": 317 }, { "epoch": 0.803030303030303, "grad_norm": 3.4733057022094727, "learning_rate": 1.4646464646464649e-05, "loss": 0.1466, "step": 318 }, { "epoch": 0.8055555555555556, "grad_norm": 3.2476141452789307, "learning_rate": 1.4629629629629631e-05, "loss": 0.2466, "step": 319 }, { "epoch": 0.8080808080808081, "grad_norm": 5.198851108551025, "learning_rate": 1.4612794612794614e-05, "loss": 0.2239, "step": 320 }, { "epoch": 0.8106060606060606, "grad_norm": 2.9820196628570557, "learning_rate": 1.4595959595959597e-05, "loss": 0.2467, "step": 321 }, { "epoch": 0.8131313131313131, "grad_norm": 3.2972326278686523, "learning_rate": 1.457912457912458e-05, "loss": 0.1642, "step": 322 }, { "epoch": 0.8156565656565656, "grad_norm": 2.3504161834716797, "learning_rate": 1.4562289562289564e-05, "loss": 0.0984, "step": 323 }, { "epoch": 0.8181818181818182, "grad_norm": 4.491511821746826, "learning_rate": 1.4545454545454546e-05, "loss": 0.2576, "step": 324 }, { "epoch": 0.8207070707070707, "grad_norm": 3.1529364585876465, "learning_rate": 1.4528619528619529e-05, "loss": 0.1666, "step": 325 }, { "epoch": 0.8232323232323232, "grad_norm": 3.350400447845459, "learning_rate": 1.4511784511784512e-05, "loss": 0.2227, "step": 326 }, { "epoch": 0.8257575757575758, "grad_norm": 9.460111618041992, "learning_rate": 1.4494949494949494e-05, "loss": 0.2407, "step": 327 }, { "epoch": 0.8282828282828283, "grad_norm": 2.648740768432617, "learning_rate": 1.447811447811448e-05, "loss": 0.2017, "step": 328 }, { "epoch": 0.8308080808080808, "grad_norm": 2.6164615154266357, "learning_rate": 1.4461279461279463e-05, "loss": 0.1981, "step": 329 }, { "epoch": 0.8333333333333334, "grad_norm": 8.09026050567627, "learning_rate": 1.4444444444444446e-05, "loss": 0.1033, "step": 330 }, { "epoch": 0.8358585858585859, "grad_norm": 2.6296372413635254, "learning_rate": 1.4427609427609428e-05, "loss": 0.1782, "step": 331 }, { "epoch": 0.8383838383838383, "grad_norm": 2.2566778659820557, "learning_rate": 1.4410774410774413e-05, "loss": 0.1623, "step": 332 }, { "epoch": 0.8409090909090909, "grad_norm": 2.2079505920410156, "learning_rate": 1.4393939393939396e-05, "loss": 0.1723, "step": 333 }, { "epoch": 0.8434343434343434, "grad_norm": 1.1799554824829102, "learning_rate": 1.4377104377104378e-05, "loss": 0.1547, "step": 334 }, { "epoch": 0.8459595959595959, "grad_norm": 1.7930541038513184, "learning_rate": 1.4360269360269361e-05, "loss": 0.0783, "step": 335 }, { "epoch": 0.8484848484848485, "grad_norm": 1.4967056512832642, "learning_rate": 1.4343434343434344e-05, "loss": 0.1916, "step": 336 }, { "epoch": 0.851010101010101, "grad_norm": 4.922051906585693, "learning_rate": 1.4326599326599328e-05, "loss": 0.1304, "step": 337 }, { "epoch": 0.8535353535353535, "grad_norm": 2.2897162437438965, "learning_rate": 1.430976430976431e-05, "loss": 0.2447, "step": 338 }, { "epoch": 0.8560606060606061, "grad_norm": 2.769693613052368, "learning_rate": 1.4292929292929293e-05, "loss": 0.1017, "step": 339 }, { "epoch": 0.8585858585858586, "grad_norm": 1.7574080228805542, "learning_rate": 1.4276094276094276e-05, "loss": 0.0973, "step": 340 }, { "epoch": 0.8611111111111112, "grad_norm": 1.2174127101898193, "learning_rate": 1.4259259259259259e-05, "loss": 0.1903, "step": 341 }, { "epoch": 0.8636363636363636, "grad_norm": 3.2463648319244385, "learning_rate": 1.4242424242424245e-05, "loss": 0.0818, "step": 342 }, { "epoch": 0.8661616161616161, "grad_norm": 6.192782402038574, "learning_rate": 1.4225589225589227e-05, "loss": 0.2601, "step": 343 }, { "epoch": 0.8686868686868687, "grad_norm": 2.965963125228882, "learning_rate": 1.420875420875421e-05, "loss": 0.1399, "step": 344 }, { "epoch": 0.8712121212121212, "grad_norm": 2.0515079498291016, "learning_rate": 1.4191919191919193e-05, "loss": 0.2507, "step": 345 }, { "epoch": 0.8737373737373737, "grad_norm": 2.2152068614959717, "learning_rate": 1.4175084175084177e-05, "loss": 0.107, "step": 346 }, { "epoch": 0.8762626262626263, "grad_norm": 1.5435770750045776, "learning_rate": 1.415824915824916e-05, "loss": 0.1083, "step": 347 }, { "epoch": 0.8787878787878788, "grad_norm": 9.01554012298584, "learning_rate": 1.4141414141414143e-05, "loss": 0.1817, "step": 348 }, { "epoch": 0.8813131313131313, "grad_norm": 4.514248847961426, "learning_rate": 1.4124579124579125e-05, "loss": 0.111, "step": 349 }, { "epoch": 0.8838383838383839, "grad_norm": 2.0948216915130615, "learning_rate": 1.4107744107744108e-05, "loss": 0.141, "step": 350 }, { "epoch": 0.8863636363636364, "grad_norm": 1.3202215433120728, "learning_rate": 1.4090909090909092e-05, "loss": 0.122, "step": 351 }, { "epoch": 0.8888888888888888, "grad_norm": 2.4289798736572266, "learning_rate": 1.4074074074074075e-05, "loss": 0.1194, "step": 352 }, { "epoch": 0.8914141414141414, "grad_norm": 1.9062124490737915, "learning_rate": 1.4057239057239058e-05, "loss": 0.118, "step": 353 }, { "epoch": 0.8939393939393939, "grad_norm": 4.942960739135742, "learning_rate": 1.404040404040404e-05, "loss": 0.1745, "step": 354 }, { "epoch": 0.8964646464646465, "grad_norm": 1.1973398923873901, "learning_rate": 1.4023569023569023e-05, "loss": 0.0998, "step": 355 }, { "epoch": 0.898989898989899, "grad_norm": 2.537156343460083, "learning_rate": 1.4006734006734009e-05, "loss": 0.2053, "step": 356 }, { "epoch": 0.9015151515151515, "grad_norm": 1.6075160503387451, "learning_rate": 1.3989898989898992e-05, "loss": 0.1967, "step": 357 }, { "epoch": 0.9040404040404041, "grad_norm": 1.8782991170883179, "learning_rate": 1.3973063973063974e-05, "loss": 0.1067, "step": 358 }, { "epoch": 0.9065656565656566, "grad_norm": 1.8922234773635864, "learning_rate": 1.3956228956228957e-05, "loss": 0.1048, "step": 359 }, { "epoch": 0.9090909090909091, "grad_norm": 2.411635637283325, "learning_rate": 1.3939393939393942e-05, "loss": 0.1557, "step": 360 }, { "epoch": 0.9116161616161617, "grad_norm": 2.8136637210845947, "learning_rate": 1.3922558922558924e-05, "loss": 0.207, "step": 361 }, { "epoch": 0.9141414141414141, "grad_norm": 3.0127274990081787, "learning_rate": 1.3905723905723907e-05, "loss": 0.1463, "step": 362 }, { "epoch": 0.9166666666666666, "grad_norm": 5.223660469055176, "learning_rate": 1.388888888888889e-05, "loss": 0.1901, "step": 363 }, { "epoch": 0.9191919191919192, "grad_norm": 2.7952096462249756, "learning_rate": 1.3872053872053872e-05, "loss": 0.1744, "step": 364 }, { "epoch": 0.9217171717171717, "grad_norm": 4.329316139221191, "learning_rate": 1.3855218855218857e-05, "loss": 0.1503, "step": 365 }, { "epoch": 0.9242424242424242, "grad_norm": 1.9337060451507568, "learning_rate": 1.383838383838384e-05, "loss": 0.1652, "step": 366 }, { "epoch": 0.9267676767676768, "grad_norm": 6.0468645095825195, "learning_rate": 1.3821548821548822e-05, "loss": 0.21, "step": 367 }, { "epoch": 0.9292929292929293, "grad_norm": 6.893640041351318, "learning_rate": 1.3804713804713805e-05, "loss": 0.0772, "step": 368 }, { "epoch": 0.9318181818181818, "grad_norm": 11.513550758361816, "learning_rate": 1.378787878787879e-05, "loss": 0.1589, "step": 369 }, { "epoch": 0.9343434343434344, "grad_norm": 3.4360713958740234, "learning_rate": 1.3771043771043773e-05, "loss": 0.1376, "step": 370 }, { "epoch": 0.9368686868686869, "grad_norm": 1.2209364175796509, "learning_rate": 1.3754208754208756e-05, "loss": 0.1248, "step": 371 }, { "epoch": 0.9393939393939394, "grad_norm": 4.991886615753174, "learning_rate": 1.3737373737373739e-05, "loss": 0.1584, "step": 372 }, { "epoch": 0.9419191919191919, "grad_norm": 5.338135242462158, "learning_rate": 1.3720538720538721e-05, "loss": 0.1409, "step": 373 }, { "epoch": 0.9444444444444444, "grad_norm": 1.5582698583602905, "learning_rate": 1.3703703703703706e-05, "loss": 0.0927, "step": 374 }, { "epoch": 0.946969696969697, "grad_norm": 2.0467121601104736, "learning_rate": 1.3686868686868689e-05, "loss": 0.1042, "step": 375 }, { "epoch": 0.9494949494949495, "grad_norm": 3.5191733837127686, "learning_rate": 1.3670033670033671e-05, "loss": 0.1543, "step": 376 }, { "epoch": 0.952020202020202, "grad_norm": 0.8805956244468689, "learning_rate": 1.3653198653198654e-05, "loss": 0.0977, "step": 377 }, { "epoch": 0.9545454545454546, "grad_norm": 1.0059103965759277, "learning_rate": 1.3636363636363637e-05, "loss": 0.111, "step": 378 }, { "epoch": 0.9570707070707071, "grad_norm": 2.59409761428833, "learning_rate": 1.3619528619528621e-05, "loss": 0.0839, "step": 379 }, { "epoch": 0.9595959595959596, "grad_norm": 3.308858633041382, "learning_rate": 1.3602693602693604e-05, "loss": 0.1505, "step": 380 }, { "epoch": 0.9621212121212122, "grad_norm": 7.8376970291137695, "learning_rate": 1.3585858585858586e-05, "loss": 0.1694, "step": 381 }, { "epoch": 0.9646464646464646, "grad_norm": 2.214016914367676, "learning_rate": 1.3569023569023569e-05, "loss": 0.2274, "step": 382 }, { "epoch": 0.9671717171717171, "grad_norm": 2.3379106521606445, "learning_rate": 1.3552188552188555e-05, "loss": 0.1562, "step": 383 }, { "epoch": 0.9696969696969697, "grad_norm": 4.5499043464660645, "learning_rate": 1.3535353535353538e-05, "loss": 0.1899, "step": 384 }, { "epoch": 0.9722222222222222, "grad_norm": 5.938162803649902, "learning_rate": 1.351851851851852e-05, "loss": 0.1627, "step": 385 }, { "epoch": 0.9747474747474747, "grad_norm": 2.1362643241882324, "learning_rate": 1.3501683501683503e-05, "loss": 0.1437, "step": 386 }, { "epoch": 0.9772727272727273, "grad_norm": 5.690845012664795, "learning_rate": 1.3484848484848486e-05, "loss": 0.1572, "step": 387 }, { "epoch": 0.9797979797979798, "grad_norm": 1.170046329498291, "learning_rate": 1.346801346801347e-05, "loss": 0.0697, "step": 388 }, { "epoch": 0.9823232323232324, "grad_norm": 2.7204504013061523, "learning_rate": 1.3451178451178453e-05, "loss": 0.1125, "step": 389 }, { "epoch": 0.9848484848484849, "grad_norm": 2.044360637664795, "learning_rate": 1.3434343434343436e-05, "loss": 0.0664, "step": 390 }, { "epoch": 0.9873737373737373, "grad_norm": 2.956345558166504, "learning_rate": 1.3417508417508418e-05, "loss": 0.1254, "step": 391 }, { "epoch": 0.98989898989899, "grad_norm": 3.5149104595184326, "learning_rate": 1.3400673400673401e-05, "loss": 0.243, "step": 392 }, { "epoch": 0.9924242424242424, "grad_norm": 3.848884344100952, "learning_rate": 1.3383838383838385e-05, "loss": 0.2502, "step": 393 }, { "epoch": 0.9949494949494949, "grad_norm": 5.738306522369385, "learning_rate": 1.3367003367003368e-05, "loss": 0.1173, "step": 394 }, { "epoch": 0.9974747474747475, "grad_norm": 3.4760327339172363, "learning_rate": 1.335016835016835e-05, "loss": 0.251, "step": 395 }, { "epoch": 1.0, "grad_norm": 3.0074808597564697, "learning_rate": 1.3333333333333333e-05, "loss": 0.2484, "step": 396 }, { "epoch": 1.0, "eval_accuracy": 0.7363636363636363, "eval_f1": 0.8896302474284127, "eval_loss": 0.1552901417016983, "eval_runtime": 38.4517, "eval_samples_per_second": 22.886, "eval_steps_per_second": 0.494, "step": 396 }, { "epoch": 1.0025252525252526, "grad_norm": 2.1957874298095703, "learning_rate": 1.331649831649832e-05, "loss": 0.2159, "step": 397 }, { "epoch": 1.005050505050505, "grad_norm": 1.4837441444396973, "learning_rate": 1.3299663299663302e-05, "loss": 0.1777, "step": 398 }, { "epoch": 1.0075757575757576, "grad_norm": 1.9972238540649414, "learning_rate": 1.3282828282828285e-05, "loss": 0.1156, "step": 399 }, { "epoch": 1.0101010101010102, "grad_norm": 2.300161361694336, "learning_rate": 1.3265993265993267e-05, "loss": 0.1664, "step": 400 }, { "epoch": 1.0126262626262625, "grad_norm": 5.714986801147461, "learning_rate": 1.324915824915825e-05, "loss": 0.1888, "step": 401 }, { "epoch": 1.0151515151515151, "grad_norm": 2.443563222885132, "learning_rate": 1.3232323232323234e-05, "loss": 0.1548, "step": 402 }, { "epoch": 1.0176767676767677, "grad_norm": 4.469695091247559, "learning_rate": 1.3215488215488217e-05, "loss": 0.197, "step": 403 }, { "epoch": 1.02020202020202, "grad_norm": 2.7210092544555664, "learning_rate": 1.31986531986532e-05, "loss": 0.1858, "step": 404 }, { "epoch": 1.0227272727272727, "grad_norm": 2.2780253887176514, "learning_rate": 1.3181818181818183e-05, "loss": 0.1188, "step": 405 }, { "epoch": 1.0252525252525253, "grad_norm": 4.452651500701904, "learning_rate": 1.3164983164983165e-05, "loss": 0.1813, "step": 406 }, { "epoch": 1.0277777777777777, "grad_norm": 3.2592673301696777, "learning_rate": 1.314814814814815e-05, "loss": 0.1642, "step": 407 }, { "epoch": 1.0303030303030303, "grad_norm": 2.8365068435668945, "learning_rate": 1.3131313131313132e-05, "loss": 0.1951, "step": 408 }, { "epoch": 1.0328282828282829, "grad_norm": 1.204214334487915, "learning_rate": 1.3114478114478115e-05, "loss": 0.0943, "step": 409 }, { "epoch": 1.0353535353535352, "grad_norm": 3.9835519790649414, "learning_rate": 1.3097643097643098e-05, "loss": 0.1914, "step": 410 }, { "epoch": 1.0378787878787878, "grad_norm": 2.162397861480713, "learning_rate": 1.3080808080808084e-05, "loss": 0.1681, "step": 411 }, { "epoch": 1.0404040404040404, "grad_norm": 5.173393726348877, "learning_rate": 1.3063973063973066e-05, "loss": 0.1114, "step": 412 }, { "epoch": 1.0429292929292928, "grad_norm": 3.75376558303833, "learning_rate": 1.3047138047138049e-05, "loss": 0.2572, "step": 413 }, { "epoch": 1.0454545454545454, "grad_norm": 2.164644479751587, "learning_rate": 1.3030303030303032e-05, "loss": 0.0901, "step": 414 }, { "epoch": 1.047979797979798, "grad_norm": 1.5438755750656128, "learning_rate": 1.3013468013468014e-05, "loss": 0.1127, "step": 415 }, { "epoch": 1.0505050505050506, "grad_norm": 1.1772854328155518, "learning_rate": 1.2996632996632999e-05, "loss": 0.1177, "step": 416 }, { "epoch": 1.053030303030303, "grad_norm": 1.5766481161117554, "learning_rate": 1.2979797979797981e-05, "loss": 0.186, "step": 417 }, { "epoch": 1.0555555555555556, "grad_norm": 3.8098015785217285, "learning_rate": 1.2962962962962964e-05, "loss": 0.2378, "step": 418 }, { "epoch": 1.0580808080808082, "grad_norm": 2.2948317527770996, "learning_rate": 1.2946127946127947e-05, "loss": 0.2139, "step": 419 }, { "epoch": 1.0606060606060606, "grad_norm": 3.4112160205841064, "learning_rate": 1.2929292929292931e-05, "loss": 0.1936, "step": 420 }, { "epoch": 1.0631313131313131, "grad_norm": 6.034069061279297, "learning_rate": 1.2912457912457914e-05, "loss": 0.0954, "step": 421 }, { "epoch": 1.0656565656565657, "grad_norm": 1.4731574058532715, "learning_rate": 1.2895622895622897e-05, "loss": 0.1835, "step": 422 }, { "epoch": 1.0681818181818181, "grad_norm": 2.9212472438812256, "learning_rate": 1.287878787878788e-05, "loss": 0.1626, "step": 423 }, { "epoch": 1.0707070707070707, "grad_norm": 2.7289297580718994, "learning_rate": 1.2861952861952862e-05, "loss": 0.0934, "step": 424 }, { "epoch": 1.0732323232323233, "grad_norm": 2.0637686252593994, "learning_rate": 1.2845117845117846e-05, "loss": 0.1157, "step": 425 }, { "epoch": 1.0757575757575757, "grad_norm": 5.231685638427734, "learning_rate": 1.2828282828282829e-05, "loss": 0.1704, "step": 426 }, { "epoch": 1.0782828282828283, "grad_norm": 2.5837466716766357, "learning_rate": 1.2811447811447812e-05, "loss": 0.1756, "step": 427 }, { "epoch": 1.0808080808080809, "grad_norm": 1.4013397693634033, "learning_rate": 1.2794612794612794e-05, "loss": 0.0978, "step": 428 }, { "epoch": 1.0833333333333333, "grad_norm": 2.0431172847747803, "learning_rate": 1.2777777777777777e-05, "loss": 0.1109, "step": 429 }, { "epoch": 1.0858585858585859, "grad_norm": 1.8190507888793945, "learning_rate": 1.2760942760942763e-05, "loss": 0.1234, "step": 430 }, { "epoch": 1.0883838383838385, "grad_norm": 2.79791259765625, "learning_rate": 1.2744107744107746e-05, "loss": 0.1882, "step": 431 }, { "epoch": 1.0909090909090908, "grad_norm": 1.9243866205215454, "learning_rate": 1.2727272727272728e-05, "loss": 0.2005, "step": 432 }, { "epoch": 1.0934343434343434, "grad_norm": 1.8950653076171875, "learning_rate": 1.2710437710437711e-05, "loss": 0.1346, "step": 433 }, { "epoch": 1.095959595959596, "grad_norm": 3.081726551055908, "learning_rate": 1.2693602693602696e-05, "loss": 0.1366, "step": 434 }, { "epoch": 1.0984848484848484, "grad_norm": 1.9953432083129883, "learning_rate": 1.2676767676767678e-05, "loss": 0.1675, "step": 435 }, { "epoch": 1.101010101010101, "grad_norm": 1.969563603401184, "learning_rate": 1.2659932659932661e-05, "loss": 0.0675, "step": 436 }, { "epoch": 1.1035353535353536, "grad_norm": 2.647690773010254, "learning_rate": 1.2643097643097644e-05, "loss": 0.1804, "step": 437 }, { "epoch": 1.106060606060606, "grad_norm": 1.8048343658447266, "learning_rate": 1.2626262626262626e-05, "loss": 0.1463, "step": 438 }, { "epoch": 1.1085858585858586, "grad_norm": 3.305330514907837, "learning_rate": 1.260942760942761e-05, "loss": 0.1339, "step": 439 }, { "epoch": 1.1111111111111112, "grad_norm": 1.416746973991394, "learning_rate": 1.2592592592592593e-05, "loss": 0.1269, "step": 440 }, { "epoch": 1.1136363636363635, "grad_norm": 5.987617015838623, "learning_rate": 1.2575757575757576e-05, "loss": 0.1437, "step": 441 }, { "epoch": 1.1161616161616161, "grad_norm": 2.646730422973633, "learning_rate": 1.2558922558922559e-05, "loss": 0.1424, "step": 442 }, { "epoch": 1.1186868686868687, "grad_norm": 12.504968643188477, "learning_rate": 1.2542087542087541e-05, "loss": 0.2163, "step": 443 }, { "epoch": 1.121212121212121, "grad_norm": 3.575183153152466, "learning_rate": 1.2525252525252527e-05, "loss": 0.2721, "step": 444 }, { "epoch": 1.1237373737373737, "grad_norm": 2.08457088470459, "learning_rate": 1.250841750841751e-05, "loss": 0.0727, "step": 445 }, { "epoch": 1.1262626262626263, "grad_norm": 3.8432371616363525, "learning_rate": 1.2491582491582493e-05, "loss": 0.1455, "step": 446 }, { "epoch": 1.128787878787879, "grad_norm": 1.7455401420593262, "learning_rate": 1.2474747474747475e-05, "loss": 0.1424, "step": 447 }, { "epoch": 1.1313131313131313, "grad_norm": 2.4613749980926514, "learning_rate": 1.245791245791246e-05, "loss": 0.2304, "step": 448 }, { "epoch": 1.1338383838383839, "grad_norm": 0.8536304831504822, "learning_rate": 1.2441077441077443e-05, "loss": 0.0967, "step": 449 }, { "epoch": 1.1363636363636362, "grad_norm": 1.8662675619125366, "learning_rate": 1.2424242424242425e-05, "loss": 0.1165, "step": 450 }, { "epoch": 1.1388888888888888, "grad_norm": 6.983151435852051, "learning_rate": 1.2407407407407408e-05, "loss": 0.1065, "step": 451 }, { "epoch": 1.1414141414141414, "grad_norm": 3.3465776443481445, "learning_rate": 1.239057239057239e-05, "loss": 0.1295, "step": 452 }, { "epoch": 1.143939393939394, "grad_norm": 3.347223997116089, "learning_rate": 1.2373737373737375e-05, "loss": 0.0947, "step": 453 }, { "epoch": 1.1464646464646464, "grad_norm": 2.8548214435577393, "learning_rate": 1.2356902356902358e-05, "loss": 0.1031, "step": 454 }, { "epoch": 1.148989898989899, "grad_norm": 5.722323417663574, "learning_rate": 1.234006734006734e-05, "loss": 0.2435, "step": 455 }, { "epoch": 1.1515151515151516, "grad_norm": 4.030499458312988, "learning_rate": 1.2323232323232323e-05, "loss": 0.2088, "step": 456 }, { "epoch": 1.154040404040404, "grad_norm": 1.1742424964904785, "learning_rate": 1.2306397306397306e-05, "loss": 0.149, "step": 457 }, { "epoch": 1.1565656565656566, "grad_norm": 3.6498565673828125, "learning_rate": 1.2289562289562292e-05, "loss": 0.1704, "step": 458 }, { "epoch": 1.1590909090909092, "grad_norm": 7.278556823730469, "learning_rate": 1.2272727272727274e-05, "loss": 0.2622, "step": 459 }, { "epoch": 1.1616161616161615, "grad_norm": 5.127131462097168, "learning_rate": 1.2255892255892257e-05, "loss": 0.1456, "step": 460 }, { "epoch": 1.1641414141414141, "grad_norm": 5.602865219116211, "learning_rate": 1.223905723905724e-05, "loss": 0.2437, "step": 461 }, { "epoch": 1.1666666666666667, "grad_norm": 2.088778257369995, "learning_rate": 1.2222222222222224e-05, "loss": 0.1368, "step": 462 }, { "epoch": 1.1691919191919191, "grad_norm": 3.3843162059783936, "learning_rate": 1.2205387205387207e-05, "loss": 0.1656, "step": 463 }, { "epoch": 1.1717171717171717, "grad_norm": 6.630326271057129, "learning_rate": 1.218855218855219e-05, "loss": 0.1077, "step": 464 }, { "epoch": 1.1742424242424243, "grad_norm": 1.9552891254425049, "learning_rate": 1.2171717171717172e-05, "loss": 0.069, "step": 465 }, { "epoch": 1.1767676767676767, "grad_norm": 2.806879997253418, "learning_rate": 1.2154882154882155e-05, "loss": 0.1524, "step": 466 }, { "epoch": 1.1792929292929293, "grad_norm": 5.727405071258545, "learning_rate": 1.213804713804714e-05, "loss": 0.2153, "step": 467 }, { "epoch": 1.1818181818181819, "grad_norm": 2.179191827774048, "learning_rate": 1.2121212121212122e-05, "loss": 0.0931, "step": 468 }, { "epoch": 1.1843434343434343, "grad_norm": 4.29899263381958, "learning_rate": 1.2104377104377105e-05, "loss": 0.3174, "step": 469 }, { "epoch": 1.1868686868686869, "grad_norm": 3.667917490005493, "learning_rate": 1.2087542087542087e-05, "loss": 0.1926, "step": 470 }, { "epoch": 1.1893939393939394, "grad_norm": 2.626540422439575, "learning_rate": 1.207070707070707e-05, "loss": 0.1145, "step": 471 }, { "epoch": 1.1919191919191918, "grad_norm": 2.2297780513763428, "learning_rate": 1.2053872053872056e-05, "loss": 0.1552, "step": 472 }, { "epoch": 1.1944444444444444, "grad_norm": 5.271230220794678, "learning_rate": 1.2037037037037039e-05, "loss": 0.0832, "step": 473 }, { "epoch": 1.196969696969697, "grad_norm": 3.2105331420898438, "learning_rate": 1.2020202020202021e-05, "loss": 0.1523, "step": 474 }, { "epoch": 1.1994949494949494, "grad_norm": 3.91426157951355, "learning_rate": 1.2003367003367004e-05, "loss": 0.0819, "step": 475 }, { "epoch": 1.202020202020202, "grad_norm": 3.227917194366455, "learning_rate": 1.1986531986531988e-05, "loss": 0.2343, "step": 476 }, { "epoch": 1.2045454545454546, "grad_norm": 3.9899652004241943, "learning_rate": 1.1969696969696971e-05, "loss": 0.1466, "step": 477 }, { "epoch": 1.2070707070707072, "grad_norm": 1.5494883060455322, "learning_rate": 1.1952861952861954e-05, "loss": 0.1265, "step": 478 }, { "epoch": 1.2095959595959596, "grad_norm": 3.4442131519317627, "learning_rate": 1.1936026936026937e-05, "loss": 0.1355, "step": 479 }, { "epoch": 1.2121212121212122, "grad_norm": 2.505126714706421, "learning_rate": 1.191919191919192e-05, "loss": 0.1529, "step": 480 }, { "epoch": 1.2146464646464645, "grad_norm": 3.982832431793213, "learning_rate": 1.1902356902356904e-05, "loss": 0.1428, "step": 481 }, { "epoch": 1.2171717171717171, "grad_norm": 2.3964552879333496, "learning_rate": 1.1885521885521886e-05, "loss": 0.1062, "step": 482 }, { "epoch": 1.2196969696969697, "grad_norm": 11.200109481811523, "learning_rate": 1.1868686868686869e-05, "loss": 0.1621, "step": 483 }, { "epoch": 1.2222222222222223, "grad_norm": 3.8272476196289062, "learning_rate": 1.1851851851851852e-05, "loss": 0.1814, "step": 484 }, { "epoch": 1.2247474747474747, "grad_norm": 2.3707261085510254, "learning_rate": 1.1835016835016838e-05, "loss": 0.1253, "step": 485 }, { "epoch": 1.2272727272727273, "grad_norm": 6.070870399475098, "learning_rate": 1.181818181818182e-05, "loss": 0.1005, "step": 486 }, { "epoch": 1.22979797979798, "grad_norm": 2.2206804752349854, "learning_rate": 1.1801346801346803e-05, "loss": 0.0647, "step": 487 }, { "epoch": 1.2323232323232323, "grad_norm": 1.8281488418579102, "learning_rate": 1.1784511784511786e-05, "loss": 0.0642, "step": 488 }, { "epoch": 1.2348484848484849, "grad_norm": 2.690546751022339, "learning_rate": 1.1767676767676768e-05, "loss": 0.1824, "step": 489 }, { "epoch": 1.2373737373737375, "grad_norm": 8.035049438476562, "learning_rate": 1.1750841750841753e-05, "loss": 0.2047, "step": 490 }, { "epoch": 1.2398989898989898, "grad_norm": 2.475928783416748, "learning_rate": 1.1734006734006735e-05, "loss": 0.0882, "step": 491 }, { "epoch": 1.2424242424242424, "grad_norm": 2.2812142372131348, "learning_rate": 1.1717171717171718e-05, "loss": 0.1556, "step": 492 }, { "epoch": 1.244949494949495, "grad_norm": 1.1551276445388794, "learning_rate": 1.17003367003367e-05, "loss": 0.0905, "step": 493 }, { "epoch": 1.2474747474747474, "grad_norm": 1.8045101165771484, "learning_rate": 1.1683501683501684e-05, "loss": 0.1399, "step": 494 }, { "epoch": 1.25, "grad_norm": 2.3668212890625, "learning_rate": 1.1666666666666668e-05, "loss": 0.1231, "step": 495 }, { "epoch": 1.2525252525252526, "grad_norm": 4.466938018798828, "learning_rate": 1.164983164983165e-05, "loss": 0.1478, "step": 496 }, { "epoch": 1.255050505050505, "grad_norm": 6.409605026245117, "learning_rate": 1.1632996632996633e-05, "loss": 0.0892, "step": 497 }, { "epoch": 1.2575757575757576, "grad_norm": 3.112314224243164, "learning_rate": 1.1616161616161616e-05, "loss": 0.2378, "step": 498 }, { "epoch": 1.2601010101010102, "grad_norm": 2.3682100772857666, "learning_rate": 1.1599326599326602e-05, "loss": 0.0708, "step": 499 }, { "epoch": 1.2626262626262625, "grad_norm": 1.4351693391799927, "learning_rate": 1.1582491582491585e-05, "loss": 0.1386, "step": 500 }, { "epoch": 1.2651515151515151, "grad_norm": 1.9613323211669922, "learning_rate": 1.1565656565656567e-05, "loss": 0.0956, "step": 501 }, { "epoch": 1.2676767676767677, "grad_norm": 3.0842106342315674, "learning_rate": 1.154882154882155e-05, "loss": 0.1891, "step": 502 }, { "epoch": 1.2702020202020203, "grad_norm": 1.4221595525741577, "learning_rate": 1.1531986531986533e-05, "loss": 0.1238, "step": 503 }, { "epoch": 1.2727272727272727, "grad_norm": 2.890872001647949, "learning_rate": 1.1515151515151517e-05, "loss": 0.1804, "step": 504 }, { "epoch": 1.2752525252525253, "grad_norm": 2.4430460929870605, "learning_rate": 1.14983164983165e-05, "loss": 0.1566, "step": 505 }, { "epoch": 1.2777777777777777, "grad_norm": 4.546942710876465, "learning_rate": 1.1481481481481482e-05, "loss": 0.088, "step": 506 }, { "epoch": 1.2803030303030303, "grad_norm": 3.6536505222320557, "learning_rate": 1.1464646464646465e-05, "loss": 0.2054, "step": 507 }, { "epoch": 1.2828282828282829, "grad_norm": 1.6071276664733887, "learning_rate": 1.1447811447811448e-05, "loss": 0.1044, "step": 508 }, { "epoch": 1.2853535353535355, "grad_norm": 3.828359365463257, "learning_rate": 1.1430976430976432e-05, "loss": 0.1279, "step": 509 }, { "epoch": 1.2878787878787878, "grad_norm": 3.433269500732422, "learning_rate": 1.1414141414141415e-05, "loss": 0.1197, "step": 510 }, { "epoch": 1.2904040404040404, "grad_norm": 1.6743693351745605, "learning_rate": 1.1397306397306398e-05, "loss": 0.1057, "step": 511 }, { "epoch": 1.2929292929292928, "grad_norm": 5.027686595916748, "learning_rate": 1.138047138047138e-05, "loss": 0.1509, "step": 512 }, { "epoch": 1.2954545454545454, "grad_norm": 1.6235765218734741, "learning_rate": 1.1363636363636366e-05, "loss": 0.1147, "step": 513 }, { "epoch": 1.297979797979798, "grad_norm": 2.351604700088501, "learning_rate": 1.1346801346801349e-05, "loss": 0.0844, "step": 514 }, { "epoch": 1.3005050505050506, "grad_norm": 4.877485275268555, "learning_rate": 1.1329966329966332e-05, "loss": 0.1426, "step": 515 }, { "epoch": 1.303030303030303, "grad_norm": 1.512285828590393, "learning_rate": 1.1313131313131314e-05, "loss": 0.1225, "step": 516 }, { "epoch": 1.3055555555555556, "grad_norm": 2.247408628463745, "learning_rate": 1.1296296296296297e-05, "loss": 0.105, "step": 517 }, { "epoch": 1.308080808080808, "grad_norm": 1.3952003717422485, "learning_rate": 1.1279461279461281e-05, "loss": 0.0738, "step": 518 }, { "epoch": 1.3106060606060606, "grad_norm": 1.185707688331604, "learning_rate": 1.1262626262626264e-05, "loss": 0.0644, "step": 519 }, { "epoch": 1.3131313131313131, "grad_norm": 1.2581713199615479, "learning_rate": 1.1245791245791247e-05, "loss": 0.1065, "step": 520 }, { "epoch": 1.3156565656565657, "grad_norm": 5.530824661254883, "learning_rate": 1.122895622895623e-05, "loss": 0.1829, "step": 521 }, { "epoch": 1.3181818181818181, "grad_norm": 2.5609781742095947, "learning_rate": 1.1212121212121212e-05, "loss": 0.2349, "step": 522 }, { "epoch": 1.3207070707070707, "grad_norm": 2.8253445625305176, "learning_rate": 1.1195286195286197e-05, "loss": 0.1403, "step": 523 }, { "epoch": 1.3232323232323233, "grad_norm": 4.705146312713623, "learning_rate": 1.117845117845118e-05, "loss": 0.2046, "step": 524 }, { "epoch": 1.3257575757575757, "grad_norm": 4.86195182800293, "learning_rate": 1.1161616161616162e-05, "loss": 0.1632, "step": 525 }, { "epoch": 1.3282828282828283, "grad_norm": 2.6909475326538086, "learning_rate": 1.1144781144781145e-05, "loss": 0.0647, "step": 526 }, { "epoch": 1.3308080808080809, "grad_norm": 1.6233677864074707, "learning_rate": 1.112794612794613e-05, "loss": 0.1891, "step": 527 }, { "epoch": 1.3333333333333333, "grad_norm": 1.8622492551803589, "learning_rate": 1.1111111111111113e-05, "loss": 0.1507, "step": 528 }, { "epoch": 1.3358585858585859, "grad_norm": 3.173917770385742, "learning_rate": 1.1094276094276096e-05, "loss": 0.1618, "step": 529 }, { "epoch": 1.3383838383838385, "grad_norm": 5.947041034698486, "learning_rate": 1.1077441077441079e-05, "loss": 0.1692, "step": 530 }, { "epoch": 1.3409090909090908, "grad_norm": 2.8621153831481934, "learning_rate": 1.1060606060606061e-05, "loss": 0.218, "step": 531 }, { "epoch": 1.3434343434343434, "grad_norm": 3.8391976356506348, "learning_rate": 1.1043771043771046e-05, "loss": 0.2624, "step": 532 }, { "epoch": 1.345959595959596, "grad_norm": 4.155307769775391, "learning_rate": 1.1026936026936028e-05, "loss": 0.1923, "step": 533 }, { "epoch": 1.3484848484848486, "grad_norm": 2.796172618865967, "learning_rate": 1.1010101010101011e-05, "loss": 0.1886, "step": 534 }, { "epoch": 1.351010101010101, "grad_norm": 3.59019136428833, "learning_rate": 1.0993265993265994e-05, "loss": 0.1854, "step": 535 }, { "epoch": 1.3535353535353536, "grad_norm": 2.077014684677124, "learning_rate": 1.0976430976430978e-05, "loss": 0.0919, "step": 536 }, { "epoch": 1.356060606060606, "grad_norm": 2.869927167892456, "learning_rate": 1.0959595959595961e-05, "loss": 0.1828, "step": 537 }, { "epoch": 1.3585858585858586, "grad_norm": 2.4379348754882812, "learning_rate": 1.0942760942760944e-05, "loss": 0.1257, "step": 538 }, { "epoch": 1.3611111111111112, "grad_norm": 2.5572493076324463, "learning_rate": 1.0925925925925926e-05, "loss": 0.169, "step": 539 }, { "epoch": 1.3636363636363638, "grad_norm": 7.126609802246094, "learning_rate": 1.0909090909090909e-05, "loss": 0.1178, "step": 540 }, { "epoch": 1.3661616161616161, "grad_norm": 3.068500280380249, "learning_rate": 1.0892255892255893e-05, "loss": 0.1507, "step": 541 }, { "epoch": 1.3686868686868687, "grad_norm": 3.512056350708008, "learning_rate": 1.0875420875420876e-05, "loss": 0.1816, "step": 542 }, { "epoch": 1.371212121212121, "grad_norm": 4.917716979980469, "learning_rate": 1.0858585858585859e-05, "loss": 0.1301, "step": 543 }, { "epoch": 1.3737373737373737, "grad_norm": 2.207784414291382, "learning_rate": 1.0841750841750841e-05, "loss": 0.1251, "step": 544 }, { "epoch": 1.3762626262626263, "grad_norm": 4.091345310211182, "learning_rate": 1.0824915824915824e-05, "loss": 0.2324, "step": 545 }, { "epoch": 1.378787878787879, "grad_norm": 3.5930373668670654, "learning_rate": 1.080808080808081e-05, "loss": 0.2327, "step": 546 }, { "epoch": 1.3813131313131313, "grad_norm": 0.9397197365760803, "learning_rate": 1.0791245791245793e-05, "loss": 0.0909, "step": 547 }, { "epoch": 1.3838383838383839, "grad_norm": 1.6392264366149902, "learning_rate": 1.0774410774410775e-05, "loss": 0.065, "step": 548 }, { "epoch": 1.3863636363636362, "grad_norm": 3.9621989727020264, "learning_rate": 1.0757575757575758e-05, "loss": 0.0888, "step": 549 }, { "epoch": 1.3888888888888888, "grad_norm": 1.8630791902542114, "learning_rate": 1.0740740740740742e-05, "loss": 0.0705, "step": 550 }, { "epoch": 1.3914141414141414, "grad_norm": 2.29435133934021, "learning_rate": 1.0723905723905725e-05, "loss": 0.1626, "step": 551 }, { "epoch": 1.393939393939394, "grad_norm": 3.3439769744873047, "learning_rate": 1.0707070707070708e-05, "loss": 0.1741, "step": 552 }, { "epoch": 1.3964646464646464, "grad_norm": 3.894381523132324, "learning_rate": 1.069023569023569e-05, "loss": 0.2404, "step": 553 }, { "epoch": 1.398989898989899, "grad_norm": 2.4891560077667236, "learning_rate": 1.0673400673400673e-05, "loss": 0.1854, "step": 554 }, { "epoch": 1.4015151515151514, "grad_norm": 2.0606627464294434, "learning_rate": 1.0656565656565658e-05, "loss": 0.1896, "step": 555 }, { "epoch": 1.404040404040404, "grad_norm": 1.3142637014389038, "learning_rate": 1.063973063973064e-05, "loss": 0.0976, "step": 556 }, { "epoch": 1.4065656565656566, "grad_norm": 1.7551708221435547, "learning_rate": 1.0622895622895623e-05, "loss": 0.1013, "step": 557 }, { "epoch": 1.4090909090909092, "grad_norm": 2.389742612838745, "learning_rate": 1.0606060606060606e-05, "loss": 0.0802, "step": 558 }, { "epoch": 1.4116161616161615, "grad_norm": 5.079484462738037, "learning_rate": 1.0589225589225588e-05, "loss": 0.1066, "step": 559 }, { "epoch": 1.4141414141414141, "grad_norm": 1.7105693817138672, "learning_rate": 1.0572390572390574e-05, "loss": 0.0917, "step": 560 }, { "epoch": 1.4166666666666667, "grad_norm": 2.481248617172241, "learning_rate": 1.0555555555555557e-05, "loss": 0.0901, "step": 561 }, { "epoch": 1.4191919191919191, "grad_norm": 4.0751495361328125, "learning_rate": 1.053872053872054e-05, "loss": 0.1493, "step": 562 }, { "epoch": 1.4217171717171717, "grad_norm": 2.6854546070098877, "learning_rate": 1.0521885521885522e-05, "loss": 0.1751, "step": 563 }, { "epoch": 1.4242424242424243, "grad_norm": 7.801976203918457, "learning_rate": 1.0505050505050507e-05, "loss": 0.09, "step": 564 }, { "epoch": 1.4267676767676767, "grad_norm": 1.9461811780929565, "learning_rate": 1.048821548821549e-05, "loss": 0.0539, "step": 565 }, { "epoch": 1.4292929292929293, "grad_norm": 1.0220575332641602, "learning_rate": 1.0471380471380472e-05, "loss": 0.0629, "step": 566 }, { "epoch": 1.4318181818181819, "grad_norm": 3.8231167793273926, "learning_rate": 1.0454545454545455e-05, "loss": 0.0949, "step": 567 }, { "epoch": 1.4343434343434343, "grad_norm": 4.782219886779785, "learning_rate": 1.0437710437710438e-05, "loss": 0.2014, "step": 568 }, { "epoch": 1.4368686868686869, "grad_norm": 1.7311866283416748, "learning_rate": 1.0420875420875422e-05, "loss": 0.1586, "step": 569 }, { "epoch": 1.4393939393939394, "grad_norm": 1.6415760517120361, "learning_rate": 1.0404040404040405e-05, "loss": 0.0832, "step": 570 }, { "epoch": 1.441919191919192, "grad_norm": 3.1272056102752686, "learning_rate": 1.0387205387205387e-05, "loss": 0.1086, "step": 571 }, { "epoch": 1.4444444444444444, "grad_norm": 1.3914761543273926, "learning_rate": 1.037037037037037e-05, "loss": 0.0353, "step": 572 }, { "epoch": 1.446969696969697, "grad_norm": 2.641190528869629, "learning_rate": 1.0353535353535353e-05, "loss": 0.1669, "step": 573 }, { "epoch": 1.4494949494949494, "grad_norm": 1.974168300628662, "learning_rate": 1.0336700336700339e-05, "loss": 0.1607, "step": 574 }, { "epoch": 1.452020202020202, "grad_norm": 1.5384374856948853, "learning_rate": 1.0319865319865321e-05, "loss": 0.066, "step": 575 }, { "epoch": 1.4545454545454546, "grad_norm": 2.555971145629883, "learning_rate": 1.0303030303030304e-05, "loss": 0.1178, "step": 576 }, { "epoch": 1.4570707070707072, "grad_norm": 3.460545301437378, "learning_rate": 1.0286195286195287e-05, "loss": 0.163, "step": 577 }, { "epoch": 1.4595959595959596, "grad_norm": 2.9746346473693848, "learning_rate": 1.0269360269360271e-05, "loss": 0.2179, "step": 578 }, { "epoch": 1.4621212121212122, "grad_norm": 1.8450326919555664, "learning_rate": 1.0252525252525254e-05, "loss": 0.0707, "step": 579 }, { "epoch": 1.4646464646464645, "grad_norm": 1.596994400024414, "learning_rate": 1.0235690235690236e-05, "loss": 0.0712, "step": 580 }, { "epoch": 1.4671717171717171, "grad_norm": 2.0924813747406006, "learning_rate": 1.021885521885522e-05, "loss": 0.1328, "step": 581 }, { "epoch": 1.4696969696969697, "grad_norm": 2.734872579574585, "learning_rate": 1.0202020202020202e-05, "loss": 0.2434, "step": 582 }, { "epoch": 1.4722222222222223, "grad_norm": 2.7146146297454834, "learning_rate": 1.0185185185185186e-05, "loss": 0.1953, "step": 583 }, { "epoch": 1.4747474747474747, "grad_norm": 2.9375946521759033, "learning_rate": 1.0168350168350169e-05, "loss": 0.1656, "step": 584 }, { "epoch": 1.4772727272727273, "grad_norm": 2.132500648498535, "learning_rate": 1.0151515151515152e-05, "loss": 0.112, "step": 585 }, { "epoch": 1.4797979797979797, "grad_norm": 2.179478883743286, "learning_rate": 1.0134680134680134e-05, "loss": 0.0973, "step": 586 }, { "epoch": 1.4823232323232323, "grad_norm": 3.4565017223358154, "learning_rate": 1.0117845117845117e-05, "loss": 0.1256, "step": 587 }, { "epoch": 1.4848484848484849, "grad_norm": 1.9032288789749146, "learning_rate": 1.0101010101010103e-05, "loss": 0.0915, "step": 588 }, { "epoch": 1.4873737373737375, "grad_norm": 8.383233070373535, "learning_rate": 1.0084175084175086e-05, "loss": 0.1538, "step": 589 }, { "epoch": 1.4898989898989898, "grad_norm": 4.910621166229248, "learning_rate": 1.0067340067340068e-05, "loss": 0.1799, "step": 590 }, { "epoch": 1.4924242424242424, "grad_norm": 2.6224441528320312, "learning_rate": 1.0050505050505051e-05, "loss": 0.155, "step": 591 }, { "epoch": 1.494949494949495, "grad_norm": 3.4021310806274414, "learning_rate": 1.0033670033670035e-05, "loss": 0.1196, "step": 592 }, { "epoch": 1.4974747474747474, "grad_norm": 2.7120611667633057, "learning_rate": 1.0016835016835018e-05, "loss": 0.08, "step": 593 }, { "epoch": 1.5, "grad_norm": 1.137710452079773, "learning_rate": 1e-05, "loss": 0.078, "step": 594 }, { "epoch": 1.5025252525252526, "grad_norm": 2.9225590229034424, "learning_rate": 9.983164983164983e-06, "loss": 0.2341, "step": 595 }, { "epoch": 1.5050505050505052, "grad_norm": 1.6335861682891846, "learning_rate": 9.966329966329968e-06, "loss": 0.1192, "step": 596 }, { "epoch": 1.5075757575757576, "grad_norm": 2.710495948791504, "learning_rate": 9.94949494949495e-06, "loss": 0.1237, "step": 597 }, { "epoch": 1.51010101010101, "grad_norm": 2.903191328048706, "learning_rate": 9.932659932659933e-06, "loss": 0.0784, "step": 598 }, { "epoch": 1.5126262626262625, "grad_norm": 3.359354019165039, "learning_rate": 9.915824915824916e-06, "loss": 0.2288, "step": 599 }, { "epoch": 1.5151515151515151, "grad_norm": 2.92893648147583, "learning_rate": 9.8989898989899e-06, "loss": 0.1936, "step": 600 }, { "epoch": 1.5176767676767677, "grad_norm": 3.7757456302642822, "learning_rate": 9.882154882154883e-06, "loss": 0.2431, "step": 601 }, { "epoch": 1.5202020202020203, "grad_norm": 2.7293543815612793, "learning_rate": 9.865319865319866e-06, "loss": 0.1475, "step": 602 }, { "epoch": 1.5227272727272727, "grad_norm": 4.0022873878479, "learning_rate": 9.84848484848485e-06, "loss": 0.1055, "step": 603 }, { "epoch": 1.5252525252525253, "grad_norm": 4.107253074645996, "learning_rate": 9.831649831649833e-06, "loss": 0.1306, "step": 604 }, { "epoch": 1.5277777777777777, "grad_norm": 2.5653955936431885, "learning_rate": 9.814814814814815e-06, "loss": 0.1317, "step": 605 }, { "epoch": 1.5303030303030303, "grad_norm": 2.9474546909332275, "learning_rate": 9.797979797979798e-06, "loss": 0.1486, "step": 606 }, { "epoch": 1.5328282828282829, "grad_norm": 1.209354043006897, "learning_rate": 9.781144781144782e-06, "loss": 0.1019, "step": 607 }, { "epoch": 1.5353535353535355, "grad_norm": 2.3573384284973145, "learning_rate": 9.764309764309765e-06, "loss": 0.0792, "step": 608 }, { "epoch": 1.5378787878787878, "grad_norm": 2.1612727642059326, "learning_rate": 9.747474747474748e-06, "loss": 0.1402, "step": 609 }, { "epoch": 1.5404040404040404, "grad_norm": 1.7895665168762207, "learning_rate": 9.730639730639732e-06, "loss": 0.118, "step": 610 }, { "epoch": 1.5429292929292928, "grad_norm": 1.4610426425933838, "learning_rate": 9.713804713804715e-06, "loss": 0.1428, "step": 611 }, { "epoch": 1.5454545454545454, "grad_norm": 2.2483487129211426, "learning_rate": 9.696969696969698e-06, "loss": 0.1732, "step": 612 }, { "epoch": 1.547979797979798, "grad_norm": 5.811710834503174, "learning_rate": 9.68013468013468e-06, "loss": 0.1112, "step": 613 }, { "epoch": 1.5505050505050506, "grad_norm": 6.1415815353393555, "learning_rate": 9.663299663299665e-06, "loss": 0.138, "step": 614 }, { "epoch": 1.553030303030303, "grad_norm": 1.204952597618103, "learning_rate": 9.646464646464647e-06, "loss": 0.0998, "step": 615 }, { "epoch": 1.5555555555555556, "grad_norm": 2.5513834953308105, "learning_rate": 9.62962962962963e-06, "loss": 0.0971, "step": 616 }, { "epoch": 1.558080808080808, "grad_norm": 4.2005181312561035, "learning_rate": 9.612794612794614e-06, "loss": 0.1096, "step": 617 }, { "epoch": 1.5606060606060606, "grad_norm": 2.5134921073913574, "learning_rate": 9.595959595959597e-06, "loss": 0.1817, "step": 618 }, { "epoch": 1.5631313131313131, "grad_norm": 3.6018764972686768, "learning_rate": 9.57912457912458e-06, "loss": 0.0849, "step": 619 }, { "epoch": 1.5656565656565657, "grad_norm": 1.6318095922470093, "learning_rate": 9.562289562289562e-06, "loss": 0.0661, "step": 620 }, { "epoch": 1.5681818181818183, "grad_norm": 3.3563179969787598, "learning_rate": 9.545454545454547e-06, "loss": 0.1141, "step": 621 }, { "epoch": 1.5707070707070707, "grad_norm": 2.074086904525757, "learning_rate": 9.52861952861953e-06, "loss": 0.1207, "step": 622 }, { "epoch": 1.573232323232323, "grad_norm": 2.5464348793029785, "learning_rate": 9.511784511784512e-06, "loss": 0.1951, "step": 623 }, { "epoch": 1.5757575757575757, "grad_norm": 5.284518718719482, "learning_rate": 9.494949494949497e-06, "loss": 0.1868, "step": 624 }, { "epoch": 1.5782828282828283, "grad_norm": 2.5765862464904785, "learning_rate": 9.47811447811448e-06, "loss": 0.187, "step": 625 }, { "epoch": 1.5808080808080809, "grad_norm": 4.491573333740234, "learning_rate": 9.461279461279462e-06, "loss": 0.1033, "step": 626 }, { "epoch": 1.5833333333333335, "grad_norm": 4.794037818908691, "learning_rate": 9.444444444444445e-06, "loss": 0.1638, "step": 627 }, { "epoch": 1.5858585858585859, "grad_norm": 1.3392722606658936, "learning_rate": 9.427609427609429e-06, "loss": 0.0673, "step": 628 }, { "epoch": 1.5883838383838382, "grad_norm": 2.59481143951416, "learning_rate": 9.410774410774412e-06, "loss": 0.1506, "step": 629 }, { "epoch": 1.5909090909090908, "grad_norm": 1.926398754119873, "learning_rate": 9.393939393939396e-06, "loss": 0.0817, "step": 630 }, { "epoch": 1.5934343434343434, "grad_norm": 3.796034812927246, "learning_rate": 9.377104377104379e-06, "loss": 0.1526, "step": 631 }, { "epoch": 1.595959595959596, "grad_norm": 3.06642484664917, "learning_rate": 9.360269360269361e-06, "loss": 0.158, "step": 632 }, { "epoch": 1.5984848484848486, "grad_norm": 2.3332364559173584, "learning_rate": 9.343434343434344e-06, "loss": 0.1412, "step": 633 }, { "epoch": 1.601010101010101, "grad_norm": 13.372260093688965, "learning_rate": 9.326599326599327e-06, "loss": 0.0737, "step": 634 }, { "epoch": 1.6035353535353534, "grad_norm": 2.744684934616089, "learning_rate": 9.309764309764311e-06, "loss": 0.1845, "step": 635 }, { "epoch": 1.606060606060606, "grad_norm": 4.262907981872559, "learning_rate": 9.292929292929294e-06, "loss": 0.2397, "step": 636 }, { "epoch": 1.6085858585858586, "grad_norm": 2.6066222190856934, "learning_rate": 9.276094276094278e-06, "loss": 0.0889, "step": 637 }, { "epoch": 1.6111111111111112, "grad_norm": 5.02886962890625, "learning_rate": 9.25925925925926e-06, "loss": 0.3094, "step": 638 }, { "epoch": 1.6136363636363638, "grad_norm": 1.2655010223388672, "learning_rate": 9.242424242424244e-06, "loss": 0.1043, "step": 639 }, { "epoch": 1.6161616161616161, "grad_norm": 2.1592676639556885, "learning_rate": 9.225589225589226e-06, "loss": 0.1541, "step": 640 }, { "epoch": 1.6186868686868687, "grad_norm": 1.4674041271209717, "learning_rate": 9.208754208754209e-06, "loss": 0.0803, "step": 641 }, { "epoch": 1.621212121212121, "grad_norm": 1.3324946165084839, "learning_rate": 9.191919191919193e-06, "loss": 0.1697, "step": 642 }, { "epoch": 1.6237373737373737, "grad_norm": 4.259162902832031, "learning_rate": 9.175084175084176e-06, "loss": 0.1512, "step": 643 }, { "epoch": 1.6262626262626263, "grad_norm": 1.390676498413086, "learning_rate": 9.15824915824916e-06, "loss": 0.0868, "step": 644 }, { "epoch": 1.628787878787879, "grad_norm": 2.026618242263794, "learning_rate": 9.141414141414143e-06, "loss": 0.0679, "step": 645 }, { "epoch": 1.6313131313131313, "grad_norm": 3.238002061843872, "learning_rate": 9.124579124579126e-06, "loss": 0.1706, "step": 646 }, { "epoch": 1.6338383838383839, "grad_norm": 1.8931351900100708, "learning_rate": 9.107744107744108e-06, "loss": 0.0645, "step": 647 }, { "epoch": 1.6363636363636362, "grad_norm": 1.5486174821853638, "learning_rate": 9.090909090909091e-06, "loss": 0.139, "step": 648 }, { "epoch": 1.6388888888888888, "grad_norm": 1.988709807395935, "learning_rate": 9.074074074074075e-06, "loss": 0.2034, "step": 649 }, { "epoch": 1.6414141414141414, "grad_norm": 2.529951572418213, "learning_rate": 9.057239057239058e-06, "loss": 0.0846, "step": 650 }, { "epoch": 1.643939393939394, "grad_norm": 6.469368934631348, "learning_rate": 9.040404040404042e-06, "loss": 0.1614, "step": 651 }, { "epoch": 1.6464646464646466, "grad_norm": 1.5296707153320312, "learning_rate": 9.023569023569025e-06, "loss": 0.074, "step": 652 }, { "epoch": 1.648989898989899, "grad_norm": 3.4863650798797607, "learning_rate": 9.006734006734008e-06, "loss": 0.1207, "step": 653 }, { "epoch": 1.6515151515151514, "grad_norm": 4.34932804107666, "learning_rate": 8.98989898989899e-06, "loss": 0.1209, "step": 654 }, { "epoch": 1.654040404040404, "grad_norm": 2.05281400680542, "learning_rate": 8.973063973063973e-06, "loss": 0.085, "step": 655 }, { "epoch": 1.6565656565656566, "grad_norm": 5.7974677085876465, "learning_rate": 8.956228956228958e-06, "loss": 0.1432, "step": 656 }, { "epoch": 1.6590909090909092, "grad_norm": 13.796086311340332, "learning_rate": 8.93939393939394e-06, "loss": 0.1743, "step": 657 }, { "epoch": 1.6616161616161618, "grad_norm": 1.2835731506347656, "learning_rate": 8.922558922558923e-06, "loss": 0.1032, "step": 658 }, { "epoch": 1.6641414141414141, "grad_norm": 1.330572247505188, "learning_rate": 8.905723905723906e-06, "loss": 0.1194, "step": 659 }, { "epoch": 1.6666666666666665, "grad_norm": 2.2639822959899902, "learning_rate": 8.888888888888888e-06, "loss": 0.173, "step": 660 }, { "epoch": 1.6691919191919191, "grad_norm": 2.2905423641204834, "learning_rate": 8.872053872053873e-06, "loss": 0.1052, "step": 661 }, { "epoch": 1.6717171717171717, "grad_norm": 6.86669397354126, "learning_rate": 8.855218855218855e-06, "loss": 0.1658, "step": 662 }, { "epoch": 1.6742424242424243, "grad_norm": 1.9337157011032104, "learning_rate": 8.83838383838384e-06, "loss": 0.139, "step": 663 }, { "epoch": 1.676767676767677, "grad_norm": 1.348889708518982, "learning_rate": 8.821548821548822e-06, "loss": 0.1243, "step": 664 }, { "epoch": 1.6792929292929293, "grad_norm": 1.4817837476730347, "learning_rate": 8.804713804713805e-06, "loss": 0.0633, "step": 665 }, { "epoch": 1.6818181818181817, "grad_norm": 3.970458507537842, "learning_rate": 8.787878787878788e-06, "loss": 0.1427, "step": 666 }, { "epoch": 1.6843434343434343, "grad_norm": 6.352334976196289, "learning_rate": 8.77104377104377e-06, "loss": 0.1437, "step": 667 }, { "epoch": 1.6868686868686869, "grad_norm": 0.6994425654411316, "learning_rate": 8.754208754208755e-06, "loss": 0.0398, "step": 668 }, { "epoch": 1.6893939393939394, "grad_norm": 4.77330207824707, "learning_rate": 8.737373737373738e-06, "loss": 0.1319, "step": 669 }, { "epoch": 1.691919191919192, "grad_norm": 3.855506420135498, "learning_rate": 8.720538720538722e-06, "loss": 0.1467, "step": 670 }, { "epoch": 1.6944444444444444, "grad_norm": 4.957710266113281, "learning_rate": 8.703703703703705e-06, "loss": 0.1414, "step": 671 }, { "epoch": 1.696969696969697, "grad_norm": 2.640568971633911, "learning_rate": 8.686868686868687e-06, "loss": 0.2187, "step": 672 }, { "epoch": 1.6994949494949494, "grad_norm": 3.6980481147766113, "learning_rate": 8.67003367003367e-06, "loss": 0.1197, "step": 673 }, { "epoch": 1.702020202020202, "grad_norm": 3.419555187225342, "learning_rate": 8.653198653198653e-06, "loss": 0.164, "step": 674 }, { "epoch": 1.7045454545454546, "grad_norm": 3.6955320835113525, "learning_rate": 8.636363636363637e-06, "loss": 0.1821, "step": 675 }, { "epoch": 1.7070707070707072, "grad_norm": 1.2104640007019043, "learning_rate": 8.61952861952862e-06, "loss": 0.0747, "step": 676 }, { "epoch": 1.7095959595959596, "grad_norm": 3.7086238861083984, "learning_rate": 8.602693602693604e-06, "loss": 0.1402, "step": 677 }, { "epoch": 1.7121212121212122, "grad_norm": 1.6543469429016113, "learning_rate": 8.585858585858587e-06, "loss": 0.0869, "step": 678 }, { "epoch": 1.7146464646464645, "grad_norm": 4.50585412979126, "learning_rate": 8.56902356902357e-06, "loss": 0.0926, "step": 679 }, { "epoch": 1.7171717171717171, "grad_norm": 2.2351365089416504, "learning_rate": 8.552188552188552e-06, "loss": 0.0886, "step": 680 }, { "epoch": 1.7196969696969697, "grad_norm": 1.8379594087600708, "learning_rate": 8.535353535353535e-06, "loss": 0.0671, "step": 681 }, { "epoch": 1.7222222222222223, "grad_norm": 2.2375223636627197, "learning_rate": 8.518518518518519e-06, "loss": 0.1455, "step": 682 }, { "epoch": 1.7247474747474747, "grad_norm": 1.758262038230896, "learning_rate": 8.501683501683502e-06, "loss": 0.067, "step": 683 }, { "epoch": 1.7272727272727273, "grad_norm": 1.4083460569381714, "learning_rate": 8.484848484848486e-06, "loss": 0.0492, "step": 684 }, { "epoch": 1.7297979797979797, "grad_norm": 2.864366292953491, "learning_rate": 8.468013468013469e-06, "loss": 0.1483, "step": 685 }, { "epoch": 1.7323232323232323, "grad_norm": 1.695508360862732, "learning_rate": 8.451178451178452e-06, "loss": 0.0559, "step": 686 }, { "epoch": 1.7348484848484849, "grad_norm": 5.666776180267334, "learning_rate": 8.434343434343434e-06, "loss": 0.1655, "step": 687 }, { "epoch": 1.7373737373737375, "grad_norm": 4.942101001739502, "learning_rate": 8.417508417508419e-06, "loss": 0.0525, "step": 688 }, { "epoch": 1.73989898989899, "grad_norm": 2.151745557785034, "learning_rate": 8.400673400673401e-06, "loss": 0.137, "step": 689 }, { "epoch": 1.7424242424242424, "grad_norm": 2.4058070182800293, "learning_rate": 8.383838383838384e-06, "loss": 0.0805, "step": 690 }, { "epoch": 1.7449494949494948, "grad_norm": 4.35892915725708, "learning_rate": 8.367003367003368e-06, "loss": 0.0764, "step": 691 }, { "epoch": 1.7474747474747474, "grad_norm": 1.3333408832550049, "learning_rate": 8.350168350168351e-06, "loss": 0.0576, "step": 692 }, { "epoch": 1.75, "grad_norm": 4.402344703674316, "learning_rate": 8.333333333333334e-06, "loss": 0.1821, "step": 693 }, { "epoch": 1.7525252525252526, "grad_norm": 4.358558654785156, "learning_rate": 8.316498316498316e-06, "loss": 0.1767, "step": 694 }, { "epoch": 1.7550505050505052, "grad_norm": 2.602311372756958, "learning_rate": 8.2996632996633e-06, "loss": 0.1474, "step": 695 }, { "epoch": 1.7575757575757576, "grad_norm": 3.5266802310943604, "learning_rate": 8.282828282828283e-06, "loss": 0.1917, "step": 696 }, { "epoch": 1.76010101010101, "grad_norm": 5.978867053985596, "learning_rate": 8.265993265993266e-06, "loss": 0.1884, "step": 697 }, { "epoch": 1.7626262626262625, "grad_norm": 2.8455355167388916, "learning_rate": 8.24915824915825e-06, "loss": 0.1302, "step": 698 }, { "epoch": 1.7651515151515151, "grad_norm": 4.014955520629883, "learning_rate": 8.232323232323233e-06, "loss": 0.1731, "step": 699 }, { "epoch": 1.7676767676767677, "grad_norm": 4.700746536254883, "learning_rate": 8.215488215488216e-06, "loss": 0.1765, "step": 700 }, { "epoch": 1.7702020202020203, "grad_norm": 3.462686061859131, "learning_rate": 8.198653198653199e-06, "loss": 0.0926, "step": 701 }, { "epoch": 1.7727272727272727, "grad_norm": 1.5547245740890503, "learning_rate": 8.181818181818183e-06, "loss": 0.0325, "step": 702 }, { "epoch": 1.7752525252525253, "grad_norm": 2.274096727371216, "learning_rate": 8.164983164983166e-06, "loss": 0.0642, "step": 703 }, { "epoch": 1.7777777777777777, "grad_norm": 2.7937772274017334, "learning_rate": 8.148148148148148e-06, "loss": 0.1084, "step": 704 }, { "epoch": 1.7803030303030303, "grad_norm": 1.720742106437683, "learning_rate": 8.131313131313133e-06, "loss": 0.101, "step": 705 }, { "epoch": 1.7828282828282829, "grad_norm": 4.517067909240723, "learning_rate": 8.114478114478115e-06, "loss": 0.1059, "step": 706 }, { "epoch": 1.7853535353535355, "grad_norm": 2.7258083820343018, "learning_rate": 8.097643097643098e-06, "loss": 0.1329, "step": 707 }, { "epoch": 1.7878787878787878, "grad_norm": 2.474179983139038, "learning_rate": 8.08080808080808e-06, "loss": 0.1007, "step": 708 }, { "epoch": 1.7904040404040404, "grad_norm": 2.3355281352996826, "learning_rate": 8.063973063973065e-06, "loss": 0.1863, "step": 709 }, { "epoch": 1.7929292929292928, "grad_norm": 3.959667444229126, "learning_rate": 8.047138047138048e-06, "loss": 0.0882, "step": 710 }, { "epoch": 1.7954545454545454, "grad_norm": 5.953159809112549, "learning_rate": 8.03030303030303e-06, "loss": 0.1024, "step": 711 }, { "epoch": 1.797979797979798, "grad_norm": 3.069732427597046, "learning_rate": 8.013468013468015e-06, "loss": 0.084, "step": 712 }, { "epoch": 1.8005050505050506, "grad_norm": 3.06427001953125, "learning_rate": 7.996632996632998e-06, "loss": 0.2176, "step": 713 }, { "epoch": 1.803030303030303, "grad_norm": 5.320972442626953, "learning_rate": 7.97979797979798e-06, "loss": 0.1877, "step": 714 }, { "epoch": 1.8055555555555556, "grad_norm": 3.8155035972595215, "learning_rate": 7.962962962962963e-06, "loss": 0.14, "step": 715 }, { "epoch": 1.808080808080808, "grad_norm": 2.791696310043335, "learning_rate": 7.946127946127947e-06, "loss": 0.0694, "step": 716 }, { "epoch": 1.8106060606060606, "grad_norm": 1.7592320442199707, "learning_rate": 7.92929292929293e-06, "loss": 0.0426, "step": 717 }, { "epoch": 1.8131313131313131, "grad_norm": 8.306157112121582, "learning_rate": 7.912457912457913e-06, "loss": 0.1455, "step": 718 }, { "epoch": 1.8156565656565657, "grad_norm": 3.3673255443573, "learning_rate": 7.895622895622897e-06, "loss": 0.1412, "step": 719 }, { "epoch": 1.8181818181818183, "grad_norm": 3.755908966064453, "learning_rate": 7.87878787878788e-06, "loss": 0.1096, "step": 720 }, { "epoch": 1.8207070707070707, "grad_norm": 1.6641695499420166, "learning_rate": 7.861952861952862e-06, "loss": 0.1231, "step": 721 }, { "epoch": 1.823232323232323, "grad_norm": 3.577352285385132, "learning_rate": 7.845117845117845e-06, "loss": 0.07, "step": 722 }, { "epoch": 1.8257575757575757, "grad_norm": 3.3195016384124756, "learning_rate": 7.82828282828283e-06, "loss": 0.2131, "step": 723 }, { "epoch": 1.8282828282828283, "grad_norm": 2.113675594329834, "learning_rate": 7.811447811447812e-06, "loss": 0.077, "step": 724 }, { "epoch": 1.8308080808080809, "grad_norm": 2.248725414276123, "learning_rate": 7.794612794612795e-06, "loss": 0.1106, "step": 725 }, { "epoch": 1.8333333333333335, "grad_norm": 3.8289642333984375, "learning_rate": 7.77777777777778e-06, "loss": 0.0919, "step": 726 }, { "epoch": 1.8358585858585859, "grad_norm": 2.4651291370391846, "learning_rate": 7.760942760942762e-06, "loss": 0.0724, "step": 727 }, { "epoch": 1.8383838383838382, "grad_norm": 4.7950358390808105, "learning_rate": 7.744107744107745e-06, "loss": 0.1148, "step": 728 }, { "epoch": 1.8409090909090908, "grad_norm": 8.350399017333984, "learning_rate": 7.727272727272727e-06, "loss": 0.1526, "step": 729 }, { "epoch": 1.8434343434343434, "grad_norm": 2.8314502239227295, "learning_rate": 7.710437710437712e-06, "loss": 0.1417, "step": 730 }, { "epoch": 1.845959595959596, "grad_norm": 3.023043155670166, "learning_rate": 7.693602693602694e-06, "loss": 0.1971, "step": 731 }, { "epoch": 1.8484848484848486, "grad_norm": 1.6119197607040405, "learning_rate": 7.676767676767677e-06, "loss": 0.0754, "step": 732 }, { "epoch": 1.851010101010101, "grad_norm": 5.730337142944336, "learning_rate": 7.659932659932661e-06, "loss": 0.0786, "step": 733 }, { "epoch": 1.8535353535353534, "grad_norm": 0.6242827773094177, "learning_rate": 7.643097643097644e-06, "loss": 0.0237, "step": 734 }, { "epoch": 1.856060606060606, "grad_norm": 3.5328094959259033, "learning_rate": 7.6262626262626275e-06, "loss": 0.1308, "step": 735 }, { "epoch": 1.8585858585858586, "grad_norm": 2.5661208629608154, "learning_rate": 7.60942760942761e-06, "loss": 0.1202, "step": 736 }, { "epoch": 1.8611111111111112, "grad_norm": 1.5449377298355103, "learning_rate": 7.592592592592594e-06, "loss": 0.0886, "step": 737 }, { "epoch": 1.8636363636363638, "grad_norm": 4.09519100189209, "learning_rate": 7.5757575757575764e-06, "loss": 0.1398, "step": 738 }, { "epoch": 1.8661616161616161, "grad_norm": 3.5463318824768066, "learning_rate": 7.558922558922559e-06, "loss": 0.1023, "step": 739 }, { "epoch": 1.8686868686868687, "grad_norm": 2.5558698177337646, "learning_rate": 7.542087542087543e-06, "loss": 0.1335, "step": 740 }, { "epoch": 1.871212121212121, "grad_norm": 1.5937213897705078, "learning_rate": 7.525252525252525e-06, "loss": 0.0928, "step": 741 }, { "epoch": 1.8737373737373737, "grad_norm": 2.4672536849975586, "learning_rate": 7.508417508417509e-06, "loss": 0.2052, "step": 742 }, { "epoch": 1.8762626262626263, "grad_norm": 1.365451693534851, "learning_rate": 7.491582491582492e-06, "loss": 0.1414, "step": 743 }, { "epoch": 1.878787878787879, "grad_norm": 0.8678475618362427, "learning_rate": 7.474747474747476e-06, "loss": 0.0786, "step": 744 }, { "epoch": 1.8813131313131313, "grad_norm": 3.8532655239105225, "learning_rate": 7.457912457912459e-06, "loss": 0.2117, "step": 745 }, { "epoch": 1.8838383838383839, "grad_norm": 5.75984525680542, "learning_rate": 7.441077441077442e-06, "loss": 0.1238, "step": 746 }, { "epoch": 1.8863636363636362, "grad_norm": 1.1473771333694458, "learning_rate": 7.424242424242425e-06, "loss": 0.0895, "step": 747 }, { "epoch": 1.8888888888888888, "grad_norm": 1.526085376739502, "learning_rate": 7.4074074074074075e-06, "loss": 0.1088, "step": 748 }, { "epoch": 1.8914141414141414, "grad_norm": 4.124934673309326, "learning_rate": 7.390572390572391e-06, "loss": 0.0826, "step": 749 }, { "epoch": 1.893939393939394, "grad_norm": 6.274197101593018, "learning_rate": 7.373737373737374e-06, "loss": 0.1513, "step": 750 }, { "epoch": 1.8964646464646466, "grad_norm": 1.4224315881729126, "learning_rate": 7.356902356902358e-06, "loss": 0.1091, "step": 751 }, { "epoch": 1.898989898989899, "grad_norm": 4.506265640258789, "learning_rate": 7.340067340067341e-06, "loss": 0.1017, "step": 752 }, { "epoch": 1.9015151515151514, "grad_norm": 1.0609605312347412, "learning_rate": 7.323232323232324e-06, "loss": 0.0597, "step": 753 }, { "epoch": 1.904040404040404, "grad_norm": 3.9881186485290527, "learning_rate": 7.306397306397307e-06, "loss": 0.1244, "step": 754 }, { "epoch": 1.9065656565656566, "grad_norm": 1.8625434637069702, "learning_rate": 7.28956228956229e-06, "loss": 0.147, "step": 755 }, { "epoch": 1.9090909090909092, "grad_norm": 8.011527061462402, "learning_rate": 7.272727272727273e-06, "loss": 0.0823, "step": 756 }, { "epoch": 1.9116161616161618, "grad_norm": 2.0574049949645996, "learning_rate": 7.255892255892256e-06, "loss": 0.0667, "step": 757 }, { "epoch": 1.9141414141414141, "grad_norm": 1.5154629945755005, "learning_rate": 7.23905723905724e-06, "loss": 0.1717, "step": 758 }, { "epoch": 1.9166666666666665, "grad_norm": 2.105567455291748, "learning_rate": 7.222222222222223e-06, "loss": 0.1676, "step": 759 }, { "epoch": 1.9191919191919191, "grad_norm": 1.6874696016311646, "learning_rate": 7.2053872053872064e-06, "loss": 0.1089, "step": 760 }, { "epoch": 1.9217171717171717, "grad_norm": 2.980811357498169, "learning_rate": 7.188552188552189e-06, "loss": 0.1806, "step": 761 }, { "epoch": 1.9242424242424243, "grad_norm": 2.0981791019439697, "learning_rate": 7.171717171717172e-06, "loss": 0.0859, "step": 762 }, { "epoch": 1.926767676767677, "grad_norm": 1.835482120513916, "learning_rate": 7.154882154882155e-06, "loss": 0.1364, "step": 763 }, { "epoch": 1.9292929292929293, "grad_norm": 4.000125885009766, "learning_rate": 7.138047138047138e-06, "loss": 0.1354, "step": 764 }, { "epoch": 1.9318181818181817, "grad_norm": 4.924983978271484, "learning_rate": 7.121212121212122e-06, "loss": 0.1154, "step": 765 }, { "epoch": 1.9343434343434343, "grad_norm": 1.5840011835098267, "learning_rate": 7.104377104377105e-06, "loss": 0.1016, "step": 766 }, { "epoch": 1.9368686868686869, "grad_norm": 1.5436311960220337, "learning_rate": 7.087542087542089e-06, "loss": 0.1168, "step": 767 }, { "epoch": 1.9393939393939394, "grad_norm": 2.4922754764556885, "learning_rate": 7.070707070707071e-06, "loss": 0.1187, "step": 768 }, { "epoch": 1.941919191919192, "grad_norm": 3.206899881362915, "learning_rate": 7.053872053872054e-06, "loss": 0.1184, "step": 769 }, { "epoch": 1.9444444444444444, "grad_norm": 4.3798828125, "learning_rate": 7.0370370370370375e-06, "loss": 0.1997, "step": 770 }, { "epoch": 1.946969696969697, "grad_norm": 1.3223721981048584, "learning_rate": 7.02020202020202e-06, "loss": 0.073, "step": 771 }, { "epoch": 1.9494949494949494, "grad_norm": 2.0767436027526855, "learning_rate": 7.0033670033670045e-06, "loss": 0.1251, "step": 772 }, { "epoch": 1.952020202020202, "grad_norm": 1.8936235904693604, "learning_rate": 6.986531986531987e-06, "loss": 0.0956, "step": 773 }, { "epoch": 1.9545454545454546, "grad_norm": 6.86482048034668, "learning_rate": 6.969696969696971e-06, "loss": 0.1269, "step": 774 }, { "epoch": 1.9570707070707072, "grad_norm": 2.885071039199829, "learning_rate": 6.9528619528619534e-06, "loss": 0.0974, "step": 775 }, { "epoch": 1.9595959595959596, "grad_norm": 4.58144474029541, "learning_rate": 6.936026936026936e-06, "loss": 0.3284, "step": 776 }, { "epoch": 1.9621212121212122, "grad_norm": 4.064563274383545, "learning_rate": 6.91919191919192e-06, "loss": 0.1659, "step": 777 }, { "epoch": 1.9646464646464645, "grad_norm": 1.5637133121490479, "learning_rate": 6.902356902356902e-06, "loss": 0.1369, "step": 778 }, { "epoch": 1.9671717171717171, "grad_norm": 2.932281494140625, "learning_rate": 6.885521885521887e-06, "loss": 0.0865, "step": 779 }, { "epoch": 1.9696969696969697, "grad_norm": 1.1261810064315796, "learning_rate": 6.868686868686869e-06, "loss": 0.1245, "step": 780 }, { "epoch": 1.9722222222222223, "grad_norm": 3.991880178451538, "learning_rate": 6.851851851851853e-06, "loss": 0.188, "step": 781 }, { "epoch": 1.9747474747474747, "grad_norm": 1.7972675561904907, "learning_rate": 6.835016835016836e-06, "loss": 0.1832, "step": 782 }, { "epoch": 1.9772727272727273, "grad_norm": 2.0975348949432373, "learning_rate": 6.818181818181818e-06, "loss": 0.0416, "step": 783 }, { "epoch": 1.9797979797979797, "grad_norm": 2.6938462257385254, "learning_rate": 6.801346801346802e-06, "loss": 0.1471, "step": 784 }, { "epoch": 1.9823232323232323, "grad_norm": 2.680722951889038, "learning_rate": 6.7845117845117845e-06, "loss": 0.1255, "step": 785 }, { "epoch": 1.9848484848484849, "grad_norm": 4.923444747924805, "learning_rate": 6.767676767676769e-06, "loss": 0.1087, "step": 786 }, { "epoch": 1.9873737373737375, "grad_norm": 3.3977975845336914, "learning_rate": 6.7508417508417515e-06, "loss": 0.1198, "step": 787 }, { "epoch": 1.98989898989899, "grad_norm": 2.9619626998901367, "learning_rate": 6.734006734006735e-06, "loss": 0.104, "step": 788 }, { "epoch": 1.9924242424242424, "grad_norm": 1.3148123025894165, "learning_rate": 6.717171717171718e-06, "loss": 0.0854, "step": 789 }, { "epoch": 1.9949494949494948, "grad_norm": 1.7584114074707031, "learning_rate": 6.7003367003367004e-06, "loss": 0.136, "step": 790 }, { "epoch": 1.9974747474747474, "grad_norm": 8.245304107666016, "learning_rate": 6.683501683501684e-06, "loss": 0.1525, "step": 791 }, { "epoch": 2.0, "grad_norm": 1.205091118812561, "learning_rate": 6.666666666666667e-06, "loss": 0.0617, "step": 792 }, { "epoch": 2.0, "eval_accuracy": 0.759090909090909, "eval_f1": 0.8949265317438315, "eval_loss": 0.13888753950595856, "eval_runtime": 43.47, "eval_samples_per_second": 20.244, "eval_steps_per_second": 0.437, "step": 792 }, { "epoch": 2.0025252525252526, "grad_norm": 2.01649808883667, "learning_rate": 6.649831649831651e-06, "loss": 0.1073, "step": 793 }, { "epoch": 2.005050505050505, "grad_norm": 6.579789161682129, "learning_rate": 6.632996632996634e-06, "loss": 0.1704, "step": 794 }, { "epoch": 2.007575757575758, "grad_norm": 2.323598623275757, "learning_rate": 6.616161616161617e-06, "loss": 0.1983, "step": 795 }, { "epoch": 2.01010101010101, "grad_norm": 2.126936674118042, "learning_rate": 6.5993265993266e-06, "loss": 0.1026, "step": 796 }, { "epoch": 2.0126262626262625, "grad_norm": 1.035873293876648, "learning_rate": 6.582491582491583e-06, "loss": 0.0488, "step": 797 }, { "epoch": 2.015151515151515, "grad_norm": 2.2837603092193604, "learning_rate": 6.565656565656566e-06, "loss": 0.1894, "step": 798 }, { "epoch": 2.0176767676767677, "grad_norm": 7.866192817687988, "learning_rate": 6.548821548821549e-06, "loss": 0.2146, "step": 799 }, { "epoch": 2.0202020202020203, "grad_norm": 4.450189590454102, "learning_rate": 6.531986531986533e-06, "loss": 0.0731, "step": 800 }, { "epoch": 2.022727272727273, "grad_norm": 2.2905592918395996, "learning_rate": 6.515151515151516e-06, "loss": 0.0736, "step": 801 }, { "epoch": 2.025252525252525, "grad_norm": 1.7175313234329224, "learning_rate": 6.498316498316499e-06, "loss": 0.1525, "step": 802 }, { "epoch": 2.0277777777777777, "grad_norm": 3.22578763961792, "learning_rate": 6.481481481481482e-06, "loss": 0.1093, "step": 803 }, { "epoch": 2.0303030303030303, "grad_norm": 1.8242607116699219, "learning_rate": 6.464646464646466e-06, "loss": 0.1138, "step": 804 }, { "epoch": 2.032828282828283, "grad_norm": 2.7062501907348633, "learning_rate": 6.447811447811448e-06, "loss": 0.0932, "step": 805 }, { "epoch": 2.0353535353535355, "grad_norm": 1.2171615362167358, "learning_rate": 6.430976430976431e-06, "loss": 0.0692, "step": 806 }, { "epoch": 2.037878787878788, "grad_norm": 5.950473308563232, "learning_rate": 6.4141414141414145e-06, "loss": 0.264, "step": 807 }, { "epoch": 2.04040404040404, "grad_norm": 4.191005706787109, "learning_rate": 6.397306397306397e-06, "loss": 0.0666, "step": 808 }, { "epoch": 2.042929292929293, "grad_norm": 3.99367618560791, "learning_rate": 6.3804713804713816e-06, "loss": 0.1528, "step": 809 }, { "epoch": 2.0454545454545454, "grad_norm": 0.7054336667060852, "learning_rate": 6.363636363636364e-06, "loss": 0.0477, "step": 810 }, { "epoch": 2.047979797979798, "grad_norm": 3.71244478225708, "learning_rate": 6.346801346801348e-06, "loss": 0.1287, "step": 811 }, { "epoch": 2.0505050505050506, "grad_norm": 3.171588897705078, "learning_rate": 6.3299663299663304e-06, "loss": 0.087, "step": 812 }, { "epoch": 2.053030303030303, "grad_norm": 1.4060291051864624, "learning_rate": 6.313131313131313e-06, "loss": 0.1075, "step": 813 }, { "epoch": 2.0555555555555554, "grad_norm": 2.073291540145874, "learning_rate": 6.296296296296297e-06, "loss": 0.0653, "step": 814 }, { "epoch": 2.058080808080808, "grad_norm": 6.517178058624268, "learning_rate": 6.279461279461279e-06, "loss": 0.1234, "step": 815 }, { "epoch": 2.0606060606060606, "grad_norm": 11.045914649963379, "learning_rate": 6.262626262626264e-06, "loss": 0.1273, "step": 816 }, { "epoch": 2.063131313131313, "grad_norm": 1.7747228145599365, "learning_rate": 6.245791245791246e-06, "loss": 0.1618, "step": 817 }, { "epoch": 2.0656565656565657, "grad_norm": 1.5213820934295654, "learning_rate": 6.22895622895623e-06, "loss": 0.106, "step": 818 }, { "epoch": 2.0681818181818183, "grad_norm": 1.4155036211013794, "learning_rate": 6.212121212121213e-06, "loss": 0.0759, "step": 819 }, { "epoch": 2.0707070707070705, "grad_norm": 1.0913715362548828, "learning_rate": 6.195286195286195e-06, "loss": 0.0908, "step": 820 }, { "epoch": 2.073232323232323, "grad_norm": 4.059940814971924, "learning_rate": 6.178451178451179e-06, "loss": 0.1544, "step": 821 }, { "epoch": 2.0757575757575757, "grad_norm": 1.2122453451156616, "learning_rate": 6.1616161616161615e-06, "loss": 0.0959, "step": 822 }, { "epoch": 2.0782828282828283, "grad_norm": 2.069533109664917, "learning_rate": 6.144781144781146e-06, "loss": 0.0488, "step": 823 }, { "epoch": 2.080808080808081, "grad_norm": 1.685937523841858, "learning_rate": 6.1279461279461286e-06, "loss": 0.1315, "step": 824 }, { "epoch": 2.0833333333333335, "grad_norm": 3.1984479427337646, "learning_rate": 6.111111111111112e-06, "loss": 0.117, "step": 825 }, { "epoch": 2.0858585858585856, "grad_norm": 3.422079086303711, "learning_rate": 6.094276094276095e-06, "loss": 0.1104, "step": 826 }, { "epoch": 2.0883838383838382, "grad_norm": 1.3577680587768555, "learning_rate": 6.0774410774410774e-06, "loss": 0.0583, "step": 827 }, { "epoch": 2.090909090909091, "grad_norm": 2.0477261543273926, "learning_rate": 6.060606060606061e-06, "loss": 0.2046, "step": 828 }, { "epoch": 2.0934343434343434, "grad_norm": 2.3478550910949707, "learning_rate": 6.043771043771044e-06, "loss": 0.1482, "step": 829 }, { "epoch": 2.095959595959596, "grad_norm": 1.0065677165985107, "learning_rate": 6.026936026936028e-06, "loss": 0.0322, "step": 830 }, { "epoch": 2.0984848484848486, "grad_norm": 2.0075066089630127, "learning_rate": 6.010101010101011e-06, "loss": 0.1149, "step": 831 }, { "epoch": 2.101010101010101, "grad_norm": 2.6007728576660156, "learning_rate": 5.993265993265994e-06, "loss": 0.1527, "step": 832 }, { "epoch": 2.1035353535353534, "grad_norm": 2.199341058731079, "learning_rate": 5.976430976430977e-06, "loss": 0.0776, "step": 833 }, { "epoch": 2.106060606060606, "grad_norm": 2.4440650939941406, "learning_rate": 5.95959595959596e-06, "loss": 0.0902, "step": 834 }, { "epoch": 2.1085858585858586, "grad_norm": 1.7312313318252563, "learning_rate": 5.942760942760943e-06, "loss": 0.0723, "step": 835 }, { "epoch": 2.111111111111111, "grad_norm": 2.232499122619629, "learning_rate": 5.925925925925926e-06, "loss": 0.1162, "step": 836 }, { "epoch": 2.1136363636363638, "grad_norm": 2.4596776962280273, "learning_rate": 5.90909090909091e-06, "loss": 0.1808, "step": 837 }, { "epoch": 2.1161616161616164, "grad_norm": 4.917704105377197, "learning_rate": 5.892255892255893e-06, "loss": 0.1169, "step": 838 }, { "epoch": 2.1186868686868685, "grad_norm": 3.716489553451538, "learning_rate": 5.875420875420876e-06, "loss": 0.0809, "step": 839 }, { "epoch": 2.121212121212121, "grad_norm": 4.413392066955566, "learning_rate": 5.858585858585859e-06, "loss": 0.1834, "step": 840 }, { "epoch": 2.1237373737373737, "grad_norm": 1.872174859046936, "learning_rate": 5.841750841750842e-06, "loss": 0.1462, "step": 841 }, { "epoch": 2.1262626262626263, "grad_norm": 3.7974910736083984, "learning_rate": 5.824915824915825e-06, "loss": 0.0939, "step": 842 }, { "epoch": 2.128787878787879, "grad_norm": 1.4759098291397095, "learning_rate": 5.808080808080808e-06, "loss": 0.0713, "step": 843 }, { "epoch": 2.1313131313131315, "grad_norm": 2.160318613052368, "learning_rate": 5.791245791245792e-06, "loss": 0.2184, "step": 844 }, { "epoch": 2.1338383838383836, "grad_norm": 2.485347270965576, "learning_rate": 5.774410774410775e-06, "loss": 0.1732, "step": 845 }, { "epoch": 2.1363636363636362, "grad_norm": 0.8993260264396667, "learning_rate": 5.7575757575757586e-06, "loss": 0.0909, "step": 846 }, { "epoch": 2.138888888888889, "grad_norm": 1.436485767364502, "learning_rate": 5.740740740740741e-06, "loss": 0.0394, "step": 847 }, { "epoch": 2.1414141414141414, "grad_norm": 0.9625018835067749, "learning_rate": 5.723905723905724e-06, "loss": 0.0382, "step": 848 }, { "epoch": 2.143939393939394, "grad_norm": 1.4799765348434448, "learning_rate": 5.7070707070707075e-06, "loss": 0.094, "step": 849 }, { "epoch": 2.1464646464646466, "grad_norm": 3.625958204269409, "learning_rate": 5.69023569023569e-06, "loss": 0.1394, "step": 850 }, { "epoch": 2.148989898989899, "grad_norm": 1.3515892028808594, "learning_rate": 5.6734006734006745e-06, "loss": 0.1068, "step": 851 }, { "epoch": 2.1515151515151514, "grad_norm": 1.9746239185333252, "learning_rate": 5.656565656565657e-06, "loss": 0.1404, "step": 852 }, { "epoch": 2.154040404040404, "grad_norm": 3.5076723098754883, "learning_rate": 5.639730639730641e-06, "loss": 0.1236, "step": 853 }, { "epoch": 2.1565656565656566, "grad_norm": 1.3625231981277466, "learning_rate": 5.622895622895623e-06, "loss": 0.0698, "step": 854 }, { "epoch": 2.159090909090909, "grad_norm": 2.441847324371338, "learning_rate": 5.606060606060606e-06, "loss": 0.1029, "step": 855 }, { "epoch": 2.1616161616161618, "grad_norm": 3.1259806156158447, "learning_rate": 5.58922558922559e-06, "loss": 0.105, "step": 856 }, { "epoch": 2.1641414141414144, "grad_norm": 5.127650260925293, "learning_rate": 5.572390572390572e-06, "loss": 0.1202, "step": 857 }, { "epoch": 2.1666666666666665, "grad_norm": 1.3531067371368408, "learning_rate": 5.555555555555557e-06, "loss": 0.0812, "step": 858 }, { "epoch": 2.169191919191919, "grad_norm": 5.6110920906066895, "learning_rate": 5.538720538720539e-06, "loss": 0.0898, "step": 859 }, { "epoch": 2.1717171717171717, "grad_norm": 2.4415769577026367, "learning_rate": 5.521885521885523e-06, "loss": 0.231, "step": 860 }, { "epoch": 2.1742424242424243, "grad_norm": 3.1470277309417725, "learning_rate": 5.5050505050505056e-06, "loss": 0.0609, "step": 861 }, { "epoch": 2.176767676767677, "grad_norm": 2.625209093093872, "learning_rate": 5.488215488215489e-06, "loss": 0.1126, "step": 862 }, { "epoch": 2.179292929292929, "grad_norm": 9.551560401916504, "learning_rate": 5.471380471380472e-06, "loss": 0.0788, "step": 863 }, { "epoch": 2.1818181818181817, "grad_norm": 2.088391065597534, "learning_rate": 5.4545454545454545e-06, "loss": 0.1863, "step": 864 }, { "epoch": 2.1843434343434343, "grad_norm": 2.9452109336853027, "learning_rate": 5.437710437710438e-06, "loss": 0.1449, "step": 865 }, { "epoch": 2.186868686868687, "grad_norm": 2.6503803730010986, "learning_rate": 5.420875420875421e-06, "loss": 0.1128, "step": 866 }, { "epoch": 2.1893939393939394, "grad_norm": 6.2185587882995605, "learning_rate": 5.404040404040405e-06, "loss": 0.2125, "step": 867 }, { "epoch": 2.191919191919192, "grad_norm": 1.5772247314453125, "learning_rate": 5.387205387205388e-06, "loss": 0.117, "step": 868 }, { "epoch": 2.1944444444444446, "grad_norm": 4.648830413818359, "learning_rate": 5.370370370370371e-06, "loss": 0.1646, "step": 869 }, { "epoch": 2.196969696969697, "grad_norm": 2.4655864238739014, "learning_rate": 5.353535353535354e-06, "loss": 0.0718, "step": 870 }, { "epoch": 2.1994949494949494, "grad_norm": 1.3793933391571045, "learning_rate": 5.336700336700337e-06, "loss": 0.1087, "step": 871 }, { "epoch": 2.202020202020202, "grad_norm": 2.5595717430114746, "learning_rate": 5.31986531986532e-06, "loss": 0.1177, "step": 872 }, { "epoch": 2.2045454545454546, "grad_norm": 4.922736167907715, "learning_rate": 5.303030303030303e-06, "loss": 0.0976, "step": 873 }, { "epoch": 2.207070707070707, "grad_norm": 2.5227010250091553, "learning_rate": 5.286195286195287e-06, "loss": 0.1744, "step": 874 }, { "epoch": 2.20959595959596, "grad_norm": 1.9036935567855835, "learning_rate": 5.26936026936027e-06, "loss": 0.1184, "step": 875 }, { "epoch": 2.212121212121212, "grad_norm": 1.5138955116271973, "learning_rate": 5.252525252525253e-06, "loss": 0.1052, "step": 876 }, { "epoch": 2.2146464646464645, "grad_norm": 2.0152668952941895, "learning_rate": 5.235690235690236e-06, "loss": 0.0952, "step": 877 }, { "epoch": 2.217171717171717, "grad_norm": 13.834627151489258, "learning_rate": 5.218855218855219e-06, "loss": 0.0788, "step": 878 }, { "epoch": 2.2196969696969697, "grad_norm": 2.163512945175171, "learning_rate": 5.202020202020202e-06, "loss": 0.1584, "step": 879 }, { "epoch": 2.2222222222222223, "grad_norm": 1.2292289733886719, "learning_rate": 5.185185185185185e-06, "loss": 0.1036, "step": 880 }, { "epoch": 2.224747474747475, "grad_norm": 2.1541199684143066, "learning_rate": 5.168350168350169e-06, "loss": 0.0994, "step": 881 }, { "epoch": 2.227272727272727, "grad_norm": 2.9435672760009766, "learning_rate": 5.151515151515152e-06, "loss": 0.1071, "step": 882 }, { "epoch": 2.2297979797979797, "grad_norm": 4.930500507354736, "learning_rate": 5.1346801346801356e-06, "loss": 0.2478, "step": 883 }, { "epoch": 2.2323232323232323, "grad_norm": 13.543425559997559, "learning_rate": 5.117845117845118e-06, "loss": 0.0954, "step": 884 }, { "epoch": 2.234848484848485, "grad_norm": 1.8627355098724365, "learning_rate": 5.101010101010101e-06, "loss": 0.2159, "step": 885 }, { "epoch": 2.2373737373737375, "grad_norm": 1.9947534799575806, "learning_rate": 5.0841750841750845e-06, "loss": 0.0787, "step": 886 }, { "epoch": 2.23989898989899, "grad_norm": 5.217324733734131, "learning_rate": 5.067340067340067e-06, "loss": 0.1191, "step": 887 }, { "epoch": 2.242424242424242, "grad_norm": 1.540475845336914, "learning_rate": 5.0505050505050515e-06, "loss": 0.0773, "step": 888 }, { "epoch": 2.244949494949495, "grad_norm": 4.879143714904785, "learning_rate": 5.033670033670034e-06, "loss": 0.1236, "step": 889 }, { "epoch": 2.2474747474747474, "grad_norm": 4.0901641845703125, "learning_rate": 5.016835016835018e-06, "loss": 0.0619, "step": 890 }, { "epoch": 2.25, "grad_norm": 1.8532190322875977, "learning_rate": 5e-06, "loss": 0.0767, "step": 891 }, { "epoch": 2.2525252525252526, "grad_norm": 3.4842894077301025, "learning_rate": 4.983164983164984e-06, "loss": 0.0984, "step": 892 }, { "epoch": 2.255050505050505, "grad_norm": 1.4197821617126465, "learning_rate": 4.966329966329967e-06, "loss": 0.0441, "step": 893 }, { "epoch": 2.257575757575758, "grad_norm": 1.3725179433822632, "learning_rate": 4.94949494949495e-06, "loss": 0.0636, "step": 894 }, { "epoch": 2.26010101010101, "grad_norm": 3.0550286769866943, "learning_rate": 4.932659932659933e-06, "loss": 0.1222, "step": 895 }, { "epoch": 2.2626262626262625, "grad_norm": 1.3511768579483032, "learning_rate": 4.915824915824916e-06, "loss": 0.102, "step": 896 }, { "epoch": 2.265151515151515, "grad_norm": 2.8341774940490723, "learning_rate": 4.898989898989899e-06, "loss": 0.1176, "step": 897 }, { "epoch": 2.2676767676767677, "grad_norm": 5.220274925231934, "learning_rate": 4.8821548821548826e-06, "loss": 0.1828, "step": 898 }, { "epoch": 2.2702020202020203, "grad_norm": 2.0751826763153076, "learning_rate": 4.865319865319866e-06, "loss": 0.0472, "step": 899 }, { "epoch": 2.2727272727272725, "grad_norm": 1.0210275650024414, "learning_rate": 4.848484848484849e-06, "loss": 0.1155, "step": 900 }, { "epoch": 2.275252525252525, "grad_norm": 2.244605541229248, "learning_rate": 4.831649831649832e-06, "loss": 0.1298, "step": 901 }, { "epoch": 2.2777777777777777, "grad_norm": 1.2191749811172485, "learning_rate": 4.814814814814815e-06, "loss": 0.0553, "step": 902 }, { "epoch": 2.2803030303030303, "grad_norm": 2.009685516357422, "learning_rate": 4.7979797979797985e-06, "loss": 0.2061, "step": 903 }, { "epoch": 2.282828282828283, "grad_norm": 2.537893056869507, "learning_rate": 4.781144781144781e-06, "loss": 0.1638, "step": 904 }, { "epoch": 2.2853535353535355, "grad_norm": 1.8385186195373535, "learning_rate": 4.764309764309765e-06, "loss": 0.1457, "step": 905 }, { "epoch": 2.287878787878788, "grad_norm": 3.0959956645965576, "learning_rate": 4.747474747474748e-06, "loss": 0.0624, "step": 906 }, { "epoch": 2.29040404040404, "grad_norm": 1.0412582159042358, "learning_rate": 4.730639730639731e-06, "loss": 0.0605, "step": 907 }, { "epoch": 2.292929292929293, "grad_norm": 1.1493240594863892, "learning_rate": 4.7138047138047145e-06, "loss": 0.0818, "step": 908 }, { "epoch": 2.2954545454545454, "grad_norm": 1.573701024055481, "learning_rate": 4.696969696969698e-06, "loss": 0.1485, "step": 909 }, { "epoch": 2.297979797979798, "grad_norm": 3.5485622882843018, "learning_rate": 4.680134680134681e-06, "loss": 0.0746, "step": 910 }, { "epoch": 2.3005050505050506, "grad_norm": 2.589240550994873, "learning_rate": 4.663299663299663e-06, "loss": 0.0669, "step": 911 }, { "epoch": 2.303030303030303, "grad_norm": 3.300288677215576, "learning_rate": 4.646464646464647e-06, "loss": 0.1589, "step": 912 }, { "epoch": 2.3055555555555554, "grad_norm": 2.2439637184143066, "learning_rate": 4.62962962962963e-06, "loss": 0.0927, "step": 913 }, { "epoch": 2.308080808080808, "grad_norm": 3.438167095184326, "learning_rate": 4.612794612794613e-06, "loss": 0.1405, "step": 914 }, { "epoch": 2.3106060606060606, "grad_norm": 1.1554774045944214, "learning_rate": 4.595959595959597e-06, "loss": 0.1113, "step": 915 }, { "epoch": 2.313131313131313, "grad_norm": 2.269124984741211, "learning_rate": 4.57912457912458e-06, "loss": 0.1054, "step": 916 }, { "epoch": 2.3156565656565657, "grad_norm": 3.707484722137451, "learning_rate": 4.562289562289563e-06, "loss": 0.1573, "step": 917 }, { "epoch": 2.3181818181818183, "grad_norm": 3.806281089782715, "learning_rate": 4.5454545454545455e-06, "loss": 0.1247, "step": 918 }, { "epoch": 2.320707070707071, "grad_norm": 5.063516616821289, "learning_rate": 4.528619528619529e-06, "loss": 0.077, "step": 919 }, { "epoch": 2.323232323232323, "grad_norm": 1.84391450881958, "learning_rate": 4.5117845117845126e-06, "loss": 0.13, "step": 920 }, { "epoch": 2.3257575757575757, "grad_norm": 2.5902676582336426, "learning_rate": 4.494949494949495e-06, "loss": 0.1043, "step": 921 }, { "epoch": 2.3282828282828283, "grad_norm": 1.1772695779800415, "learning_rate": 4.478114478114479e-06, "loss": 0.0875, "step": 922 }, { "epoch": 2.330808080808081, "grad_norm": 1.865903377532959, "learning_rate": 4.4612794612794615e-06, "loss": 0.1552, "step": 923 }, { "epoch": 2.3333333333333335, "grad_norm": 1.9699102640151978, "learning_rate": 4.444444444444444e-06, "loss": 0.0433, "step": 924 }, { "epoch": 2.3358585858585856, "grad_norm": 3.4536280632019043, "learning_rate": 4.427609427609428e-06, "loss": 0.1309, "step": 925 }, { "epoch": 2.3383838383838382, "grad_norm": 9.139911651611328, "learning_rate": 4.410774410774411e-06, "loss": 0.1629, "step": 926 }, { "epoch": 2.340909090909091, "grad_norm": 2.665511131286621, "learning_rate": 4.393939393939394e-06, "loss": 0.1347, "step": 927 }, { "epoch": 2.3434343434343434, "grad_norm": 1.851479172706604, "learning_rate": 4.377104377104377e-06, "loss": 0.0453, "step": 928 }, { "epoch": 2.345959595959596, "grad_norm": 4.813875675201416, "learning_rate": 4.360269360269361e-06, "loss": 0.1395, "step": 929 }, { "epoch": 2.3484848484848486, "grad_norm": 1.4313777685165405, "learning_rate": 4.343434343434344e-06, "loss": 0.1065, "step": 930 }, { "epoch": 2.351010101010101, "grad_norm": 3.5636346340179443, "learning_rate": 4.326599326599326e-06, "loss": 0.2765, "step": 931 }, { "epoch": 2.3535353535353534, "grad_norm": 2.2551841735839844, "learning_rate": 4.30976430976431e-06, "loss": 0.0752, "step": 932 }, { "epoch": 2.356060606060606, "grad_norm": 2.989997625350952, "learning_rate": 4.292929292929293e-06, "loss": 0.0626, "step": 933 }, { "epoch": 2.3585858585858586, "grad_norm": 2.648948907852173, "learning_rate": 4.276094276094276e-06, "loss": 0.1131, "step": 934 }, { "epoch": 2.361111111111111, "grad_norm": 4.4058685302734375, "learning_rate": 4.2592592592592596e-06, "loss": 0.1696, "step": 935 }, { "epoch": 2.3636363636363638, "grad_norm": 2.665522575378418, "learning_rate": 4.242424242424243e-06, "loss": 0.1484, "step": 936 }, { "epoch": 2.3661616161616164, "grad_norm": 0.8671731352806091, "learning_rate": 4.225589225589226e-06, "loss": 0.0346, "step": 937 }, { "epoch": 2.3686868686868685, "grad_norm": 5.202394962310791, "learning_rate": 4.208754208754209e-06, "loss": 0.1108, "step": 938 }, { "epoch": 2.371212121212121, "grad_norm": 1.2443658113479614, "learning_rate": 4.191919191919192e-06, "loss": 0.0727, "step": 939 }, { "epoch": 2.3737373737373737, "grad_norm": 2.493161678314209, "learning_rate": 4.1750841750841755e-06, "loss": 0.1396, "step": 940 }, { "epoch": 2.3762626262626263, "grad_norm": 1.5535367727279663, "learning_rate": 4.158249158249158e-06, "loss": 0.1128, "step": 941 }, { "epoch": 2.378787878787879, "grad_norm": 1.4870634078979492, "learning_rate": 4.141414141414142e-06, "loss": 0.1251, "step": 942 }, { "epoch": 2.3813131313131315, "grad_norm": 1.0928040742874146, "learning_rate": 4.124579124579125e-06, "loss": 0.1148, "step": 943 }, { "epoch": 2.3838383838383836, "grad_norm": 1.3592982292175293, "learning_rate": 4.107744107744108e-06, "loss": 0.1567, "step": 944 }, { "epoch": 2.3863636363636362, "grad_norm": 3.2275450229644775, "learning_rate": 4.0909090909090915e-06, "loss": 0.1898, "step": 945 }, { "epoch": 2.388888888888889, "grad_norm": 5.524433135986328, "learning_rate": 4.074074074074074e-06, "loss": 0.1526, "step": 946 }, { "epoch": 2.3914141414141414, "grad_norm": 2.3239119052886963, "learning_rate": 4.057239057239058e-06, "loss": 0.1879, "step": 947 }, { "epoch": 2.393939393939394, "grad_norm": 2.8176567554473877, "learning_rate": 4.04040404040404e-06, "loss": 0.0453, "step": 948 }, { "epoch": 2.3964646464646466, "grad_norm": 4.552126884460449, "learning_rate": 4.023569023569024e-06, "loss": 0.1098, "step": 949 }, { "epoch": 2.398989898989899, "grad_norm": 3.1059579849243164, "learning_rate": 4.0067340067340074e-06, "loss": 0.1238, "step": 950 }, { "epoch": 2.4015151515151514, "grad_norm": 2.0037975311279297, "learning_rate": 3.98989898989899e-06, "loss": 0.1101, "step": 951 }, { "epoch": 2.404040404040404, "grad_norm": 1.432120442390442, "learning_rate": 3.973063973063974e-06, "loss": 0.1475, "step": 952 }, { "epoch": 2.4065656565656566, "grad_norm": 4.496235370635986, "learning_rate": 3.956228956228956e-06, "loss": 0.1285, "step": 953 }, { "epoch": 2.409090909090909, "grad_norm": 2.675267457962036, "learning_rate": 3.93939393939394e-06, "loss": 0.1076, "step": 954 }, { "epoch": 2.4116161616161618, "grad_norm": 1.4617221355438232, "learning_rate": 3.9225589225589225e-06, "loss": 0.0742, "step": 955 }, { "epoch": 2.4141414141414144, "grad_norm": 2.676470994949341, "learning_rate": 3.905723905723906e-06, "loss": 0.1042, "step": 956 }, { "epoch": 2.4166666666666665, "grad_norm": 3.1182193756103516, "learning_rate": 3.88888888888889e-06, "loss": 0.0782, "step": 957 }, { "epoch": 2.419191919191919, "grad_norm": 1.4750274419784546, "learning_rate": 3.872053872053872e-06, "loss": 0.0824, "step": 958 }, { "epoch": 2.4217171717171717, "grad_norm": 5.715966701507568, "learning_rate": 3.855218855218856e-06, "loss": 0.1555, "step": 959 }, { "epoch": 2.4242424242424243, "grad_norm": 1.0495116710662842, "learning_rate": 3.8383838383838385e-06, "loss": 0.1218, "step": 960 }, { "epoch": 2.426767676767677, "grad_norm": 3.0049309730529785, "learning_rate": 3.821548821548822e-06, "loss": 0.1283, "step": 961 }, { "epoch": 2.429292929292929, "grad_norm": 1.6869391202926636, "learning_rate": 3.804713804713805e-06, "loss": 0.0797, "step": 962 }, { "epoch": 2.4318181818181817, "grad_norm": 2.2413532733917236, "learning_rate": 3.7878787878787882e-06, "loss": 0.1478, "step": 963 }, { "epoch": 2.4343434343434343, "grad_norm": 2.301522731781006, "learning_rate": 3.7710437710437713e-06, "loss": 0.1209, "step": 964 }, { "epoch": 2.436868686868687, "grad_norm": 3.226301431655884, "learning_rate": 3.7542087542087544e-06, "loss": 0.1697, "step": 965 }, { "epoch": 2.4393939393939394, "grad_norm": 2.822960376739502, "learning_rate": 3.737373737373738e-06, "loss": 0.0748, "step": 966 }, { "epoch": 2.441919191919192, "grad_norm": 8.013906478881836, "learning_rate": 3.720538720538721e-06, "loss": 0.1067, "step": 967 }, { "epoch": 2.4444444444444446, "grad_norm": 1.2187291383743286, "learning_rate": 3.7037037037037037e-06, "loss": 0.1054, "step": 968 }, { "epoch": 2.446969696969697, "grad_norm": 1.9397814273834229, "learning_rate": 3.686868686868687e-06, "loss": 0.0697, "step": 969 }, { "epoch": 2.4494949494949494, "grad_norm": 2.722252130508423, "learning_rate": 3.6700336700336704e-06, "loss": 0.1208, "step": 970 }, { "epoch": 2.452020202020202, "grad_norm": 1.2536653280258179, "learning_rate": 3.6531986531986535e-06, "loss": 0.0446, "step": 971 }, { "epoch": 2.4545454545454546, "grad_norm": 2.2456796169281006, "learning_rate": 3.6363636363636366e-06, "loss": 0.1854, "step": 972 }, { "epoch": 2.457070707070707, "grad_norm": 3.275261163711548, "learning_rate": 3.61952861952862e-06, "loss": 0.1852, "step": 973 }, { "epoch": 2.45959595959596, "grad_norm": 1.8232449293136597, "learning_rate": 3.6026936026936032e-06, "loss": 0.1223, "step": 974 }, { "epoch": 2.462121212121212, "grad_norm": 1.9537675380706787, "learning_rate": 3.585858585858586e-06, "loss": 0.0819, "step": 975 }, { "epoch": 2.4646464646464645, "grad_norm": 2.161625862121582, "learning_rate": 3.569023569023569e-06, "loss": 0.2049, "step": 976 }, { "epoch": 2.467171717171717, "grad_norm": 2.769174575805664, "learning_rate": 3.5521885521885525e-06, "loss": 0.0915, "step": 977 }, { "epoch": 2.4696969696969697, "grad_norm": 3.9444172382354736, "learning_rate": 3.5353535353535356e-06, "loss": 0.114, "step": 978 }, { "epoch": 2.4722222222222223, "grad_norm": 1.980569839477539, "learning_rate": 3.5185185185185187e-06, "loss": 0.0776, "step": 979 }, { "epoch": 2.474747474747475, "grad_norm": 2.1277084350585938, "learning_rate": 3.5016835016835023e-06, "loss": 0.1238, "step": 980 }, { "epoch": 2.4772727272727275, "grad_norm": 2.6043457984924316, "learning_rate": 3.4848484848484854e-06, "loss": 0.1507, "step": 981 }, { "epoch": 2.4797979797979797, "grad_norm": 1.3472402095794678, "learning_rate": 3.468013468013468e-06, "loss": 0.1353, "step": 982 }, { "epoch": 2.4823232323232323, "grad_norm": 4.820988655090332, "learning_rate": 3.451178451178451e-06, "loss": 0.1156, "step": 983 }, { "epoch": 2.484848484848485, "grad_norm": 3.138719320297241, "learning_rate": 3.4343434343434347e-06, "loss": 0.1345, "step": 984 }, { "epoch": 2.4873737373737375, "grad_norm": 1.767815113067627, "learning_rate": 3.417508417508418e-06, "loss": 0.0567, "step": 985 }, { "epoch": 2.48989898989899, "grad_norm": 1.7450860738754272, "learning_rate": 3.400673400673401e-06, "loss": 0.1188, "step": 986 }, { "epoch": 2.492424242424242, "grad_norm": 1.7766708135604858, "learning_rate": 3.3838383838383844e-06, "loss": 0.1241, "step": 987 }, { "epoch": 2.494949494949495, "grad_norm": 4.628079414367676, "learning_rate": 3.3670033670033675e-06, "loss": 0.1263, "step": 988 }, { "epoch": 2.4974747474747474, "grad_norm": 1.5541713237762451, "learning_rate": 3.3501683501683502e-06, "loss": 0.0608, "step": 989 }, { "epoch": 2.5, "grad_norm": 4.456207752227783, "learning_rate": 3.3333333333333333e-06, "loss": 0.1484, "step": 990 }, { "epoch": 2.5025252525252526, "grad_norm": 3.9640469551086426, "learning_rate": 3.316498316498317e-06, "loss": 0.1574, "step": 991 }, { "epoch": 2.505050505050505, "grad_norm": 1.5159541368484497, "learning_rate": 3.2996632996633e-06, "loss": 0.0687, "step": 992 }, { "epoch": 2.507575757575758, "grad_norm": 2.402961254119873, "learning_rate": 3.282828282828283e-06, "loss": 0.1176, "step": 993 }, { "epoch": 2.51010101010101, "grad_norm": 1.6217000484466553, "learning_rate": 3.2659932659932666e-06, "loss": 0.1544, "step": 994 }, { "epoch": 2.5126262626262625, "grad_norm": 3.1921989917755127, "learning_rate": 3.2491582491582497e-06, "loss": 0.1447, "step": 995 }, { "epoch": 2.515151515151515, "grad_norm": 1.179274082183838, "learning_rate": 3.232323232323233e-06, "loss": 0.0994, "step": 996 }, { "epoch": 2.5176767676767677, "grad_norm": 3.9791829586029053, "learning_rate": 3.2154882154882155e-06, "loss": 0.1909, "step": 997 }, { "epoch": 2.5202020202020203, "grad_norm": 2.757751941680908, "learning_rate": 3.1986531986531986e-06, "loss": 0.105, "step": 998 }, { "epoch": 2.5227272727272725, "grad_norm": 0.8614385724067688, "learning_rate": 3.181818181818182e-06, "loss": 0.0791, "step": 999 }, { "epoch": 2.525252525252525, "grad_norm": 0.6211748123168945, "learning_rate": 3.1649831649831652e-06, "loss": 0.0379, "step": 1000 }, { "epoch": 2.5277777777777777, "grad_norm": 2.238368272781372, "learning_rate": 3.1481481481481483e-06, "loss": 0.1195, "step": 1001 }, { "epoch": 2.5303030303030303, "grad_norm": 2.4499704837799072, "learning_rate": 3.131313131313132e-06, "loss": 0.1324, "step": 1002 }, { "epoch": 2.532828282828283, "grad_norm": 3.4274697303771973, "learning_rate": 3.114478114478115e-06, "loss": 0.1922, "step": 1003 }, { "epoch": 2.5353535353535355, "grad_norm": 2.302090883255005, "learning_rate": 3.0976430976430976e-06, "loss": 0.1323, "step": 1004 }, { "epoch": 2.537878787878788, "grad_norm": 3.9652259349823, "learning_rate": 3.0808080808080807e-06, "loss": 0.1251, "step": 1005 }, { "epoch": 2.5404040404040407, "grad_norm": 6.590030670166016, "learning_rate": 3.0639730639730643e-06, "loss": 0.0688, "step": 1006 }, { "epoch": 2.542929292929293, "grad_norm": 0.5998873114585876, "learning_rate": 3.0471380471380474e-06, "loss": 0.0546, "step": 1007 }, { "epoch": 2.5454545454545454, "grad_norm": 4.4240899085998535, "learning_rate": 3.0303030303030305e-06, "loss": 0.1345, "step": 1008 }, { "epoch": 2.547979797979798, "grad_norm": 2.6441352367401123, "learning_rate": 3.013468013468014e-06, "loss": 0.0666, "step": 1009 }, { "epoch": 2.5505050505050506, "grad_norm": 1.1558561325073242, "learning_rate": 2.996632996632997e-06, "loss": 0.0784, "step": 1010 }, { "epoch": 2.5530303030303028, "grad_norm": 1.2861305475234985, "learning_rate": 2.97979797979798e-06, "loss": 0.0839, "step": 1011 }, { "epoch": 2.5555555555555554, "grad_norm": 2.3291330337524414, "learning_rate": 2.962962962962963e-06, "loss": 0.0824, "step": 1012 }, { "epoch": 2.558080808080808, "grad_norm": 1.6665867567062378, "learning_rate": 2.9461279461279464e-06, "loss": 0.1121, "step": 1013 }, { "epoch": 2.5606060606060606, "grad_norm": 1.4039171934127808, "learning_rate": 2.9292929292929295e-06, "loss": 0.0941, "step": 1014 }, { "epoch": 2.563131313131313, "grad_norm": 1.706173062324524, "learning_rate": 2.9124579124579126e-06, "loss": 0.1561, "step": 1015 }, { "epoch": 2.5656565656565657, "grad_norm": 1.4657055139541626, "learning_rate": 2.895622895622896e-06, "loss": 0.0968, "step": 1016 }, { "epoch": 2.5681818181818183, "grad_norm": 2.3425521850585938, "learning_rate": 2.8787878787878793e-06, "loss": 0.0576, "step": 1017 }, { "epoch": 2.570707070707071, "grad_norm": 1.266230821609497, "learning_rate": 2.861952861952862e-06, "loss": 0.0754, "step": 1018 }, { "epoch": 2.573232323232323, "grad_norm": 2.496561288833618, "learning_rate": 2.845117845117845e-06, "loss": 0.0982, "step": 1019 }, { "epoch": 2.5757575757575757, "grad_norm": 2.888542890548706, "learning_rate": 2.8282828282828286e-06, "loss": 0.0865, "step": 1020 }, { "epoch": 2.5782828282828283, "grad_norm": 1.9701051712036133, "learning_rate": 2.8114478114478117e-06, "loss": 0.0496, "step": 1021 }, { "epoch": 2.580808080808081, "grad_norm": 5.326476573944092, "learning_rate": 2.794612794612795e-06, "loss": 0.1212, "step": 1022 }, { "epoch": 2.5833333333333335, "grad_norm": 3.695080041885376, "learning_rate": 2.7777777777777783e-06, "loss": 0.0967, "step": 1023 }, { "epoch": 2.5858585858585856, "grad_norm": 2.2361230850219727, "learning_rate": 2.7609427609427614e-06, "loss": 0.0793, "step": 1024 }, { "epoch": 2.5883838383838382, "grad_norm": 1.3065497875213623, "learning_rate": 2.7441077441077445e-06, "loss": 0.0676, "step": 1025 }, { "epoch": 2.590909090909091, "grad_norm": 2.1756739616394043, "learning_rate": 2.7272727272727272e-06, "loss": 0.1675, "step": 1026 }, { "epoch": 2.5934343434343434, "grad_norm": 2.2035090923309326, "learning_rate": 2.7104377104377103e-06, "loss": 0.1765, "step": 1027 }, { "epoch": 2.595959595959596, "grad_norm": 1.7042522430419922, "learning_rate": 2.693602693602694e-06, "loss": 0.1223, "step": 1028 }, { "epoch": 2.5984848484848486, "grad_norm": 1.2529280185699463, "learning_rate": 2.676767676767677e-06, "loss": 0.0723, "step": 1029 }, { "epoch": 2.601010101010101, "grad_norm": 1.5967926979064941, "learning_rate": 2.65993265993266e-06, "loss": 0.1243, "step": 1030 }, { "epoch": 2.6035353535353534, "grad_norm": 1.8551892042160034, "learning_rate": 2.6430976430976436e-06, "loss": 0.0677, "step": 1031 }, { "epoch": 2.606060606060606, "grad_norm": 0.9810446500778198, "learning_rate": 2.6262626262626267e-06, "loss": 0.0399, "step": 1032 }, { "epoch": 2.6085858585858586, "grad_norm": 4.027339935302734, "learning_rate": 2.6094276094276094e-06, "loss": 0.1253, "step": 1033 }, { "epoch": 2.611111111111111, "grad_norm": 1.6822688579559326, "learning_rate": 2.5925925925925925e-06, "loss": 0.1235, "step": 1034 }, { "epoch": 2.6136363636363638, "grad_norm": 2.5733704566955566, "learning_rate": 2.575757575757576e-06, "loss": 0.094, "step": 1035 }, { "epoch": 2.616161616161616, "grad_norm": 2.587446689605713, "learning_rate": 2.558922558922559e-06, "loss": 0.0614, "step": 1036 }, { "epoch": 2.6186868686868685, "grad_norm": 3.116171360015869, "learning_rate": 2.5420875420875422e-06, "loss": 0.063, "step": 1037 }, { "epoch": 2.621212121212121, "grad_norm": 4.079165458679199, "learning_rate": 2.5252525252525258e-06, "loss": 0.1302, "step": 1038 }, { "epoch": 2.6237373737373737, "grad_norm": 3.22881817817688, "learning_rate": 2.508417508417509e-06, "loss": 0.1311, "step": 1039 }, { "epoch": 2.6262626262626263, "grad_norm": 2.3561739921569824, "learning_rate": 2.491582491582492e-06, "loss": 0.1138, "step": 1040 }, { "epoch": 2.628787878787879, "grad_norm": 1.6347684860229492, "learning_rate": 2.474747474747475e-06, "loss": 0.1246, "step": 1041 }, { "epoch": 2.6313131313131315, "grad_norm": 2.9931626319885254, "learning_rate": 2.457912457912458e-06, "loss": 0.1445, "step": 1042 }, { "epoch": 2.633838383838384, "grad_norm": 0.5848364233970642, "learning_rate": 2.4410774410774413e-06, "loss": 0.0661, "step": 1043 }, { "epoch": 2.6363636363636362, "grad_norm": 4.181141376495361, "learning_rate": 2.4242424242424244e-06, "loss": 0.1234, "step": 1044 }, { "epoch": 2.638888888888889, "grad_norm": 5.948246002197266, "learning_rate": 2.4074074074074075e-06, "loss": 0.134, "step": 1045 }, { "epoch": 2.6414141414141414, "grad_norm": 1.8077932596206665, "learning_rate": 2.3905723905723906e-06, "loss": 0.1052, "step": 1046 }, { "epoch": 2.643939393939394, "grad_norm": 4.848948955535889, "learning_rate": 2.373737373737374e-06, "loss": 0.1963, "step": 1047 }, { "epoch": 2.6464646464646466, "grad_norm": 2.3405141830444336, "learning_rate": 2.3569023569023572e-06, "loss": 0.24, "step": 1048 }, { "epoch": 2.648989898989899, "grad_norm": 3.162492036819458, "learning_rate": 2.3400673400673403e-06, "loss": 0.0911, "step": 1049 }, { "epoch": 2.6515151515151514, "grad_norm": 4.6703619956970215, "learning_rate": 2.3232323232323234e-06, "loss": 0.0713, "step": 1050 }, { "epoch": 2.654040404040404, "grad_norm": 1.252194881439209, "learning_rate": 2.3063973063973065e-06, "loss": 0.0678, "step": 1051 }, { "epoch": 2.6565656565656566, "grad_norm": 1.4940955638885498, "learning_rate": 2.28956228956229e-06, "loss": 0.0321, "step": 1052 }, { "epoch": 2.659090909090909, "grad_norm": 2.759089469909668, "learning_rate": 2.2727272727272728e-06, "loss": 0.0759, "step": 1053 }, { "epoch": 2.6616161616161618, "grad_norm": 4.008279800415039, "learning_rate": 2.2558922558922563e-06, "loss": 0.1421, "step": 1054 }, { "epoch": 2.6641414141414144, "grad_norm": 2.280316114425659, "learning_rate": 2.2390572390572394e-06, "loss": 0.0971, "step": 1055 }, { "epoch": 2.6666666666666665, "grad_norm": 1.5876095294952393, "learning_rate": 2.222222222222222e-06, "loss": 0.0945, "step": 1056 }, { "epoch": 2.669191919191919, "grad_norm": 2.7003700733184814, "learning_rate": 2.2053872053872056e-06, "loss": 0.1862, "step": 1057 }, { "epoch": 2.6717171717171717, "grad_norm": 2.837354898452759, "learning_rate": 2.1885521885521887e-06, "loss": 0.0816, "step": 1058 }, { "epoch": 2.6742424242424243, "grad_norm": 1.9325331449508667, "learning_rate": 2.171717171717172e-06, "loss": 0.09, "step": 1059 }, { "epoch": 2.676767676767677, "grad_norm": 1.9655112028121948, "learning_rate": 2.154882154882155e-06, "loss": 0.189, "step": 1060 }, { "epoch": 2.679292929292929, "grad_norm": 0.8985033631324768, "learning_rate": 2.138047138047138e-06, "loss": 0.0415, "step": 1061 }, { "epoch": 2.6818181818181817, "grad_norm": 2.287306785583496, "learning_rate": 2.1212121212121216e-06, "loss": 0.2154, "step": 1062 }, { "epoch": 2.6843434343434343, "grad_norm": 2.1749632358551025, "learning_rate": 2.1043771043771047e-06, "loss": 0.1085, "step": 1063 }, { "epoch": 2.686868686868687, "grad_norm": 3.1133999824523926, "learning_rate": 2.0875420875420878e-06, "loss": 0.109, "step": 1064 }, { "epoch": 2.6893939393939394, "grad_norm": 1.5289435386657715, "learning_rate": 2.070707070707071e-06, "loss": 0.0488, "step": 1065 }, { "epoch": 2.691919191919192, "grad_norm": 2.7709944248199463, "learning_rate": 2.053872053872054e-06, "loss": 0.1032, "step": 1066 }, { "epoch": 2.6944444444444446, "grad_norm": 3.149768114089966, "learning_rate": 2.037037037037037e-06, "loss": 0.0486, "step": 1067 }, { "epoch": 2.6969696969696972, "grad_norm": 3.0890722274780273, "learning_rate": 2.02020202020202e-06, "loss": 0.1869, "step": 1068 }, { "epoch": 2.6994949494949494, "grad_norm": 4.697057247161865, "learning_rate": 2.0033670033670037e-06, "loss": 0.2966, "step": 1069 }, { "epoch": 2.702020202020202, "grad_norm": 3.644277334213257, "learning_rate": 1.986531986531987e-06, "loss": 0.0869, "step": 1070 }, { "epoch": 2.7045454545454546, "grad_norm": 1.996146559715271, "learning_rate": 1.96969696969697e-06, "loss": 0.1297, "step": 1071 }, { "epoch": 2.707070707070707, "grad_norm": 1.3258694410324097, "learning_rate": 1.952861952861953e-06, "loss": 0.0937, "step": 1072 }, { "epoch": 2.7095959595959593, "grad_norm": 2.5805246829986572, "learning_rate": 1.936026936026936e-06, "loss": 0.107, "step": 1073 }, { "epoch": 2.712121212121212, "grad_norm": 1.8007394075393677, "learning_rate": 1.9191919191919192e-06, "loss": 0.0987, "step": 1074 }, { "epoch": 2.7146464646464645, "grad_norm": 2.052168369293213, "learning_rate": 1.9023569023569026e-06, "loss": 0.0753, "step": 1075 }, { "epoch": 2.717171717171717, "grad_norm": 1.795806646347046, "learning_rate": 1.8855218855218857e-06, "loss": 0.0898, "step": 1076 }, { "epoch": 2.7196969696969697, "grad_norm": 2.1112513542175293, "learning_rate": 1.868686868686869e-06, "loss": 0.0948, "step": 1077 }, { "epoch": 2.7222222222222223, "grad_norm": 1.7274150848388672, "learning_rate": 1.8518518518518519e-06, "loss": 0.0699, "step": 1078 }, { "epoch": 2.724747474747475, "grad_norm": 3.7306082248687744, "learning_rate": 1.8350168350168352e-06, "loss": 0.138, "step": 1079 }, { "epoch": 2.7272727272727275, "grad_norm": 1.8672465085983276, "learning_rate": 1.8181818181818183e-06, "loss": 0.0541, "step": 1080 }, { "epoch": 2.7297979797979797, "grad_norm": 2.303978443145752, "learning_rate": 1.8013468013468016e-06, "loss": 0.1123, "step": 1081 }, { "epoch": 2.7323232323232323, "grad_norm": 1.74871027469635, "learning_rate": 1.7845117845117845e-06, "loss": 0.1266, "step": 1082 }, { "epoch": 2.734848484848485, "grad_norm": 3.29699969291687, "learning_rate": 1.7676767676767678e-06, "loss": 0.1918, "step": 1083 }, { "epoch": 2.7373737373737375, "grad_norm": 2.935121774673462, "learning_rate": 1.7508417508417511e-06, "loss": 0.115, "step": 1084 }, { "epoch": 2.73989898989899, "grad_norm": 4.8938140869140625, "learning_rate": 1.734006734006734e-06, "loss": 0.1566, "step": 1085 }, { "epoch": 2.742424242424242, "grad_norm": 3.4594430923461914, "learning_rate": 1.7171717171717173e-06, "loss": 0.0905, "step": 1086 }, { "epoch": 2.744949494949495, "grad_norm": 2.121217966079712, "learning_rate": 1.7003367003367005e-06, "loss": 0.1123, "step": 1087 }, { "epoch": 2.7474747474747474, "grad_norm": 2.414285182952881, "learning_rate": 1.6835016835016838e-06, "loss": 0.1652, "step": 1088 }, { "epoch": 2.75, "grad_norm": 3.6288323402404785, "learning_rate": 1.6666666666666667e-06, "loss": 0.0463, "step": 1089 }, { "epoch": 2.7525252525252526, "grad_norm": 1.9368160963058472, "learning_rate": 1.64983164983165e-06, "loss": 0.0567, "step": 1090 }, { "epoch": 2.755050505050505, "grad_norm": 1.847935438156128, "learning_rate": 1.6329966329966333e-06, "loss": 0.0958, "step": 1091 }, { "epoch": 2.757575757575758, "grad_norm": 1.821707010269165, "learning_rate": 1.6161616161616164e-06, "loss": 0.0851, "step": 1092 }, { "epoch": 2.76010101010101, "grad_norm": 3.361027240753174, "learning_rate": 1.5993265993265993e-06, "loss": 0.0712, "step": 1093 }, { "epoch": 2.7626262626262625, "grad_norm": 1.8871111869812012, "learning_rate": 1.5824915824915826e-06, "loss": 0.0637, "step": 1094 }, { "epoch": 2.765151515151515, "grad_norm": 3.3805835247039795, "learning_rate": 1.565656565656566e-06, "loss": 0.166, "step": 1095 }, { "epoch": 2.7676767676767677, "grad_norm": 1.451699137687683, "learning_rate": 1.5488215488215488e-06, "loss": 0.1075, "step": 1096 }, { "epoch": 2.7702020202020203, "grad_norm": 1.6252110004425049, "learning_rate": 1.5319865319865321e-06, "loss": 0.0511, "step": 1097 }, { "epoch": 2.7727272727272725, "grad_norm": 1.8269497156143188, "learning_rate": 1.5151515151515152e-06, "loss": 0.0603, "step": 1098 }, { "epoch": 2.775252525252525, "grad_norm": 1.9480081796646118, "learning_rate": 1.4983164983164986e-06, "loss": 0.1297, "step": 1099 }, { "epoch": 2.7777777777777777, "grad_norm": 1.0791457891464233, "learning_rate": 1.4814814814814815e-06, "loss": 0.127, "step": 1100 }, { "epoch": 2.7803030303030303, "grad_norm": 1.6918015480041504, "learning_rate": 1.4646464646464648e-06, "loss": 0.1229, "step": 1101 }, { "epoch": 2.782828282828283, "grad_norm": 1.6666957139968872, "learning_rate": 1.447811447811448e-06, "loss": 0.1041, "step": 1102 }, { "epoch": 2.7853535353535355, "grad_norm": 1.4526945352554321, "learning_rate": 1.430976430976431e-06, "loss": 0.1327, "step": 1103 }, { "epoch": 2.787878787878788, "grad_norm": 4.764105319976807, "learning_rate": 1.4141414141414143e-06, "loss": 0.1007, "step": 1104 }, { "epoch": 2.7904040404040407, "grad_norm": 1.458585262298584, "learning_rate": 1.3973063973063974e-06, "loss": 0.0867, "step": 1105 }, { "epoch": 2.792929292929293, "grad_norm": 1.1463141441345215, "learning_rate": 1.3804713804713807e-06, "loss": 0.0722, "step": 1106 }, { "epoch": 2.7954545454545454, "grad_norm": 2.6391751766204834, "learning_rate": 1.3636363636363636e-06, "loss": 0.0808, "step": 1107 }, { "epoch": 2.797979797979798, "grad_norm": 2.5230796337127686, "learning_rate": 1.346801346801347e-06, "loss": 0.1696, "step": 1108 }, { "epoch": 2.8005050505050506, "grad_norm": 2.990051507949829, "learning_rate": 1.32996632996633e-06, "loss": 0.1824, "step": 1109 }, { "epoch": 2.8030303030303028, "grad_norm": 5.150264739990234, "learning_rate": 1.3131313131313134e-06, "loss": 0.08, "step": 1110 }, { "epoch": 2.8055555555555554, "grad_norm": 2.4451775550842285, "learning_rate": 1.2962962962962962e-06, "loss": 0.0639, "step": 1111 }, { "epoch": 2.808080808080808, "grad_norm": 8.441463470458984, "learning_rate": 1.2794612794612796e-06, "loss": 0.071, "step": 1112 }, { "epoch": 2.8106060606060606, "grad_norm": 7.7809882164001465, "learning_rate": 1.2626262626262629e-06, "loss": 0.0897, "step": 1113 }, { "epoch": 2.813131313131313, "grad_norm": 8.197009086608887, "learning_rate": 1.245791245791246e-06, "loss": 0.1037, "step": 1114 }, { "epoch": 2.8156565656565657, "grad_norm": 2.672224283218384, "learning_rate": 1.228956228956229e-06, "loss": 0.0338, "step": 1115 }, { "epoch": 2.8181818181818183, "grad_norm": 2.55483078956604, "learning_rate": 1.2121212121212122e-06, "loss": 0.0677, "step": 1116 }, { "epoch": 2.820707070707071, "grad_norm": 7.761810779571533, "learning_rate": 1.1952861952861953e-06, "loss": 0.0817, "step": 1117 }, { "epoch": 2.823232323232323, "grad_norm": 2.313318967819214, "learning_rate": 1.1784511784511786e-06, "loss": 0.1253, "step": 1118 }, { "epoch": 2.8257575757575757, "grad_norm": 0.8076485991477966, "learning_rate": 1.1616161616161617e-06, "loss": 0.0363, "step": 1119 }, { "epoch": 2.8282828282828283, "grad_norm": 2.6288771629333496, "learning_rate": 1.144781144781145e-06, "loss": 0.1451, "step": 1120 }, { "epoch": 2.830808080808081, "grad_norm": 1.7148422002792358, "learning_rate": 1.1279461279461281e-06, "loss": 0.1067, "step": 1121 }, { "epoch": 2.8333333333333335, "grad_norm": 1.2999204397201538, "learning_rate": 1.111111111111111e-06, "loss": 0.0544, "step": 1122 }, { "epoch": 2.8358585858585856, "grad_norm": 2.9060170650482178, "learning_rate": 1.0942760942760944e-06, "loss": 0.1528, "step": 1123 }, { "epoch": 2.8383838383838382, "grad_norm": 2.594888210296631, "learning_rate": 1.0774410774410775e-06, "loss": 0.104, "step": 1124 }, { "epoch": 2.840909090909091, "grad_norm": 7.884887218475342, "learning_rate": 1.0606060606060608e-06, "loss": 0.1301, "step": 1125 }, { "epoch": 2.8434343434343434, "grad_norm": 1.9427886009216309, "learning_rate": 1.0437710437710439e-06, "loss": 0.2201, "step": 1126 }, { "epoch": 2.845959595959596, "grad_norm": 6.63613748550415, "learning_rate": 1.026936026936027e-06, "loss": 0.1505, "step": 1127 }, { "epoch": 2.8484848484848486, "grad_norm": 2.172806739807129, "learning_rate": 1.01010101010101e-06, "loss": 0.1074, "step": 1128 }, { "epoch": 2.851010101010101, "grad_norm": 2.2825562953948975, "learning_rate": 9.932659932659934e-07, "loss": 0.1252, "step": 1129 }, { "epoch": 2.8535353535353534, "grad_norm": 1.1408872604370117, "learning_rate": 9.764309764309765e-07, "loss": 0.1207, "step": 1130 }, { "epoch": 2.856060606060606, "grad_norm": 2.4947509765625, "learning_rate": 9.595959595959596e-07, "loss": 0.0963, "step": 1131 }, { "epoch": 2.8585858585858586, "grad_norm": 7.295626640319824, "learning_rate": 9.427609427609428e-07, "loss": 0.1011, "step": 1132 }, { "epoch": 2.861111111111111, "grad_norm": 9.468647956848145, "learning_rate": 9.259259259259259e-07, "loss": 0.0915, "step": 1133 }, { "epoch": 2.8636363636363638, "grad_norm": 1.7602087259292603, "learning_rate": 9.090909090909091e-07, "loss": 0.0556, "step": 1134 }, { "epoch": 2.866161616161616, "grad_norm": 1.6855865716934204, "learning_rate": 8.922558922558923e-07, "loss": 0.0916, "step": 1135 }, { "epoch": 2.8686868686868685, "grad_norm": 3.8684542179107666, "learning_rate": 8.754208754208756e-07, "loss": 0.0927, "step": 1136 }, { "epoch": 2.871212121212121, "grad_norm": 1.5681943893432617, "learning_rate": 8.585858585858587e-07, "loss": 0.0907, "step": 1137 }, { "epoch": 2.8737373737373737, "grad_norm": 2.357790470123291, "learning_rate": 8.417508417508419e-07, "loss": 0.0963, "step": 1138 }, { "epoch": 2.8762626262626263, "grad_norm": 2.0638039112091064, "learning_rate": 8.24915824915825e-07, "loss": 0.1217, "step": 1139 }, { "epoch": 2.878787878787879, "grad_norm": 7.039210319519043, "learning_rate": 8.080808080808082e-07, "loss": 0.1581, "step": 1140 }, { "epoch": 2.8813131313131315, "grad_norm": 2.2965760231018066, "learning_rate": 7.912457912457913e-07, "loss": 0.1482, "step": 1141 }, { "epoch": 2.883838383838384, "grad_norm": 1.5618226528167725, "learning_rate": 7.744107744107744e-07, "loss": 0.1567, "step": 1142 }, { "epoch": 2.8863636363636362, "grad_norm": 1.2720274925231934, "learning_rate": 7.575757575757576e-07, "loss": 0.1048, "step": 1143 }, { "epoch": 2.888888888888889, "grad_norm": 1.6947522163391113, "learning_rate": 7.407407407407407e-07, "loss": 0.0891, "step": 1144 }, { "epoch": 2.8914141414141414, "grad_norm": 3.2767159938812256, "learning_rate": 7.23905723905724e-07, "loss": 0.2284, "step": 1145 }, { "epoch": 2.893939393939394, "grad_norm": 12.075784683227539, "learning_rate": 7.070707070707071e-07, "loss": 0.1004, "step": 1146 }, { "epoch": 2.8964646464646466, "grad_norm": 1.556806206703186, "learning_rate": 6.902356902356904e-07, "loss": 0.1137, "step": 1147 }, { "epoch": 2.898989898989899, "grad_norm": 3.214446783065796, "learning_rate": 6.734006734006735e-07, "loss": 0.1453, "step": 1148 }, { "epoch": 2.9015151515151514, "grad_norm": 2.274674892425537, "learning_rate": 6.565656565656567e-07, "loss": 0.1567, "step": 1149 }, { "epoch": 2.904040404040404, "grad_norm": 4.8869781494140625, "learning_rate": 6.397306397306398e-07, "loss": 0.2099, "step": 1150 }, { "epoch": 2.9065656565656566, "grad_norm": 4.9651923179626465, "learning_rate": 6.22895622895623e-07, "loss": 0.1808, "step": 1151 }, { "epoch": 2.909090909090909, "grad_norm": 4.156426906585693, "learning_rate": 6.060606060606061e-07, "loss": 0.0797, "step": 1152 }, { "epoch": 2.9116161616161618, "grad_norm": 2.8879013061523438, "learning_rate": 5.892255892255893e-07, "loss": 0.1232, "step": 1153 }, { "epoch": 2.9141414141414144, "grad_norm": 1.8005517721176147, "learning_rate": 5.723905723905725e-07, "loss": 0.112, "step": 1154 }, { "epoch": 2.9166666666666665, "grad_norm": 3.8166842460632324, "learning_rate": 5.555555555555555e-07, "loss": 0.0776, "step": 1155 }, { "epoch": 2.919191919191919, "grad_norm": 4.17734432220459, "learning_rate": 5.387205387205387e-07, "loss": 0.2496, "step": 1156 }, { "epoch": 2.9217171717171717, "grad_norm": 2.027888536453247, "learning_rate": 5.218855218855219e-07, "loss": 0.1184, "step": 1157 }, { "epoch": 2.9242424242424243, "grad_norm": 0.865708589553833, "learning_rate": 5.05050505050505e-07, "loss": 0.0604, "step": 1158 }, { "epoch": 2.926767676767677, "grad_norm": 1.5890415906906128, "learning_rate": 4.882154882154883e-07, "loss": 0.1305, "step": 1159 }, { "epoch": 2.929292929292929, "grad_norm": 1.054485559463501, "learning_rate": 4.713804713804714e-07, "loss": 0.077, "step": 1160 }, { "epoch": 2.9318181818181817, "grad_norm": 1.1664531230926514, "learning_rate": 4.5454545454545457e-07, "loss": 0.065, "step": 1161 }, { "epoch": 2.9343434343434343, "grad_norm": 1.196090579032898, "learning_rate": 4.377104377104378e-07, "loss": 0.0854, "step": 1162 }, { "epoch": 2.936868686868687, "grad_norm": 1.983268141746521, "learning_rate": 4.2087542087542094e-07, "loss": 0.1021, "step": 1163 }, { "epoch": 2.9393939393939394, "grad_norm": 5.308765888214111, "learning_rate": 4.040404040404041e-07, "loss": 0.1535, "step": 1164 }, { "epoch": 2.941919191919192, "grad_norm": 3.1391713619232178, "learning_rate": 3.872053872053872e-07, "loss": 0.1295, "step": 1165 }, { "epoch": 2.9444444444444446, "grad_norm": 1.9112738370895386, "learning_rate": 3.7037037037037036e-07, "loss": 0.1338, "step": 1166 }, { "epoch": 2.9469696969696972, "grad_norm": 1.7345768213272095, "learning_rate": 3.535353535353536e-07, "loss": 0.1048, "step": 1167 }, { "epoch": 2.9494949494949494, "grad_norm": 1.8400707244873047, "learning_rate": 3.3670033670033673e-07, "loss": 0.1345, "step": 1168 }, { "epoch": 2.952020202020202, "grad_norm": 5.5112152099609375, "learning_rate": 3.198653198653199e-07, "loss": 0.0901, "step": 1169 }, { "epoch": 2.9545454545454546, "grad_norm": 1.7662899494171143, "learning_rate": 3.0303030303030305e-07, "loss": 0.1025, "step": 1170 }, { "epoch": 2.957070707070707, "grad_norm": 5.364653587341309, "learning_rate": 2.8619528619528626e-07, "loss": 0.098, "step": 1171 }, { "epoch": 2.9595959595959593, "grad_norm": 1.2001750469207764, "learning_rate": 2.6936026936026936e-07, "loss": 0.049, "step": 1172 }, { "epoch": 2.962121212121212, "grad_norm": 2.842573642730713, "learning_rate": 2.525252525252525e-07, "loss": 0.0885, "step": 1173 }, { "epoch": 2.9646464646464645, "grad_norm": 1.9140822887420654, "learning_rate": 2.356902356902357e-07, "loss": 0.1336, "step": 1174 }, { "epoch": 2.967171717171717, "grad_norm": 1.2715041637420654, "learning_rate": 2.188552188552189e-07, "loss": 0.044, "step": 1175 }, { "epoch": 2.9696969696969697, "grad_norm": 1.805606722831726, "learning_rate": 2.0202020202020205e-07, "loss": 0.1139, "step": 1176 }, { "epoch": 2.9722222222222223, "grad_norm": 0.7524275183677673, "learning_rate": 1.8518518518518518e-07, "loss": 0.038, "step": 1177 }, { "epoch": 2.974747474747475, "grad_norm": 1.4970057010650635, "learning_rate": 1.6835016835016837e-07, "loss": 0.1246, "step": 1178 }, { "epoch": 2.9772727272727275, "grad_norm": 2.653041124343872, "learning_rate": 1.5151515151515152e-07, "loss": 0.1941, "step": 1179 }, { "epoch": 2.9797979797979797, "grad_norm": 2.8758771419525146, "learning_rate": 1.3468013468013468e-07, "loss": 0.1387, "step": 1180 }, { "epoch": 2.9823232323232323, "grad_norm": 4.085249423980713, "learning_rate": 1.1784511784511785e-07, "loss": 0.0822, "step": 1181 }, { "epoch": 2.984848484848485, "grad_norm": 2.2607507705688477, "learning_rate": 1.0101010101010103e-07, "loss": 0.1064, "step": 1182 }, { "epoch": 2.9873737373737375, "grad_norm": 2.853379726409912, "learning_rate": 8.417508417508418e-08, "loss": 0.0615, "step": 1183 }, { "epoch": 2.98989898989899, "grad_norm": 3.8462393283843994, "learning_rate": 6.734006734006734e-08, "loss": 0.1311, "step": 1184 }, { "epoch": 2.992424242424242, "grad_norm": 4.459750652313232, "learning_rate": 5.050505050505051e-08, "loss": 0.2523, "step": 1185 }, { "epoch": 2.994949494949495, "grad_norm": 2.9024791717529297, "learning_rate": 3.367003367003367e-08, "loss": 0.0775, "step": 1186 }, { "epoch": 2.9974747474747474, "grad_norm": 2.9558804035186768, "learning_rate": 1.6835016835016835e-08, "loss": 0.1257, "step": 1187 }, { "epoch": 3.0, "grad_norm": 2.027782678604126, "learning_rate": 0.0, "loss": 0.1071, "step": 1188 }, { "epoch": 3.0, "eval_accuracy": 0.775, "eval_f1": 0.9063876651982378, "eval_loss": 0.13235561549663544, "eval_runtime": 45.6825, "eval_samples_per_second": 19.263, "eval_steps_per_second": 0.416, "step": 1188 } ], "logging_steps": 1, "max_steps": 1188, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 251588081479680.0, "train_batch_size": 20, "trial_name": null, "trial_params": null }