{
"best_global_step": 1188,
"best_metric": 0.9063876651982378,
"best_model_checkpoint": "./albert_multilabel_large\\checkpoint-1188",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1188,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025252525252525255,
"grad_norm": 11.870144844055176,
"learning_rate": 1.9983164983164986e-05,
"loss": 0.758,
"step": 1
},
{
"epoch": 0.005050505050505051,
"grad_norm": 3.960209369659424,
"learning_rate": 1.9966329966329967e-05,
"loss": 0.6626,
"step": 2
},
{
"epoch": 0.007575757575757576,
"grad_norm": 4.163188457489014,
"learning_rate": 1.994949494949495e-05,
"loss": 0.6126,
"step": 3
},
{
"epoch": 0.010101010101010102,
"grad_norm": 3.630805253982544,
"learning_rate": 1.9932659932659936e-05,
"loss": 0.6105,
"step": 4
},
{
"epoch": 0.012626262626262626,
"grad_norm": 3.7358059883117676,
"learning_rate": 1.9915824915824917e-05,
"loss": 0.5486,
"step": 5
},
{
"epoch": 0.015151515151515152,
"grad_norm": 4.360195636749268,
"learning_rate": 1.98989898989899e-05,
"loss": 0.4834,
"step": 6
},
{
"epoch": 0.017676767676767676,
"grad_norm": 4.5092549324035645,
"learning_rate": 1.9882154882154885e-05,
"loss": 0.4723,
"step": 7
},
{
"epoch": 0.020202020202020204,
"grad_norm": 3.9408679008483887,
"learning_rate": 1.9865319865319866e-05,
"loss": 0.4853,
"step": 8
},
{
"epoch": 0.022727272727272728,
"grad_norm": 253.27503967285156,
"learning_rate": 1.984848484848485e-05,
"loss": 0.5783,
"step": 9
},
{
"epoch": 0.025252525252525252,
"grad_norm": 17.573854446411133,
"learning_rate": 1.9831649831649832e-05,
"loss": 0.3991,
"step": 10
},
{
"epoch": 0.027777777777777776,
"grad_norm": 4.111778259277344,
"learning_rate": 1.9814814814814816e-05,
"loss": 0.4054,
"step": 11
},
{
"epoch": 0.030303030303030304,
"grad_norm": 2.9756879806518555,
"learning_rate": 1.97979797979798e-05,
"loss": 0.4636,
"step": 12
},
{
"epoch": 0.03282828282828283,
"grad_norm": 1.9542008638381958,
"learning_rate": 1.978114478114478e-05,
"loss": 0.3732,
"step": 13
},
{
"epoch": 0.03535353535353535,
"grad_norm": 2.1436798572540283,
"learning_rate": 1.9764309764309766e-05,
"loss": 0.3341,
"step": 14
},
{
"epoch": 0.03787878787878788,
"grad_norm": 2.5457680225372314,
"learning_rate": 1.9747474747474747e-05,
"loss": 0.358,
"step": 15
},
{
"epoch": 0.04040404040404041,
"grad_norm": 2.3681640625,
"learning_rate": 1.973063973063973e-05,
"loss": 0.3813,
"step": 16
},
{
"epoch": 0.04292929292929293,
"grad_norm": 3.3765199184417725,
"learning_rate": 1.9713804713804716e-05,
"loss": 0.3288,
"step": 17
},
{
"epoch": 0.045454545454545456,
"grad_norm": 2.5906496047973633,
"learning_rate": 1.96969696969697e-05,
"loss": 0.3025,
"step": 18
},
{
"epoch": 0.047979797979797977,
"grad_norm": 1.6513965129852295,
"learning_rate": 1.968013468013468e-05,
"loss": 0.287,
"step": 19
},
{
"epoch": 0.050505050505050504,
"grad_norm": 2.1033501625061035,
"learning_rate": 1.9663299663299665e-05,
"loss": 0.4552,
"step": 20
},
{
"epoch": 0.05303030303030303,
"grad_norm": 2.6947014331817627,
"learning_rate": 1.964646464646465e-05,
"loss": 0.3561,
"step": 21
},
{
"epoch": 0.05555555555555555,
"grad_norm": 1.4776068925857544,
"learning_rate": 1.962962962962963e-05,
"loss": 0.3564,
"step": 22
},
{
"epoch": 0.05808080808080808,
"grad_norm": 1.8511464595794678,
"learning_rate": 1.9612794612794615e-05,
"loss": 0.3851,
"step": 23
},
{
"epoch": 0.06060606060606061,
"grad_norm": 1.9145028591156006,
"learning_rate": 1.9595959595959596e-05,
"loss": 0.3591,
"step": 24
},
{
"epoch": 0.06313131313131314,
"grad_norm": 3.7978272438049316,
"learning_rate": 1.957912457912458e-05,
"loss": 0.2914,
"step": 25
},
{
"epoch": 0.06565656565656566,
"grad_norm": 1.9927159547805786,
"learning_rate": 1.9562289562289565e-05,
"loss": 0.276,
"step": 26
},
{
"epoch": 0.06818181818181818,
"grad_norm": 1.7165324687957764,
"learning_rate": 1.9545454545454546e-05,
"loss": 0.3855,
"step": 27
},
{
"epoch": 0.0707070707070707,
"grad_norm": 3.547311544418335,
"learning_rate": 1.952861952861953e-05,
"loss": 0.3146,
"step": 28
},
{
"epoch": 0.07323232323232323,
"grad_norm": 2.205611228942871,
"learning_rate": 1.951178451178451e-05,
"loss": 0.2515,
"step": 29
},
{
"epoch": 0.07575757575757576,
"grad_norm": 131.6199951171875,
"learning_rate": 1.9494949494949496e-05,
"loss": 0.4546,
"step": 30
},
{
"epoch": 0.07828282828282829,
"grad_norm": 330.8481140136719,
"learning_rate": 1.947811447811448e-05,
"loss": 0.425,
"step": 31
},
{
"epoch": 0.08080808080808081,
"grad_norm": 24.042455673217773,
"learning_rate": 1.9461279461279464e-05,
"loss": 0.3878,
"step": 32
},
{
"epoch": 0.08333333333333333,
"grad_norm": 9.382911682128906,
"learning_rate": 1.9444444444444445e-05,
"loss": 0.3594,
"step": 33
},
{
"epoch": 0.08585858585858586,
"grad_norm": 2.793823003768921,
"learning_rate": 1.942760942760943e-05,
"loss": 0.3946,
"step": 34
},
{
"epoch": 0.08838383838383838,
"grad_norm": 2.098381280899048,
"learning_rate": 1.9410774410774414e-05,
"loss": 0.2873,
"step": 35
},
{
"epoch": 0.09090909090909091,
"grad_norm": 3.7400736808776855,
"learning_rate": 1.9393939393939395e-05,
"loss": 0.1903,
"step": 36
},
{
"epoch": 0.09343434343434344,
"grad_norm": 2.984248638153076,
"learning_rate": 1.937710437710438e-05,
"loss": 0.3758,
"step": 37
},
{
"epoch": 0.09595959595959595,
"grad_norm": 2.084982395172119,
"learning_rate": 1.936026936026936e-05,
"loss": 0.2711,
"step": 38
},
{
"epoch": 0.09848484848484848,
"grad_norm": 2.7394371032714844,
"learning_rate": 1.9343434343434345e-05,
"loss": 0.3333,
"step": 39
},
{
"epoch": 0.10101010101010101,
"grad_norm": 3.4980244636535645,
"learning_rate": 1.932659932659933e-05,
"loss": 0.253,
"step": 40
},
{
"epoch": 0.10353535353535354,
"grad_norm": 3.121978521347046,
"learning_rate": 1.930976430976431e-05,
"loss": 0.2335,
"step": 41
},
{
"epoch": 0.10606060606060606,
"grad_norm": 2.696462631225586,
"learning_rate": 1.9292929292929295e-05,
"loss": 0.3397,
"step": 42
},
{
"epoch": 0.10858585858585859,
"grad_norm": 3.063912868499756,
"learning_rate": 1.9276094276094276e-05,
"loss": 0.3242,
"step": 43
},
{
"epoch": 0.1111111111111111,
"grad_norm": 8.048778533935547,
"learning_rate": 1.925925925925926e-05,
"loss": 0.3208,
"step": 44
},
{
"epoch": 0.11363636363636363,
"grad_norm": 10.508525848388672,
"learning_rate": 1.9242424242424244e-05,
"loss": 0.3138,
"step": 45
},
{
"epoch": 0.11616161616161616,
"grad_norm": 2.972494125366211,
"learning_rate": 1.922558922558923e-05,
"loss": 0.2749,
"step": 46
},
{
"epoch": 0.11868686868686869,
"grad_norm": 2.6326518058776855,
"learning_rate": 1.920875420875421e-05,
"loss": 0.2419,
"step": 47
},
{
"epoch": 0.12121212121212122,
"grad_norm": 3.0405683517456055,
"learning_rate": 1.9191919191919194e-05,
"loss": 0.3422,
"step": 48
},
{
"epoch": 0.12373737373737374,
"grad_norm": 5.278780460357666,
"learning_rate": 1.917508417508418e-05,
"loss": 0.2458,
"step": 49
},
{
"epoch": 0.12626262626262627,
"grad_norm": 4.309386730194092,
"learning_rate": 1.915824915824916e-05,
"loss": 0.2252,
"step": 50
},
{
"epoch": 0.12878787878787878,
"grad_norm": 2.3794400691986084,
"learning_rate": 1.9141414141414144e-05,
"loss": 0.1916,
"step": 51
},
{
"epoch": 0.13131313131313133,
"grad_norm": 3.2079036235809326,
"learning_rate": 1.9124579124579125e-05,
"loss": 0.2763,
"step": 52
},
{
"epoch": 0.13383838383838384,
"grad_norm": 6.404500961303711,
"learning_rate": 1.910774410774411e-05,
"loss": 0.3073,
"step": 53
},
{
"epoch": 0.13636363636363635,
"grad_norm": 3.2996926307678223,
"learning_rate": 1.9090909090909094e-05,
"loss": 0.2442,
"step": 54
},
{
"epoch": 0.1388888888888889,
"grad_norm": 3.197521924972534,
"learning_rate": 1.9074074074074075e-05,
"loss": 0.2451,
"step": 55
},
{
"epoch": 0.1414141414141414,
"grad_norm": 2.8418166637420654,
"learning_rate": 1.905723905723906e-05,
"loss": 0.2383,
"step": 56
},
{
"epoch": 0.14393939393939395,
"grad_norm": 2.393613338470459,
"learning_rate": 1.904040404040404e-05,
"loss": 0.1834,
"step": 57
},
{
"epoch": 0.14646464646464646,
"grad_norm": 2.1811683177948,
"learning_rate": 1.9023569023569024e-05,
"loss": 0.2564,
"step": 58
},
{
"epoch": 0.14898989898989898,
"grad_norm": 2.4366374015808105,
"learning_rate": 1.900673400673401e-05,
"loss": 0.2541,
"step": 59
},
{
"epoch": 0.15151515151515152,
"grad_norm": 5.706679344177246,
"learning_rate": 1.8989898989898993e-05,
"loss": 0.3285,
"step": 60
},
{
"epoch": 0.15404040404040403,
"grad_norm": 1.8341200351715088,
"learning_rate": 1.8973063973063974e-05,
"loss": 0.2692,
"step": 61
},
{
"epoch": 0.15656565656565657,
"grad_norm": 3.0101611614227295,
"learning_rate": 1.895622895622896e-05,
"loss": 0.2135,
"step": 62
},
{
"epoch": 0.1590909090909091,
"grad_norm": 3.3140006065368652,
"learning_rate": 1.8939393939393943e-05,
"loss": 0.2027,
"step": 63
},
{
"epoch": 0.16161616161616163,
"grad_norm": 2.118210554122925,
"learning_rate": 1.8922558922558924e-05,
"loss": 0.2213,
"step": 64
},
{
"epoch": 0.16414141414141414,
"grad_norm": 3.6016147136688232,
"learning_rate": 1.8905723905723908e-05,
"loss": 0.1682,
"step": 65
},
{
"epoch": 0.16666666666666666,
"grad_norm": 7.3510823249816895,
"learning_rate": 1.888888888888889e-05,
"loss": 0.369,
"step": 66
},
{
"epoch": 0.1691919191919192,
"grad_norm": 3.6129775047302246,
"learning_rate": 1.8872053872053873e-05,
"loss": 0.2257,
"step": 67
},
{
"epoch": 0.1717171717171717,
"grad_norm": 4.521103858947754,
"learning_rate": 1.8855218855218858e-05,
"loss": 0.1425,
"step": 68
},
{
"epoch": 0.17424242424242425,
"grad_norm": 1.941278100013733,
"learning_rate": 1.883838383838384e-05,
"loss": 0.2962,
"step": 69
},
{
"epoch": 0.17676767676767677,
"grad_norm": 4.856161594390869,
"learning_rate": 1.8821548821548823e-05,
"loss": 0.2071,
"step": 70
},
{
"epoch": 0.17929292929292928,
"grad_norm": 4.528213024139404,
"learning_rate": 1.8804713804713804e-05,
"loss": 0.2222,
"step": 71
},
{
"epoch": 0.18181818181818182,
"grad_norm": 6.646481037139893,
"learning_rate": 1.8787878787878792e-05,
"loss": 0.2942,
"step": 72
},
{
"epoch": 0.18434343434343434,
"grad_norm": 2.1316299438476562,
"learning_rate": 1.8771043771043773e-05,
"loss": 0.2697,
"step": 73
},
{
"epoch": 0.18686868686868688,
"grad_norm": 3.7682583332061768,
"learning_rate": 1.8754208754208757e-05,
"loss": 0.2575,
"step": 74
},
{
"epoch": 0.1893939393939394,
"grad_norm": 2.1818718910217285,
"learning_rate": 1.873737373737374e-05,
"loss": 0.1755,
"step": 75
},
{
"epoch": 0.1919191919191919,
"grad_norm": 5.337326526641846,
"learning_rate": 1.8720538720538723e-05,
"loss": 0.2238,
"step": 76
},
{
"epoch": 0.19444444444444445,
"grad_norm": 5.185172080993652,
"learning_rate": 1.8703703703703707e-05,
"loss": 0.14,
"step": 77
},
{
"epoch": 0.19696969696969696,
"grad_norm": 5.610733509063721,
"learning_rate": 1.8686868686868688e-05,
"loss": 0.2109,
"step": 78
},
{
"epoch": 0.1994949494949495,
"grad_norm": 3.34989333152771,
"learning_rate": 1.8670033670033672e-05,
"loss": 0.2358,
"step": 79
},
{
"epoch": 0.20202020202020202,
"grad_norm": 4.732699394226074,
"learning_rate": 1.8653198653198653e-05,
"loss": 0.2582,
"step": 80
},
{
"epoch": 0.20454545454545456,
"grad_norm": 3.595618963241577,
"learning_rate": 1.8636363636363638e-05,
"loss": 0.2499,
"step": 81
},
{
"epoch": 0.20707070707070707,
"grad_norm": 4.39829158782959,
"learning_rate": 1.8619528619528622e-05,
"loss": 0.1776,
"step": 82
},
{
"epoch": 0.20959595959595959,
"grad_norm": 5.79127836227417,
"learning_rate": 1.8602693602693603e-05,
"loss": 0.1329,
"step": 83
},
{
"epoch": 0.21212121212121213,
"grad_norm": 3.827282428741455,
"learning_rate": 1.8585858585858588e-05,
"loss": 0.2073,
"step": 84
},
{
"epoch": 0.21464646464646464,
"grad_norm": 6.159754753112793,
"learning_rate": 1.856902356902357e-05,
"loss": 0.1658,
"step": 85
},
{
"epoch": 0.21717171717171718,
"grad_norm": 9.290190696716309,
"learning_rate": 1.8552188552188556e-05,
"loss": 0.286,
"step": 86
},
{
"epoch": 0.2196969696969697,
"grad_norm": 5.264730930328369,
"learning_rate": 1.8535353535353537e-05,
"loss": 0.2504,
"step": 87
},
{
"epoch": 0.2222222222222222,
"grad_norm": 3.915583848953247,
"learning_rate": 1.851851851851852e-05,
"loss": 0.2535,
"step": 88
},
{
"epoch": 0.22474747474747475,
"grad_norm": 3.885434627532959,
"learning_rate": 1.8501683501683503e-05,
"loss": 0.1451,
"step": 89
},
{
"epoch": 0.22727272727272727,
"grad_norm": 3.5729010105133057,
"learning_rate": 1.8484848484848487e-05,
"loss": 0.1989,
"step": 90
},
{
"epoch": 0.2297979797979798,
"grad_norm": 2.3339507579803467,
"learning_rate": 1.846801346801347e-05,
"loss": 0.3191,
"step": 91
},
{
"epoch": 0.23232323232323232,
"grad_norm": 3.946099281311035,
"learning_rate": 1.8451178451178452e-05,
"loss": 0.2271,
"step": 92
},
{
"epoch": 0.23484848484848486,
"grad_norm": 5.328370571136475,
"learning_rate": 1.8434343434343437e-05,
"loss": 0.3326,
"step": 93
},
{
"epoch": 0.23737373737373738,
"grad_norm": 4.987793445587158,
"learning_rate": 1.8417508417508418e-05,
"loss": 0.2377,
"step": 94
},
{
"epoch": 0.2398989898989899,
"grad_norm": 3.6775288581848145,
"learning_rate": 1.8400673400673402e-05,
"loss": 0.2323,
"step": 95
},
{
"epoch": 0.24242424242424243,
"grad_norm": 3.444467782974243,
"learning_rate": 1.8383838383838387e-05,
"loss": 0.2712,
"step": 96
},
{
"epoch": 0.24494949494949494,
"grad_norm": 7.329760551452637,
"learning_rate": 1.8367003367003367e-05,
"loss": 0.3223,
"step": 97
},
{
"epoch": 0.2474747474747475,
"grad_norm": 3.329362154006958,
"learning_rate": 1.8350168350168352e-05,
"loss": 0.1859,
"step": 98
},
{
"epoch": 0.25,
"grad_norm": 2.950449228286743,
"learning_rate": 1.8333333333333333e-05,
"loss": 0.2032,
"step": 99
},
{
"epoch": 0.25252525252525254,
"grad_norm": 3.4235892295837402,
"learning_rate": 1.831649831649832e-05,
"loss": 0.1982,
"step": 100
},
{
"epoch": 0.255050505050505,
"grad_norm": 4.13006067276001,
"learning_rate": 1.82996632996633e-05,
"loss": 0.1787,
"step": 101
},
{
"epoch": 0.25757575757575757,
"grad_norm": 2.0153565406799316,
"learning_rate": 1.8282828282828286e-05,
"loss": 0.1408,
"step": 102
},
{
"epoch": 0.2601010101010101,
"grad_norm": 3.2294890880584717,
"learning_rate": 1.8265993265993267e-05,
"loss": 0.2113,
"step": 103
},
{
"epoch": 0.26262626262626265,
"grad_norm": 3.2181968688964844,
"learning_rate": 1.824915824915825e-05,
"loss": 0.1296,
"step": 104
},
{
"epoch": 0.26515151515151514,
"grad_norm": 1.6924734115600586,
"learning_rate": 1.8232323232323236e-05,
"loss": 0.1773,
"step": 105
},
{
"epoch": 0.2676767676767677,
"grad_norm": 5.491613864898682,
"learning_rate": 1.8215488215488217e-05,
"loss": 0.1511,
"step": 106
},
{
"epoch": 0.2702020202020202,
"grad_norm": 4.4867143630981445,
"learning_rate": 1.81986531986532e-05,
"loss": 0.1978,
"step": 107
},
{
"epoch": 0.2727272727272727,
"grad_norm": 1.801491379737854,
"learning_rate": 1.8181818181818182e-05,
"loss": 0.2535,
"step": 108
},
{
"epoch": 0.27525252525252525,
"grad_norm": 2.2414021492004395,
"learning_rate": 1.8164983164983166e-05,
"loss": 0.2129,
"step": 109
},
{
"epoch": 0.2777777777777778,
"grad_norm": 1.8164544105529785,
"learning_rate": 1.814814814814815e-05,
"loss": 0.1744,
"step": 110
},
{
"epoch": 0.2803030303030303,
"grad_norm": 1.4675378799438477,
"learning_rate": 1.8131313131313132e-05,
"loss": 0.2502,
"step": 111
},
{
"epoch": 0.2828282828282828,
"grad_norm": 2.9425742626190186,
"learning_rate": 1.8114478114478116e-05,
"loss": 0.1312,
"step": 112
},
{
"epoch": 0.28535353535353536,
"grad_norm": 2.8444998264312744,
"learning_rate": 1.8097643097643097e-05,
"loss": 0.233,
"step": 113
},
{
"epoch": 0.2878787878787879,
"grad_norm": 1.8977577686309814,
"learning_rate": 1.8080808080808085e-05,
"loss": 0.0896,
"step": 114
},
{
"epoch": 0.2904040404040404,
"grad_norm": 4.595700740814209,
"learning_rate": 1.8063973063973066e-05,
"loss": 0.3304,
"step": 115
},
{
"epoch": 0.29292929292929293,
"grad_norm": 2.2750136852264404,
"learning_rate": 1.804713804713805e-05,
"loss": 0.209,
"step": 116
},
{
"epoch": 0.29545454545454547,
"grad_norm": 2.0217509269714355,
"learning_rate": 1.803030303030303e-05,
"loss": 0.2202,
"step": 117
},
{
"epoch": 0.29797979797979796,
"grad_norm": 2.943140745162964,
"learning_rate": 1.8013468013468016e-05,
"loss": 0.2903,
"step": 118
},
{
"epoch": 0.3005050505050505,
"grad_norm": 2.4190146923065186,
"learning_rate": 1.7996632996633e-05,
"loss": 0.1384,
"step": 119
},
{
"epoch": 0.30303030303030304,
"grad_norm": 3.664355993270874,
"learning_rate": 1.797979797979798e-05,
"loss": 0.1866,
"step": 120
},
{
"epoch": 0.3055555555555556,
"grad_norm": 3.616316795349121,
"learning_rate": 1.7962962962962965e-05,
"loss": 0.2016,
"step": 121
},
{
"epoch": 0.30808080808080807,
"grad_norm": 6.439982891082764,
"learning_rate": 1.7946127946127946e-05,
"loss": 0.2699,
"step": 122
},
{
"epoch": 0.3106060606060606,
"grad_norm": 3.2625112533569336,
"learning_rate": 1.792929292929293e-05,
"loss": 0.242,
"step": 123
},
{
"epoch": 0.31313131313131315,
"grad_norm": 4.760579586029053,
"learning_rate": 1.7912457912457915e-05,
"loss": 0.1812,
"step": 124
},
{
"epoch": 0.31565656565656564,
"grad_norm": 5.375882625579834,
"learning_rate": 1.7895622895622896e-05,
"loss": 0.0892,
"step": 125
},
{
"epoch": 0.3181818181818182,
"grad_norm": 1.5627996921539307,
"learning_rate": 1.787878787878788e-05,
"loss": 0.1608,
"step": 126
},
{
"epoch": 0.3207070707070707,
"grad_norm": 2.0782926082611084,
"learning_rate": 1.786195286195286e-05,
"loss": 0.1384,
"step": 127
},
{
"epoch": 0.32323232323232326,
"grad_norm": 3.5221481323242188,
"learning_rate": 1.7845117845117846e-05,
"loss": 0.2595,
"step": 128
},
{
"epoch": 0.32575757575757575,
"grad_norm": 1.7717233896255493,
"learning_rate": 1.782828282828283e-05,
"loss": 0.1401,
"step": 129
},
{
"epoch": 0.3282828282828283,
"grad_norm": 3.81760311126709,
"learning_rate": 1.781144781144781e-05,
"loss": 0.1153,
"step": 130
},
{
"epoch": 0.33080808080808083,
"grad_norm": 4.479602813720703,
"learning_rate": 1.7794612794612796e-05,
"loss": 0.2117,
"step": 131
},
{
"epoch": 0.3333333333333333,
"grad_norm": 1.1932178735733032,
"learning_rate": 1.7777777777777777e-05,
"loss": 0.1669,
"step": 132
},
{
"epoch": 0.33585858585858586,
"grad_norm": 3.330796003341675,
"learning_rate": 1.7760942760942764e-05,
"loss": 0.152,
"step": 133
},
{
"epoch": 0.3383838383838384,
"grad_norm": 3.5781233310699463,
"learning_rate": 1.7744107744107745e-05,
"loss": 0.269,
"step": 134
},
{
"epoch": 0.3409090909090909,
"grad_norm": 2.489184617996216,
"learning_rate": 1.772727272727273e-05,
"loss": 0.1631,
"step": 135
},
{
"epoch": 0.3434343434343434,
"grad_norm": 5.023707389831543,
"learning_rate": 1.771043771043771e-05,
"loss": 0.1623,
"step": 136
},
{
"epoch": 0.34595959595959597,
"grad_norm": 1.8095295429229736,
"learning_rate": 1.7693602693602695e-05,
"loss": 0.1941,
"step": 137
},
{
"epoch": 0.3484848484848485,
"grad_norm": 5.773559093475342,
"learning_rate": 1.767676767676768e-05,
"loss": 0.2198,
"step": 138
},
{
"epoch": 0.351010101010101,
"grad_norm": 2.3348917961120605,
"learning_rate": 1.765993265993266e-05,
"loss": 0.0749,
"step": 139
},
{
"epoch": 0.35353535353535354,
"grad_norm": 4.8729023933410645,
"learning_rate": 1.7643097643097645e-05,
"loss": 0.2523,
"step": 140
},
{
"epoch": 0.3560606060606061,
"grad_norm": 2.1227433681488037,
"learning_rate": 1.7626262626262626e-05,
"loss": 0.1616,
"step": 141
},
{
"epoch": 0.35858585858585856,
"grad_norm": 4.208232402801514,
"learning_rate": 1.760942760942761e-05,
"loss": 0.1206,
"step": 142
},
{
"epoch": 0.3611111111111111,
"grad_norm": 2.2808191776275635,
"learning_rate": 1.7592592592592595e-05,
"loss": 0.1413,
"step": 143
},
{
"epoch": 0.36363636363636365,
"grad_norm": 2.797044515609741,
"learning_rate": 1.7575757575757576e-05,
"loss": 0.1553,
"step": 144
},
{
"epoch": 0.3661616161616162,
"grad_norm": 2.0235748291015625,
"learning_rate": 1.755892255892256e-05,
"loss": 0.1748,
"step": 145
},
{
"epoch": 0.3686868686868687,
"grad_norm": 1.668614149093628,
"learning_rate": 1.754208754208754e-05,
"loss": 0.1397,
"step": 146
},
{
"epoch": 0.3712121212121212,
"grad_norm": 2.048588991165161,
"learning_rate": 1.752525252525253e-05,
"loss": 0.1636,
"step": 147
},
{
"epoch": 0.37373737373737376,
"grad_norm": 3.2544357776641846,
"learning_rate": 1.750841750841751e-05,
"loss": 0.1906,
"step": 148
},
{
"epoch": 0.37626262626262624,
"grad_norm": 2.5983431339263916,
"learning_rate": 1.7491582491582494e-05,
"loss": 0.1833,
"step": 149
},
{
"epoch": 0.3787878787878788,
"grad_norm": 3.579721689224243,
"learning_rate": 1.7474747474747475e-05,
"loss": 0.2445,
"step": 150
},
{
"epoch": 0.3813131313131313,
"grad_norm": 3.889470338821411,
"learning_rate": 1.745791245791246e-05,
"loss": 0.2457,
"step": 151
},
{
"epoch": 0.3838383838383838,
"grad_norm": 1.612406611442566,
"learning_rate": 1.7441077441077444e-05,
"loss": 0.1468,
"step": 152
},
{
"epoch": 0.38636363636363635,
"grad_norm": 3.572401285171509,
"learning_rate": 1.7424242424242425e-05,
"loss": 0.1659,
"step": 153
},
{
"epoch": 0.3888888888888889,
"grad_norm": 2.7137911319732666,
"learning_rate": 1.740740740740741e-05,
"loss": 0.2389,
"step": 154
},
{
"epoch": 0.39141414141414144,
"grad_norm": 2.293943166732788,
"learning_rate": 1.739057239057239e-05,
"loss": 0.2188,
"step": 155
},
{
"epoch": 0.3939393939393939,
"grad_norm": 5.641902923583984,
"learning_rate": 1.7373737373737375e-05,
"loss": 0.1843,
"step": 156
},
{
"epoch": 0.39646464646464646,
"grad_norm": 4.039111137390137,
"learning_rate": 1.735690235690236e-05,
"loss": 0.2627,
"step": 157
},
{
"epoch": 0.398989898989899,
"grad_norm": 2.942754030227661,
"learning_rate": 1.734006734006734e-05,
"loss": 0.2299,
"step": 158
},
{
"epoch": 0.4015151515151515,
"grad_norm": 3.7655181884765625,
"learning_rate": 1.7323232323232324e-05,
"loss": 0.2167,
"step": 159
},
{
"epoch": 0.40404040404040403,
"grad_norm": 2.5062334537506104,
"learning_rate": 1.7306397306397305e-05,
"loss": 0.164,
"step": 160
},
{
"epoch": 0.4065656565656566,
"grad_norm": 2.07106614112854,
"learning_rate": 1.7289562289562293e-05,
"loss": 0.1956,
"step": 161
},
{
"epoch": 0.4090909090909091,
"grad_norm": 6.250090599060059,
"learning_rate": 1.7272727272727274e-05,
"loss": 0.1399,
"step": 162
},
{
"epoch": 0.4116161616161616,
"grad_norm": 2.017141819000244,
"learning_rate": 1.725589225589226e-05,
"loss": 0.2036,
"step": 163
},
{
"epoch": 0.41414141414141414,
"grad_norm": 3.3339602947235107,
"learning_rate": 1.723905723905724e-05,
"loss": 0.1596,
"step": 164
},
{
"epoch": 0.4166666666666667,
"grad_norm": 3.8334908485412598,
"learning_rate": 1.7222222222222224e-05,
"loss": 0.1267,
"step": 165
},
{
"epoch": 0.41919191919191917,
"grad_norm": 2.0751090049743652,
"learning_rate": 1.7205387205387208e-05,
"loss": 0.2058,
"step": 166
},
{
"epoch": 0.4217171717171717,
"grad_norm": 3.0513789653778076,
"learning_rate": 1.718855218855219e-05,
"loss": 0.091,
"step": 167
},
{
"epoch": 0.42424242424242425,
"grad_norm": 5.1696648597717285,
"learning_rate": 1.7171717171717173e-05,
"loss": 0.2313,
"step": 168
},
{
"epoch": 0.42676767676767674,
"grad_norm": 3.9072530269622803,
"learning_rate": 1.7154882154882154e-05,
"loss": 0.2648,
"step": 169
},
{
"epoch": 0.4292929292929293,
"grad_norm": 4.278628349304199,
"learning_rate": 1.713804713804714e-05,
"loss": 0.1539,
"step": 170
},
{
"epoch": 0.4318181818181818,
"grad_norm": 1.6870406866073608,
"learning_rate": 1.7121212121212123e-05,
"loss": 0.1714,
"step": 171
},
{
"epoch": 0.43434343434343436,
"grad_norm": 1.6782217025756836,
"learning_rate": 1.7104377104377104e-05,
"loss": 0.1269,
"step": 172
},
{
"epoch": 0.43686868686868685,
"grad_norm": 5.854135513305664,
"learning_rate": 1.708754208754209e-05,
"loss": 0.1568,
"step": 173
},
{
"epoch": 0.4393939393939394,
"grad_norm": 3.947122097015381,
"learning_rate": 1.707070707070707e-05,
"loss": 0.2051,
"step": 174
},
{
"epoch": 0.44191919191919193,
"grad_norm": 2.085911273956299,
"learning_rate": 1.7053872053872057e-05,
"loss": 0.1101,
"step": 175
},
{
"epoch": 0.4444444444444444,
"grad_norm": 4.145143985748291,
"learning_rate": 1.7037037037037038e-05,
"loss": 0.1773,
"step": 176
},
{
"epoch": 0.44696969696969696,
"grad_norm": 4.920554161071777,
"learning_rate": 1.7020202020202023e-05,
"loss": 0.2379,
"step": 177
},
{
"epoch": 0.4494949494949495,
"grad_norm": 2.8730502128601074,
"learning_rate": 1.7003367003367004e-05,
"loss": 0.1019,
"step": 178
},
{
"epoch": 0.45202020202020204,
"grad_norm": 1.0413464307785034,
"learning_rate": 1.6986531986531988e-05,
"loss": 0.1205,
"step": 179
},
{
"epoch": 0.45454545454545453,
"grad_norm": 2.591437816619873,
"learning_rate": 1.6969696969696972e-05,
"loss": 0.1173,
"step": 180
},
{
"epoch": 0.45707070707070707,
"grad_norm": 4.737552165985107,
"learning_rate": 1.6952861952861953e-05,
"loss": 0.4692,
"step": 181
},
{
"epoch": 0.4595959595959596,
"grad_norm": 5.872066974639893,
"learning_rate": 1.6936026936026938e-05,
"loss": 0.178,
"step": 182
},
{
"epoch": 0.4621212121212121,
"grad_norm": 4.527502536773682,
"learning_rate": 1.691919191919192e-05,
"loss": 0.2855,
"step": 183
},
{
"epoch": 0.46464646464646464,
"grad_norm": 3.166898488998413,
"learning_rate": 1.6902356902356903e-05,
"loss": 0.201,
"step": 184
},
{
"epoch": 0.4671717171717172,
"grad_norm": 8.388322830200195,
"learning_rate": 1.6885521885521888e-05,
"loss": 0.2455,
"step": 185
},
{
"epoch": 0.4696969696969697,
"grad_norm": 3.2028310298919678,
"learning_rate": 1.686868686868687e-05,
"loss": 0.2577,
"step": 186
},
{
"epoch": 0.4722222222222222,
"grad_norm": 3.2072689533233643,
"learning_rate": 1.6851851851851853e-05,
"loss": 0.1123,
"step": 187
},
{
"epoch": 0.47474747474747475,
"grad_norm": 2.532289743423462,
"learning_rate": 1.6835016835016837e-05,
"loss": 0.2389,
"step": 188
},
{
"epoch": 0.4772727272727273,
"grad_norm": 3.049967050552368,
"learning_rate": 1.681818181818182e-05,
"loss": 0.1156,
"step": 189
},
{
"epoch": 0.4797979797979798,
"grad_norm": 2.940448760986328,
"learning_rate": 1.6801346801346803e-05,
"loss": 0.149,
"step": 190
},
{
"epoch": 0.4823232323232323,
"grad_norm": 2.2545042037963867,
"learning_rate": 1.6784511784511787e-05,
"loss": 0.1751,
"step": 191
},
{
"epoch": 0.48484848484848486,
"grad_norm": 2.66123628616333,
"learning_rate": 1.6767676767676768e-05,
"loss": 0.1685,
"step": 192
},
{
"epoch": 0.48737373737373735,
"grad_norm": 2.0476951599121094,
"learning_rate": 1.6750841750841752e-05,
"loss": 0.1705,
"step": 193
},
{
"epoch": 0.4898989898989899,
"grad_norm": 2.9459142684936523,
"learning_rate": 1.6734006734006737e-05,
"loss": 0.1873,
"step": 194
},
{
"epoch": 0.49242424242424243,
"grad_norm": 3.9844117164611816,
"learning_rate": 1.6717171717171718e-05,
"loss": 0.1531,
"step": 195
},
{
"epoch": 0.494949494949495,
"grad_norm": 6.765873908996582,
"learning_rate": 1.6700336700336702e-05,
"loss": 0.164,
"step": 196
},
{
"epoch": 0.49747474747474746,
"grad_norm": 2.809617757797241,
"learning_rate": 1.6683501683501683e-05,
"loss": 0.144,
"step": 197
},
{
"epoch": 0.5,
"grad_norm": 6.575211524963379,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.2556,
"step": 198
},
{
"epoch": 0.5025252525252525,
"grad_norm": 4.31246280670166,
"learning_rate": 1.6649831649831652e-05,
"loss": 0.1998,
"step": 199
},
{
"epoch": 0.5050505050505051,
"grad_norm": 3.3406026363372803,
"learning_rate": 1.6632996632996633e-05,
"loss": 0.1341,
"step": 200
},
{
"epoch": 0.5075757575757576,
"grad_norm": 2.613698720932007,
"learning_rate": 1.6616161616161617e-05,
"loss": 0.1245,
"step": 201
},
{
"epoch": 0.51010101010101,
"grad_norm": 4.394161224365234,
"learning_rate": 1.65993265993266e-05,
"loss": 0.1683,
"step": 202
},
{
"epoch": 0.5126262626262627,
"grad_norm": 1.0652117729187012,
"learning_rate": 1.6582491582491586e-05,
"loss": 0.1133,
"step": 203
},
{
"epoch": 0.5151515151515151,
"grad_norm": 2.15743350982666,
"learning_rate": 1.6565656565656567e-05,
"loss": 0.1525,
"step": 204
},
{
"epoch": 0.5176767676767676,
"grad_norm": 1.6530711650848389,
"learning_rate": 1.654882154882155e-05,
"loss": 0.1417,
"step": 205
},
{
"epoch": 0.5202020202020202,
"grad_norm": 6.711721897125244,
"learning_rate": 1.6531986531986532e-05,
"loss": 0.2334,
"step": 206
},
{
"epoch": 0.5227272727272727,
"grad_norm": 1.627074122428894,
"learning_rate": 1.6515151515151517e-05,
"loss": 0.1847,
"step": 207
},
{
"epoch": 0.5252525252525253,
"grad_norm": 1.3665039539337158,
"learning_rate": 1.64983164983165e-05,
"loss": 0.0733,
"step": 208
},
{
"epoch": 0.5277777777777778,
"grad_norm": 1.800305724143982,
"learning_rate": 1.6481481481481482e-05,
"loss": 0.0821,
"step": 209
},
{
"epoch": 0.5303030303030303,
"grad_norm": 2.238971710205078,
"learning_rate": 1.6464646464646466e-05,
"loss": 0.162,
"step": 210
},
{
"epoch": 0.5328282828282829,
"grad_norm": 3.941727638244629,
"learning_rate": 1.6447811447811447e-05,
"loss": 0.171,
"step": 211
},
{
"epoch": 0.5353535353535354,
"grad_norm": 2.0416862964630127,
"learning_rate": 1.6430976430976432e-05,
"loss": 0.121,
"step": 212
},
{
"epoch": 0.5378787878787878,
"grad_norm": 2.75635027885437,
"learning_rate": 1.6414141414141416e-05,
"loss": 0.1973,
"step": 213
},
{
"epoch": 0.5404040404040404,
"grad_norm": 5.226922512054443,
"learning_rate": 1.6397306397306397e-05,
"loss": 0.111,
"step": 214
},
{
"epoch": 0.5429292929292929,
"grad_norm": 6.741361618041992,
"learning_rate": 1.638047138047138e-05,
"loss": 0.0877,
"step": 215
},
{
"epoch": 0.5454545454545454,
"grad_norm": 2.957056999206543,
"learning_rate": 1.6363636363636366e-05,
"loss": 0.0627,
"step": 216
},
{
"epoch": 0.547979797979798,
"grad_norm": 3.5542659759521484,
"learning_rate": 1.634680134680135e-05,
"loss": 0.2518,
"step": 217
},
{
"epoch": 0.5505050505050505,
"grad_norm": 8.325895309448242,
"learning_rate": 1.632996632996633e-05,
"loss": 0.1792,
"step": 218
},
{
"epoch": 0.553030303030303,
"grad_norm": 4.116200923919678,
"learning_rate": 1.6313131313131316e-05,
"loss": 0.0821,
"step": 219
},
{
"epoch": 0.5555555555555556,
"grad_norm": 2.4049417972564697,
"learning_rate": 1.6296296296296297e-05,
"loss": 0.1445,
"step": 220
},
{
"epoch": 0.5580808080808081,
"grad_norm": 2.702348470687866,
"learning_rate": 1.627946127946128e-05,
"loss": 0.2077,
"step": 221
},
{
"epoch": 0.5606060606060606,
"grad_norm": 4.276516437530518,
"learning_rate": 1.6262626262626265e-05,
"loss": 0.189,
"step": 222
},
{
"epoch": 0.5631313131313131,
"grad_norm": 2.212054491043091,
"learning_rate": 1.6245791245791246e-05,
"loss": 0.2111,
"step": 223
},
{
"epoch": 0.5656565656565656,
"grad_norm": 2.9544410705566406,
"learning_rate": 1.622895622895623e-05,
"loss": 0.1649,
"step": 224
},
{
"epoch": 0.5681818181818182,
"grad_norm": 3.0044991970062256,
"learning_rate": 1.6212121212121212e-05,
"loss": 0.1728,
"step": 225
},
{
"epoch": 0.5707070707070707,
"grad_norm": 3.5259811878204346,
"learning_rate": 1.6195286195286196e-05,
"loss": 0.1621,
"step": 226
},
{
"epoch": 0.5732323232323232,
"grad_norm": 3.774447441101074,
"learning_rate": 1.617845117845118e-05,
"loss": 0.2288,
"step": 227
},
{
"epoch": 0.5757575757575758,
"grad_norm": 2.975698232650757,
"learning_rate": 1.616161616161616e-05,
"loss": 0.1691,
"step": 228
},
{
"epoch": 0.5782828282828283,
"grad_norm": 4.2801713943481445,
"learning_rate": 1.6144781144781146e-05,
"loss": 0.1332,
"step": 229
},
{
"epoch": 0.5808080808080808,
"grad_norm": 4.899673938751221,
"learning_rate": 1.612794612794613e-05,
"loss": 0.1472,
"step": 230
},
{
"epoch": 0.5833333333333334,
"grad_norm": 5.345510482788086,
"learning_rate": 1.6111111111111115e-05,
"loss": 0.2117,
"step": 231
},
{
"epoch": 0.5858585858585859,
"grad_norm": 3.8797693252563477,
"learning_rate": 1.6094276094276096e-05,
"loss": 0.2047,
"step": 232
},
{
"epoch": 0.5883838383838383,
"grad_norm": 6.221108913421631,
"learning_rate": 1.607744107744108e-05,
"loss": 0.1221,
"step": 233
},
{
"epoch": 0.5909090909090909,
"grad_norm": 3.437472343444824,
"learning_rate": 1.606060606060606e-05,
"loss": 0.1748,
"step": 234
},
{
"epoch": 0.5934343434343434,
"grad_norm": 6.737703323364258,
"learning_rate": 1.6043771043771045e-05,
"loss": 0.1095,
"step": 235
},
{
"epoch": 0.5959595959595959,
"grad_norm": 1.2895629405975342,
"learning_rate": 1.602693602693603e-05,
"loss": 0.0798,
"step": 236
},
{
"epoch": 0.5984848484848485,
"grad_norm": 3.281799554824829,
"learning_rate": 1.601010101010101e-05,
"loss": 0.2223,
"step": 237
},
{
"epoch": 0.601010101010101,
"grad_norm": 3.6054065227508545,
"learning_rate": 1.5993265993265995e-05,
"loss": 0.1802,
"step": 238
},
{
"epoch": 0.6035353535353535,
"grad_norm": 2.032210350036621,
"learning_rate": 1.597643097643098e-05,
"loss": 0.1613,
"step": 239
},
{
"epoch": 0.6060606060606061,
"grad_norm": 3.515641212463379,
"learning_rate": 1.595959595959596e-05,
"loss": 0.1694,
"step": 240
},
{
"epoch": 0.6085858585858586,
"grad_norm": 3.1133809089660645,
"learning_rate": 1.5942760942760945e-05,
"loss": 0.1114,
"step": 241
},
{
"epoch": 0.6111111111111112,
"grad_norm": 3.401221752166748,
"learning_rate": 1.5925925925925926e-05,
"loss": 0.1692,
"step": 242
},
{
"epoch": 0.6136363636363636,
"grad_norm": 1.9235018491744995,
"learning_rate": 1.590909090909091e-05,
"loss": 0.2513,
"step": 243
},
{
"epoch": 0.6161616161616161,
"grad_norm": 2.6812822818756104,
"learning_rate": 1.5892255892255895e-05,
"loss": 0.1857,
"step": 244
},
{
"epoch": 0.6186868686868687,
"grad_norm": 3.470087766647339,
"learning_rate": 1.5875420875420876e-05,
"loss": 0.1377,
"step": 245
},
{
"epoch": 0.6212121212121212,
"grad_norm": 2.309100866317749,
"learning_rate": 1.585858585858586e-05,
"loss": 0.22,
"step": 246
},
{
"epoch": 0.6237373737373737,
"grad_norm": 5.392738342285156,
"learning_rate": 1.584175084175084e-05,
"loss": 0.1767,
"step": 247
},
{
"epoch": 0.6262626262626263,
"grad_norm": 3.751511573791504,
"learning_rate": 1.5824915824915825e-05,
"loss": 0.1504,
"step": 248
},
{
"epoch": 0.6287878787878788,
"grad_norm": 1.9343714714050293,
"learning_rate": 1.580808080808081e-05,
"loss": 0.2363,
"step": 249
},
{
"epoch": 0.6313131313131313,
"grad_norm": 3.65728759765625,
"learning_rate": 1.5791245791245794e-05,
"loss": 0.3138,
"step": 250
},
{
"epoch": 0.6338383838383839,
"grad_norm": 4.637652397155762,
"learning_rate": 1.5774410774410775e-05,
"loss": 0.1518,
"step": 251
},
{
"epoch": 0.6363636363636364,
"grad_norm": 2.6128430366516113,
"learning_rate": 1.575757575757576e-05,
"loss": 0.137,
"step": 252
},
{
"epoch": 0.6388888888888888,
"grad_norm": 2.5993456840515137,
"learning_rate": 1.5740740740740744e-05,
"loss": 0.2126,
"step": 253
},
{
"epoch": 0.6414141414141414,
"grad_norm": 2.630402088165283,
"learning_rate": 1.5723905723905725e-05,
"loss": 0.1712,
"step": 254
},
{
"epoch": 0.6439393939393939,
"grad_norm": 3.6941192150115967,
"learning_rate": 1.570707070707071e-05,
"loss": 0.251,
"step": 255
},
{
"epoch": 0.6464646464646465,
"grad_norm": 3.1765594482421875,
"learning_rate": 1.569023569023569e-05,
"loss": 0.2738,
"step": 256
},
{
"epoch": 0.648989898989899,
"grad_norm": 5.44793701171875,
"learning_rate": 1.5673400673400674e-05,
"loss": 0.1806,
"step": 257
},
{
"epoch": 0.6515151515151515,
"grad_norm": 2.671917676925659,
"learning_rate": 1.565656565656566e-05,
"loss": 0.2302,
"step": 258
},
{
"epoch": 0.6540404040404041,
"grad_norm": 3.816720485687256,
"learning_rate": 1.563973063973064e-05,
"loss": 0.2166,
"step": 259
},
{
"epoch": 0.6565656565656566,
"grad_norm": 4.604842662811279,
"learning_rate": 1.5622895622895624e-05,
"loss": 0.1936,
"step": 260
},
{
"epoch": 0.6590909090909091,
"grad_norm": 3.8842062950134277,
"learning_rate": 1.5606060606060605e-05,
"loss": 0.1321,
"step": 261
},
{
"epoch": 0.6616161616161617,
"grad_norm": 4.19383430480957,
"learning_rate": 1.558922558922559e-05,
"loss": 0.2093,
"step": 262
},
{
"epoch": 0.6641414141414141,
"grad_norm": 6.01501989364624,
"learning_rate": 1.5572390572390574e-05,
"loss": 0.281,
"step": 263
},
{
"epoch": 0.6666666666666666,
"grad_norm": 3.173448324203491,
"learning_rate": 1.555555555555556e-05,
"loss": 0.2289,
"step": 264
},
{
"epoch": 0.6691919191919192,
"grad_norm": 3.035527229309082,
"learning_rate": 1.553872053872054e-05,
"loss": 0.2046,
"step": 265
},
{
"epoch": 0.6717171717171717,
"grad_norm": 4.2569684982299805,
"learning_rate": 1.5521885521885524e-05,
"loss": 0.3241,
"step": 266
},
{
"epoch": 0.6742424242424242,
"grad_norm": 4.195226669311523,
"learning_rate": 1.5505050505050508e-05,
"loss": 0.1396,
"step": 267
},
{
"epoch": 0.6767676767676768,
"grad_norm": 1.8019922971725464,
"learning_rate": 1.548821548821549e-05,
"loss": 0.1341,
"step": 268
},
{
"epoch": 0.6792929292929293,
"grad_norm": 2.006047248840332,
"learning_rate": 1.5471380471380473e-05,
"loss": 0.2256,
"step": 269
},
{
"epoch": 0.6818181818181818,
"grad_norm": 2.592977523803711,
"learning_rate": 1.5454545454545454e-05,
"loss": 0.1794,
"step": 270
},
{
"epoch": 0.6843434343434344,
"grad_norm": 1.8588799238204956,
"learning_rate": 1.543771043771044e-05,
"loss": 0.1279,
"step": 271
},
{
"epoch": 0.6868686868686869,
"grad_norm": 2.6189968585968018,
"learning_rate": 1.5420875420875423e-05,
"loss": 0.0616,
"step": 272
},
{
"epoch": 0.6893939393939394,
"grad_norm": 1.683362603187561,
"learning_rate": 1.5404040404040404e-05,
"loss": 0.0476,
"step": 273
},
{
"epoch": 0.6919191919191919,
"grad_norm": 2.88405179977417,
"learning_rate": 1.538720538720539e-05,
"loss": 0.1016,
"step": 274
},
{
"epoch": 0.6944444444444444,
"grad_norm": 1.6002604961395264,
"learning_rate": 1.537037037037037e-05,
"loss": 0.1343,
"step": 275
},
{
"epoch": 0.696969696969697,
"grad_norm": 1.0753880739212036,
"learning_rate": 1.5353535353535354e-05,
"loss": 0.1,
"step": 276
},
{
"epoch": 0.6994949494949495,
"grad_norm": 3.1269478797912598,
"learning_rate": 1.5336700336700338e-05,
"loss": 0.136,
"step": 277
},
{
"epoch": 0.702020202020202,
"grad_norm": 2.584567070007324,
"learning_rate": 1.5319865319865323e-05,
"loss": 0.2408,
"step": 278
},
{
"epoch": 0.7045454545454546,
"grad_norm": 3.7829692363739014,
"learning_rate": 1.5303030303030304e-05,
"loss": 0.1797,
"step": 279
},
{
"epoch": 0.7070707070707071,
"grad_norm": 1.9160706996917725,
"learning_rate": 1.5286195286195288e-05,
"loss": 0.1379,
"step": 280
},
{
"epoch": 0.7095959595959596,
"grad_norm": 1.7192413806915283,
"learning_rate": 1.5269360269360272e-05,
"loss": 0.0992,
"step": 281
},
{
"epoch": 0.7121212121212122,
"grad_norm": 1.8255947828292847,
"learning_rate": 1.5252525252525255e-05,
"loss": 0.1683,
"step": 282
},
{
"epoch": 0.7146464646464646,
"grad_norm": 1.4913876056671143,
"learning_rate": 1.5235690235690238e-05,
"loss": 0.1559,
"step": 283
},
{
"epoch": 0.7171717171717171,
"grad_norm": 1.4210553169250488,
"learning_rate": 1.521885521885522e-05,
"loss": 0.0947,
"step": 284
},
{
"epoch": 0.7196969696969697,
"grad_norm": 2.0691561698913574,
"learning_rate": 1.5202020202020203e-05,
"loss": 0.1111,
"step": 285
},
{
"epoch": 0.7222222222222222,
"grad_norm": 1.425347089767456,
"learning_rate": 1.5185185185185187e-05,
"loss": 0.1305,
"step": 286
},
{
"epoch": 0.7247474747474747,
"grad_norm": 2.620968818664551,
"learning_rate": 1.516835016835017e-05,
"loss": 0.1324,
"step": 287
},
{
"epoch": 0.7272727272727273,
"grad_norm": 1.2153469324111938,
"learning_rate": 1.5151515151515153e-05,
"loss": 0.1875,
"step": 288
},
{
"epoch": 0.7297979797979798,
"grad_norm": 2.5912091732025146,
"learning_rate": 1.5134680134680136e-05,
"loss": 0.1125,
"step": 289
},
{
"epoch": 0.7323232323232324,
"grad_norm": 3.2707126140594482,
"learning_rate": 1.5117845117845118e-05,
"loss": 0.175,
"step": 290
},
{
"epoch": 0.7348484848484849,
"grad_norm": 2.4020352363586426,
"learning_rate": 1.5101010101010103e-05,
"loss": 0.1203,
"step": 291
},
{
"epoch": 0.7373737373737373,
"grad_norm": 4.660423278808594,
"learning_rate": 1.5084175084175085e-05,
"loss": 0.2577,
"step": 292
},
{
"epoch": 0.73989898989899,
"grad_norm": 5.82301139831543,
"learning_rate": 1.5067340067340068e-05,
"loss": 0.1374,
"step": 293
},
{
"epoch": 0.7424242424242424,
"grad_norm": 1.974256992340088,
"learning_rate": 1.505050505050505e-05,
"loss": 0.1145,
"step": 294
},
{
"epoch": 0.7449494949494949,
"grad_norm": 2.0848591327667236,
"learning_rate": 1.5033670033670035e-05,
"loss": 0.1168,
"step": 295
},
{
"epoch": 0.7474747474747475,
"grad_norm": 2.9144437313079834,
"learning_rate": 1.5016835016835018e-05,
"loss": 0.2312,
"step": 296
},
{
"epoch": 0.75,
"grad_norm": 4.225992202758789,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.1782,
"step": 297
},
{
"epoch": 0.7525252525252525,
"grad_norm": 4.229215145111084,
"learning_rate": 1.4983164983164985e-05,
"loss": 0.1512,
"step": 298
},
{
"epoch": 0.7550505050505051,
"grad_norm": 2.8152060508728027,
"learning_rate": 1.4966329966329967e-05,
"loss": 0.1511,
"step": 299
},
{
"epoch": 0.7575757575757576,
"grad_norm": 3.588789224624634,
"learning_rate": 1.4949494949494952e-05,
"loss": 0.2879,
"step": 300
},
{
"epoch": 0.76010101010101,
"grad_norm": 3.0448029041290283,
"learning_rate": 1.4932659932659934e-05,
"loss": 0.1764,
"step": 301
},
{
"epoch": 0.7626262626262627,
"grad_norm": 1.7650105953216553,
"learning_rate": 1.4915824915824917e-05,
"loss": 0.1692,
"step": 302
},
{
"epoch": 0.7651515151515151,
"grad_norm": 1.2958582639694214,
"learning_rate": 1.48989898989899e-05,
"loss": 0.1425,
"step": 303
},
{
"epoch": 0.7676767676767676,
"grad_norm": 2.6900827884674072,
"learning_rate": 1.4882154882154884e-05,
"loss": 0.2965,
"step": 304
},
{
"epoch": 0.7702020202020202,
"grad_norm": 5.048685550689697,
"learning_rate": 1.4865319865319867e-05,
"loss": 0.2404,
"step": 305
},
{
"epoch": 0.7727272727272727,
"grad_norm": 3.7027716636657715,
"learning_rate": 1.484848484848485e-05,
"loss": 0.148,
"step": 306
},
{
"epoch": 0.7752525252525253,
"grad_norm": 4.220457553863525,
"learning_rate": 1.4831649831649832e-05,
"loss": 0.1583,
"step": 307
},
{
"epoch": 0.7777777777777778,
"grad_norm": 3.0033810138702393,
"learning_rate": 1.4814814814814815e-05,
"loss": 0.2244,
"step": 308
},
{
"epoch": 0.7803030303030303,
"grad_norm": 4.2939043045043945,
"learning_rate": 1.47979797979798e-05,
"loss": 0.1701,
"step": 309
},
{
"epoch": 0.7828282828282829,
"grad_norm": 2.8431057929992676,
"learning_rate": 1.4781144781144782e-05,
"loss": 0.1284,
"step": 310
},
{
"epoch": 0.7853535353535354,
"grad_norm": 1.8190436363220215,
"learning_rate": 1.4764309764309765e-05,
"loss": 0.22,
"step": 311
},
{
"epoch": 0.7878787878787878,
"grad_norm": 4.867546558380127,
"learning_rate": 1.4747474747474747e-05,
"loss": 0.308,
"step": 312
},
{
"epoch": 0.7904040404040404,
"grad_norm": 2.632307767868042,
"learning_rate": 1.473063973063973e-05,
"loss": 0.1137,
"step": 313
},
{
"epoch": 0.7929292929292929,
"grad_norm": 5.3593339920043945,
"learning_rate": 1.4713804713804716e-05,
"loss": 0.1462,
"step": 314
},
{
"epoch": 0.7954545454545454,
"grad_norm": 1.6120672225952148,
"learning_rate": 1.4696969696969699e-05,
"loss": 0.1291,
"step": 315
},
{
"epoch": 0.797979797979798,
"grad_norm": 2.3134396076202393,
"learning_rate": 1.4680134680134681e-05,
"loss": 0.2119,
"step": 316
},
{
"epoch": 0.8005050505050505,
"grad_norm": 3.2344558238983154,
"learning_rate": 1.4663299663299664e-05,
"loss": 0.1588,
"step": 317
},
{
"epoch": 0.803030303030303,
"grad_norm": 3.4733057022094727,
"learning_rate": 1.4646464646464649e-05,
"loss": 0.1466,
"step": 318
},
{
"epoch": 0.8055555555555556,
"grad_norm": 3.2476141452789307,
"learning_rate": 1.4629629629629631e-05,
"loss": 0.2466,
"step": 319
},
{
"epoch": 0.8080808080808081,
"grad_norm": 5.198851108551025,
"learning_rate": 1.4612794612794614e-05,
"loss": 0.2239,
"step": 320
},
{
"epoch": 0.8106060606060606,
"grad_norm": 2.9820196628570557,
"learning_rate": 1.4595959595959597e-05,
"loss": 0.2467,
"step": 321
},
{
"epoch": 0.8131313131313131,
"grad_norm": 3.2972326278686523,
"learning_rate": 1.457912457912458e-05,
"loss": 0.1642,
"step": 322
},
{
"epoch": 0.8156565656565656,
"grad_norm": 2.3504161834716797,
"learning_rate": 1.4562289562289564e-05,
"loss": 0.0984,
"step": 323
},
{
"epoch": 0.8181818181818182,
"grad_norm": 4.491511821746826,
"learning_rate": 1.4545454545454546e-05,
"loss": 0.2576,
"step": 324
},
{
"epoch": 0.8207070707070707,
"grad_norm": 3.1529364585876465,
"learning_rate": 1.4528619528619529e-05,
"loss": 0.1666,
"step": 325
},
{
"epoch": 0.8232323232323232,
"grad_norm": 3.350400447845459,
"learning_rate": 1.4511784511784512e-05,
"loss": 0.2227,
"step": 326
},
{
"epoch": 0.8257575757575758,
"grad_norm": 9.460111618041992,
"learning_rate": 1.4494949494949494e-05,
"loss": 0.2407,
"step": 327
},
{
"epoch": 0.8282828282828283,
"grad_norm": 2.648740768432617,
"learning_rate": 1.447811447811448e-05,
"loss": 0.2017,
"step": 328
},
{
"epoch": 0.8308080808080808,
"grad_norm": 2.6164615154266357,
"learning_rate": 1.4461279461279463e-05,
"loss": 0.1981,
"step": 329
},
{
"epoch": 0.8333333333333334,
"grad_norm": 8.09026050567627,
"learning_rate": 1.4444444444444446e-05,
"loss": 0.1033,
"step": 330
},
{
"epoch": 0.8358585858585859,
"grad_norm": 2.6296372413635254,
"learning_rate": 1.4427609427609428e-05,
"loss": 0.1782,
"step": 331
},
{
"epoch": 0.8383838383838383,
"grad_norm": 2.2566778659820557,
"learning_rate": 1.4410774410774413e-05,
"loss": 0.1623,
"step": 332
},
{
"epoch": 0.8409090909090909,
"grad_norm": 2.2079505920410156,
"learning_rate": 1.4393939393939396e-05,
"loss": 0.1723,
"step": 333
},
{
"epoch": 0.8434343434343434,
"grad_norm": 1.1799554824829102,
"learning_rate": 1.4377104377104378e-05,
"loss": 0.1547,
"step": 334
},
{
"epoch": 0.8459595959595959,
"grad_norm": 1.7930541038513184,
"learning_rate": 1.4360269360269361e-05,
"loss": 0.0783,
"step": 335
},
{
"epoch": 0.8484848484848485,
"grad_norm": 1.4967056512832642,
"learning_rate": 1.4343434343434344e-05,
"loss": 0.1916,
"step": 336
},
{
"epoch": 0.851010101010101,
"grad_norm": 4.922051906585693,
"learning_rate": 1.4326599326599328e-05,
"loss": 0.1304,
"step": 337
},
{
"epoch": 0.8535353535353535,
"grad_norm": 2.2897162437438965,
"learning_rate": 1.430976430976431e-05,
"loss": 0.2447,
"step": 338
},
{
"epoch": 0.8560606060606061,
"grad_norm": 2.769693613052368,
"learning_rate": 1.4292929292929293e-05,
"loss": 0.1017,
"step": 339
},
{
"epoch": 0.8585858585858586,
"grad_norm": 1.7574080228805542,
"learning_rate": 1.4276094276094276e-05,
"loss": 0.0973,
"step": 340
},
{
"epoch": 0.8611111111111112,
"grad_norm": 1.2174127101898193,
"learning_rate": 1.4259259259259259e-05,
"loss": 0.1903,
"step": 341
},
{
"epoch": 0.8636363636363636,
"grad_norm": 3.2463648319244385,
"learning_rate": 1.4242424242424245e-05,
"loss": 0.0818,
"step": 342
},
{
"epoch": 0.8661616161616161,
"grad_norm": 6.192782402038574,
"learning_rate": 1.4225589225589227e-05,
"loss": 0.2601,
"step": 343
},
{
"epoch": 0.8686868686868687,
"grad_norm": 2.965963125228882,
"learning_rate": 1.420875420875421e-05,
"loss": 0.1399,
"step": 344
},
{
"epoch": 0.8712121212121212,
"grad_norm": 2.0515079498291016,
"learning_rate": 1.4191919191919193e-05,
"loss": 0.2507,
"step": 345
},
{
"epoch": 0.8737373737373737,
"grad_norm": 2.2152068614959717,
"learning_rate": 1.4175084175084177e-05,
"loss": 0.107,
"step": 346
},
{
"epoch": 0.8762626262626263,
"grad_norm": 1.5435770750045776,
"learning_rate": 1.415824915824916e-05,
"loss": 0.1083,
"step": 347
},
{
"epoch": 0.8787878787878788,
"grad_norm": 9.01554012298584,
"learning_rate": 1.4141414141414143e-05,
"loss": 0.1817,
"step": 348
},
{
"epoch": 0.8813131313131313,
"grad_norm": 4.514248847961426,
"learning_rate": 1.4124579124579125e-05,
"loss": 0.111,
"step": 349
},
{
"epoch": 0.8838383838383839,
"grad_norm": 2.0948216915130615,
"learning_rate": 1.4107744107744108e-05,
"loss": 0.141,
"step": 350
},
{
"epoch": 0.8863636363636364,
"grad_norm": 1.3202215433120728,
"learning_rate": 1.4090909090909092e-05,
"loss": 0.122,
"step": 351
},
{
"epoch": 0.8888888888888888,
"grad_norm": 2.4289798736572266,
"learning_rate": 1.4074074074074075e-05,
"loss": 0.1194,
"step": 352
},
{
"epoch": 0.8914141414141414,
"grad_norm": 1.9062124490737915,
"learning_rate": 1.4057239057239058e-05,
"loss": 0.118,
"step": 353
},
{
"epoch": 0.8939393939393939,
"grad_norm": 4.942960739135742,
"learning_rate": 1.404040404040404e-05,
"loss": 0.1745,
"step": 354
},
{
"epoch": 0.8964646464646465,
"grad_norm": 1.1973398923873901,
"learning_rate": 1.4023569023569023e-05,
"loss": 0.0998,
"step": 355
},
{
"epoch": 0.898989898989899,
"grad_norm": 2.537156343460083,
"learning_rate": 1.4006734006734009e-05,
"loss": 0.2053,
"step": 356
},
{
"epoch": 0.9015151515151515,
"grad_norm": 1.6075160503387451,
"learning_rate": 1.3989898989898992e-05,
"loss": 0.1967,
"step": 357
},
{
"epoch": 0.9040404040404041,
"grad_norm": 1.8782991170883179,
"learning_rate": 1.3973063973063974e-05,
"loss": 0.1067,
"step": 358
},
{
"epoch": 0.9065656565656566,
"grad_norm": 1.8922234773635864,
"learning_rate": 1.3956228956228957e-05,
"loss": 0.1048,
"step": 359
},
{
"epoch": 0.9090909090909091,
"grad_norm": 2.411635637283325,
"learning_rate": 1.3939393939393942e-05,
"loss": 0.1557,
"step": 360
},
{
"epoch": 0.9116161616161617,
"grad_norm": 2.8136637210845947,
"learning_rate": 1.3922558922558924e-05,
"loss": 0.207,
"step": 361
},
{
"epoch": 0.9141414141414141,
"grad_norm": 3.0127274990081787,
"learning_rate": 1.3905723905723907e-05,
"loss": 0.1463,
"step": 362
},
{
"epoch": 0.9166666666666666,
"grad_norm": 5.223660469055176,
"learning_rate": 1.388888888888889e-05,
"loss": 0.1901,
"step": 363
},
{
"epoch": 0.9191919191919192,
"grad_norm": 2.7952096462249756,
"learning_rate": 1.3872053872053872e-05,
"loss": 0.1744,
"step": 364
},
{
"epoch": 0.9217171717171717,
"grad_norm": 4.329316139221191,
"learning_rate": 1.3855218855218857e-05,
"loss": 0.1503,
"step": 365
},
{
"epoch": 0.9242424242424242,
"grad_norm": 1.9337060451507568,
"learning_rate": 1.383838383838384e-05,
"loss": 0.1652,
"step": 366
},
{
"epoch": 0.9267676767676768,
"grad_norm": 6.0468645095825195,
"learning_rate": 1.3821548821548822e-05,
"loss": 0.21,
"step": 367
},
{
"epoch": 0.9292929292929293,
"grad_norm": 6.893640041351318,
"learning_rate": 1.3804713804713805e-05,
"loss": 0.0772,
"step": 368
},
{
"epoch": 0.9318181818181818,
"grad_norm": 11.513550758361816,
"learning_rate": 1.378787878787879e-05,
"loss": 0.1589,
"step": 369
},
{
"epoch": 0.9343434343434344,
"grad_norm": 3.4360713958740234,
"learning_rate": 1.3771043771043773e-05,
"loss": 0.1376,
"step": 370
},
{
"epoch": 0.9368686868686869,
"grad_norm": 1.2209364175796509,
"learning_rate": 1.3754208754208756e-05,
"loss": 0.1248,
"step": 371
},
{
"epoch": 0.9393939393939394,
"grad_norm": 4.991886615753174,
"learning_rate": 1.3737373737373739e-05,
"loss": 0.1584,
"step": 372
},
{
"epoch": 0.9419191919191919,
"grad_norm": 5.338135242462158,
"learning_rate": 1.3720538720538721e-05,
"loss": 0.1409,
"step": 373
},
{
"epoch": 0.9444444444444444,
"grad_norm": 1.5582698583602905,
"learning_rate": 1.3703703703703706e-05,
"loss": 0.0927,
"step": 374
},
{
"epoch": 0.946969696969697,
"grad_norm": 2.0467121601104736,
"learning_rate": 1.3686868686868689e-05,
"loss": 0.1042,
"step": 375
},
{
"epoch": 0.9494949494949495,
"grad_norm": 3.5191733837127686,
"learning_rate": 1.3670033670033671e-05,
"loss": 0.1543,
"step": 376
},
{
"epoch": 0.952020202020202,
"grad_norm": 0.8805956244468689,
"learning_rate": 1.3653198653198654e-05,
"loss": 0.0977,
"step": 377
},
{
"epoch": 0.9545454545454546,
"grad_norm": 1.0059103965759277,
"learning_rate": 1.3636363636363637e-05,
"loss": 0.111,
"step": 378
},
{
"epoch": 0.9570707070707071,
"grad_norm": 2.59409761428833,
"learning_rate": 1.3619528619528621e-05,
"loss": 0.0839,
"step": 379
},
{
"epoch": 0.9595959595959596,
"grad_norm": 3.308858633041382,
"learning_rate": 1.3602693602693604e-05,
"loss": 0.1505,
"step": 380
},
{
"epoch": 0.9621212121212122,
"grad_norm": 7.8376970291137695,
"learning_rate": 1.3585858585858586e-05,
"loss": 0.1694,
"step": 381
},
{
"epoch": 0.9646464646464646,
"grad_norm": 2.214016914367676,
"learning_rate": 1.3569023569023569e-05,
"loss": 0.2274,
"step": 382
},
{
"epoch": 0.9671717171717171,
"grad_norm": 2.3379106521606445,
"learning_rate": 1.3552188552188555e-05,
"loss": 0.1562,
"step": 383
},
{
"epoch": 0.9696969696969697,
"grad_norm": 4.5499043464660645,
"learning_rate": 1.3535353535353538e-05,
"loss": 0.1899,
"step": 384
},
{
"epoch": 0.9722222222222222,
"grad_norm": 5.938162803649902,
"learning_rate": 1.351851851851852e-05,
"loss": 0.1627,
"step": 385
},
{
"epoch": 0.9747474747474747,
"grad_norm": 2.1362643241882324,
"learning_rate": 1.3501683501683503e-05,
"loss": 0.1437,
"step": 386
},
{
"epoch": 0.9772727272727273,
"grad_norm": 5.690845012664795,
"learning_rate": 1.3484848484848486e-05,
"loss": 0.1572,
"step": 387
},
{
"epoch": 0.9797979797979798,
"grad_norm": 1.170046329498291,
"learning_rate": 1.346801346801347e-05,
"loss": 0.0697,
"step": 388
},
{
"epoch": 0.9823232323232324,
"grad_norm": 2.7204504013061523,
"learning_rate": 1.3451178451178453e-05,
"loss": 0.1125,
"step": 389
},
{
"epoch": 0.9848484848484849,
"grad_norm": 2.044360637664795,
"learning_rate": 1.3434343434343436e-05,
"loss": 0.0664,
"step": 390
},
{
"epoch": 0.9873737373737373,
"grad_norm": 2.956345558166504,
"learning_rate": 1.3417508417508418e-05,
"loss": 0.1254,
"step": 391
},
{
"epoch": 0.98989898989899,
"grad_norm": 3.5149104595184326,
"learning_rate": 1.3400673400673401e-05,
"loss": 0.243,
"step": 392
},
{
"epoch": 0.9924242424242424,
"grad_norm": 3.848884344100952,
"learning_rate": 1.3383838383838385e-05,
"loss": 0.2502,
"step": 393
},
{
"epoch": 0.9949494949494949,
"grad_norm": 5.738306522369385,
"learning_rate": 1.3367003367003368e-05,
"loss": 0.1173,
"step": 394
},
{
"epoch": 0.9974747474747475,
"grad_norm": 3.4760327339172363,
"learning_rate": 1.335016835016835e-05,
"loss": 0.251,
"step": 395
},
{
"epoch": 1.0,
"grad_norm": 3.0074808597564697,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.2484,
"step": 396
},
{
"epoch": 1.0,
"eval_accuracy": 0.7363636363636363,
"eval_f1": 0.8896302474284127,
"eval_loss": 0.1552901417016983,
"eval_runtime": 38.4517,
"eval_samples_per_second": 22.886,
"eval_steps_per_second": 0.494,
"step": 396
},
{
"epoch": 1.0025252525252526,
"grad_norm": 2.1957874298095703,
"learning_rate": 1.331649831649832e-05,
"loss": 0.2159,
"step": 397
},
{
"epoch": 1.005050505050505,
"grad_norm": 1.4837441444396973,
"learning_rate": 1.3299663299663302e-05,
"loss": 0.1777,
"step": 398
},
{
"epoch": 1.0075757575757576,
"grad_norm": 1.9972238540649414,
"learning_rate": 1.3282828282828285e-05,
"loss": 0.1156,
"step": 399
},
{
"epoch": 1.0101010101010102,
"grad_norm": 2.300161361694336,
"learning_rate": 1.3265993265993267e-05,
"loss": 0.1664,
"step": 400
},
{
"epoch": 1.0126262626262625,
"grad_norm": 5.714986801147461,
"learning_rate": 1.324915824915825e-05,
"loss": 0.1888,
"step": 401
},
{
"epoch": 1.0151515151515151,
"grad_norm": 2.443563222885132,
"learning_rate": 1.3232323232323234e-05,
"loss": 0.1548,
"step": 402
},
{
"epoch": 1.0176767676767677,
"grad_norm": 4.469695091247559,
"learning_rate": 1.3215488215488217e-05,
"loss": 0.197,
"step": 403
},
{
"epoch": 1.02020202020202,
"grad_norm": 2.7210092544555664,
"learning_rate": 1.31986531986532e-05,
"loss": 0.1858,
"step": 404
},
{
"epoch": 1.0227272727272727,
"grad_norm": 2.2780253887176514,
"learning_rate": 1.3181818181818183e-05,
"loss": 0.1188,
"step": 405
},
{
"epoch": 1.0252525252525253,
"grad_norm": 4.452651500701904,
"learning_rate": 1.3164983164983165e-05,
"loss": 0.1813,
"step": 406
},
{
"epoch": 1.0277777777777777,
"grad_norm": 3.2592673301696777,
"learning_rate": 1.314814814814815e-05,
"loss": 0.1642,
"step": 407
},
{
"epoch": 1.0303030303030303,
"grad_norm": 2.8365068435668945,
"learning_rate": 1.3131313131313132e-05,
"loss": 0.1951,
"step": 408
},
{
"epoch": 1.0328282828282829,
"grad_norm": 1.204214334487915,
"learning_rate": 1.3114478114478115e-05,
"loss": 0.0943,
"step": 409
},
{
"epoch": 1.0353535353535352,
"grad_norm": 3.9835519790649414,
"learning_rate": 1.3097643097643098e-05,
"loss": 0.1914,
"step": 410
},
{
"epoch": 1.0378787878787878,
"grad_norm": 2.162397861480713,
"learning_rate": 1.3080808080808084e-05,
"loss": 0.1681,
"step": 411
},
{
"epoch": 1.0404040404040404,
"grad_norm": 5.173393726348877,
"learning_rate": 1.3063973063973066e-05,
"loss": 0.1114,
"step": 412
},
{
"epoch": 1.0429292929292928,
"grad_norm": 3.75376558303833,
"learning_rate": 1.3047138047138049e-05,
"loss": 0.2572,
"step": 413
},
{
"epoch": 1.0454545454545454,
"grad_norm": 2.164644479751587,
"learning_rate": 1.3030303030303032e-05,
"loss": 0.0901,
"step": 414
},
{
"epoch": 1.047979797979798,
"grad_norm": 1.5438755750656128,
"learning_rate": 1.3013468013468014e-05,
"loss": 0.1127,
"step": 415
},
{
"epoch": 1.0505050505050506,
"grad_norm": 1.1772854328155518,
"learning_rate": 1.2996632996632999e-05,
"loss": 0.1177,
"step": 416
},
{
"epoch": 1.053030303030303,
"grad_norm": 1.5766481161117554,
"learning_rate": 1.2979797979797981e-05,
"loss": 0.186,
"step": 417
},
{
"epoch": 1.0555555555555556,
"grad_norm": 3.8098015785217285,
"learning_rate": 1.2962962962962964e-05,
"loss": 0.2378,
"step": 418
},
{
"epoch": 1.0580808080808082,
"grad_norm": 2.2948317527770996,
"learning_rate": 1.2946127946127947e-05,
"loss": 0.2139,
"step": 419
},
{
"epoch": 1.0606060606060606,
"grad_norm": 3.4112160205841064,
"learning_rate": 1.2929292929292931e-05,
"loss": 0.1936,
"step": 420
},
{
"epoch": 1.0631313131313131,
"grad_norm": 6.034069061279297,
"learning_rate": 1.2912457912457914e-05,
"loss": 0.0954,
"step": 421
},
{
"epoch": 1.0656565656565657,
"grad_norm": 1.4731574058532715,
"learning_rate": 1.2895622895622897e-05,
"loss": 0.1835,
"step": 422
},
{
"epoch": 1.0681818181818181,
"grad_norm": 2.9212472438812256,
"learning_rate": 1.287878787878788e-05,
"loss": 0.1626,
"step": 423
},
{
"epoch": 1.0707070707070707,
"grad_norm": 2.7289297580718994,
"learning_rate": 1.2861952861952862e-05,
"loss": 0.0934,
"step": 424
},
{
"epoch": 1.0732323232323233,
"grad_norm": 2.0637686252593994,
"learning_rate": 1.2845117845117846e-05,
"loss": 0.1157,
"step": 425
},
{
"epoch": 1.0757575757575757,
"grad_norm": 5.231685638427734,
"learning_rate": 1.2828282828282829e-05,
"loss": 0.1704,
"step": 426
},
{
"epoch": 1.0782828282828283,
"grad_norm": 2.5837466716766357,
"learning_rate": 1.2811447811447812e-05,
"loss": 0.1756,
"step": 427
},
{
"epoch": 1.0808080808080809,
"grad_norm": 1.4013397693634033,
"learning_rate": 1.2794612794612794e-05,
"loss": 0.0978,
"step": 428
},
{
"epoch": 1.0833333333333333,
"grad_norm": 2.0431172847747803,
"learning_rate": 1.2777777777777777e-05,
"loss": 0.1109,
"step": 429
},
{
"epoch": 1.0858585858585859,
"grad_norm": 1.8190507888793945,
"learning_rate": 1.2760942760942763e-05,
"loss": 0.1234,
"step": 430
},
{
"epoch": 1.0883838383838385,
"grad_norm": 2.79791259765625,
"learning_rate": 1.2744107744107746e-05,
"loss": 0.1882,
"step": 431
},
{
"epoch": 1.0909090909090908,
"grad_norm": 1.9243866205215454,
"learning_rate": 1.2727272727272728e-05,
"loss": 0.2005,
"step": 432
},
{
"epoch": 1.0934343434343434,
"grad_norm": 1.8950653076171875,
"learning_rate": 1.2710437710437711e-05,
"loss": 0.1346,
"step": 433
},
{
"epoch": 1.095959595959596,
"grad_norm": 3.081726551055908,
"learning_rate": 1.2693602693602696e-05,
"loss": 0.1366,
"step": 434
},
{
"epoch": 1.0984848484848484,
"grad_norm": 1.9953432083129883,
"learning_rate": 1.2676767676767678e-05,
"loss": 0.1675,
"step": 435
},
{
"epoch": 1.101010101010101,
"grad_norm": 1.969563603401184,
"learning_rate": 1.2659932659932661e-05,
"loss": 0.0675,
"step": 436
},
{
"epoch": 1.1035353535353536,
"grad_norm": 2.647690773010254,
"learning_rate": 1.2643097643097644e-05,
"loss": 0.1804,
"step": 437
},
{
"epoch": 1.106060606060606,
"grad_norm": 1.8048343658447266,
"learning_rate": 1.2626262626262626e-05,
"loss": 0.1463,
"step": 438
},
{
"epoch": 1.1085858585858586,
"grad_norm": 3.305330514907837,
"learning_rate": 1.260942760942761e-05,
"loss": 0.1339,
"step": 439
},
{
"epoch": 1.1111111111111112,
"grad_norm": 1.416746973991394,
"learning_rate": 1.2592592592592593e-05,
"loss": 0.1269,
"step": 440
},
{
"epoch": 1.1136363636363635,
"grad_norm": 5.987617015838623,
"learning_rate": 1.2575757575757576e-05,
"loss": 0.1437,
"step": 441
},
{
"epoch": 1.1161616161616161,
"grad_norm": 2.646730422973633,
"learning_rate": 1.2558922558922559e-05,
"loss": 0.1424,
"step": 442
},
{
"epoch": 1.1186868686868687,
"grad_norm": 12.504968643188477,
"learning_rate": 1.2542087542087541e-05,
"loss": 0.2163,
"step": 443
},
{
"epoch": 1.121212121212121,
"grad_norm": 3.575183153152466,
"learning_rate": 1.2525252525252527e-05,
"loss": 0.2721,
"step": 444
},
{
"epoch": 1.1237373737373737,
"grad_norm": 2.08457088470459,
"learning_rate": 1.250841750841751e-05,
"loss": 0.0727,
"step": 445
},
{
"epoch": 1.1262626262626263,
"grad_norm": 3.8432371616363525,
"learning_rate": 1.2491582491582493e-05,
"loss": 0.1455,
"step": 446
},
{
"epoch": 1.128787878787879,
"grad_norm": 1.7455401420593262,
"learning_rate": 1.2474747474747475e-05,
"loss": 0.1424,
"step": 447
},
{
"epoch": 1.1313131313131313,
"grad_norm": 2.4613749980926514,
"learning_rate": 1.245791245791246e-05,
"loss": 0.2304,
"step": 448
},
{
"epoch": 1.1338383838383839,
"grad_norm": 0.8536304831504822,
"learning_rate": 1.2441077441077443e-05,
"loss": 0.0967,
"step": 449
},
{
"epoch": 1.1363636363636362,
"grad_norm": 1.8662675619125366,
"learning_rate": 1.2424242424242425e-05,
"loss": 0.1165,
"step": 450
},
{
"epoch": 1.1388888888888888,
"grad_norm": 6.983151435852051,
"learning_rate": 1.2407407407407408e-05,
"loss": 0.1065,
"step": 451
},
{
"epoch": 1.1414141414141414,
"grad_norm": 3.3465776443481445,
"learning_rate": 1.239057239057239e-05,
"loss": 0.1295,
"step": 452
},
{
"epoch": 1.143939393939394,
"grad_norm": 3.347223997116089,
"learning_rate": 1.2373737373737375e-05,
"loss": 0.0947,
"step": 453
},
{
"epoch": 1.1464646464646464,
"grad_norm": 2.8548214435577393,
"learning_rate": 1.2356902356902358e-05,
"loss": 0.1031,
"step": 454
},
{
"epoch": 1.148989898989899,
"grad_norm": 5.722323417663574,
"learning_rate": 1.234006734006734e-05,
"loss": 0.2435,
"step": 455
},
{
"epoch": 1.1515151515151516,
"grad_norm": 4.030499458312988,
"learning_rate": 1.2323232323232323e-05,
"loss": 0.2088,
"step": 456
},
{
"epoch": 1.154040404040404,
"grad_norm": 1.1742424964904785,
"learning_rate": 1.2306397306397306e-05,
"loss": 0.149,
"step": 457
},
{
"epoch": 1.1565656565656566,
"grad_norm": 3.6498565673828125,
"learning_rate": 1.2289562289562292e-05,
"loss": 0.1704,
"step": 458
},
{
"epoch": 1.1590909090909092,
"grad_norm": 7.278556823730469,
"learning_rate": 1.2272727272727274e-05,
"loss": 0.2622,
"step": 459
},
{
"epoch": 1.1616161616161615,
"grad_norm": 5.127131462097168,
"learning_rate": 1.2255892255892257e-05,
"loss": 0.1456,
"step": 460
},
{
"epoch": 1.1641414141414141,
"grad_norm": 5.602865219116211,
"learning_rate": 1.223905723905724e-05,
"loss": 0.2437,
"step": 461
},
{
"epoch": 1.1666666666666667,
"grad_norm": 2.088778257369995,
"learning_rate": 1.2222222222222224e-05,
"loss": 0.1368,
"step": 462
},
{
"epoch": 1.1691919191919191,
"grad_norm": 3.3843162059783936,
"learning_rate": 1.2205387205387207e-05,
"loss": 0.1656,
"step": 463
},
{
"epoch": 1.1717171717171717,
"grad_norm": 6.630326271057129,
"learning_rate": 1.218855218855219e-05,
"loss": 0.1077,
"step": 464
},
{
"epoch": 1.1742424242424243,
"grad_norm": 1.9552891254425049,
"learning_rate": 1.2171717171717172e-05,
"loss": 0.069,
"step": 465
},
{
"epoch": 1.1767676767676767,
"grad_norm": 2.806879997253418,
"learning_rate": 1.2154882154882155e-05,
"loss": 0.1524,
"step": 466
},
{
"epoch": 1.1792929292929293,
"grad_norm": 5.727405071258545,
"learning_rate": 1.213804713804714e-05,
"loss": 0.2153,
"step": 467
},
{
"epoch": 1.1818181818181819,
"grad_norm": 2.179191827774048,
"learning_rate": 1.2121212121212122e-05,
"loss": 0.0931,
"step": 468
},
{
"epoch": 1.1843434343434343,
"grad_norm": 4.29899263381958,
"learning_rate": 1.2104377104377105e-05,
"loss": 0.3174,
"step": 469
},
{
"epoch": 1.1868686868686869,
"grad_norm": 3.667917490005493,
"learning_rate": 1.2087542087542087e-05,
"loss": 0.1926,
"step": 470
},
{
"epoch": 1.1893939393939394,
"grad_norm": 2.626540422439575,
"learning_rate": 1.207070707070707e-05,
"loss": 0.1145,
"step": 471
},
{
"epoch": 1.1919191919191918,
"grad_norm": 2.2297780513763428,
"learning_rate": 1.2053872053872056e-05,
"loss": 0.1552,
"step": 472
},
{
"epoch": 1.1944444444444444,
"grad_norm": 5.271230220794678,
"learning_rate": 1.2037037037037039e-05,
"loss": 0.0832,
"step": 473
},
{
"epoch": 1.196969696969697,
"grad_norm": 3.2105331420898438,
"learning_rate": 1.2020202020202021e-05,
"loss": 0.1523,
"step": 474
},
{
"epoch": 1.1994949494949494,
"grad_norm": 3.91426157951355,
"learning_rate": 1.2003367003367004e-05,
"loss": 0.0819,
"step": 475
},
{
"epoch": 1.202020202020202,
"grad_norm": 3.227917194366455,
"learning_rate": 1.1986531986531988e-05,
"loss": 0.2343,
"step": 476
},
{
"epoch": 1.2045454545454546,
"grad_norm": 3.9899652004241943,
"learning_rate": 1.1969696969696971e-05,
"loss": 0.1466,
"step": 477
},
{
"epoch": 1.2070707070707072,
"grad_norm": 1.5494883060455322,
"learning_rate": 1.1952861952861954e-05,
"loss": 0.1265,
"step": 478
},
{
"epoch": 1.2095959595959596,
"grad_norm": 3.4442131519317627,
"learning_rate": 1.1936026936026937e-05,
"loss": 0.1355,
"step": 479
},
{
"epoch": 1.2121212121212122,
"grad_norm": 2.505126714706421,
"learning_rate": 1.191919191919192e-05,
"loss": 0.1529,
"step": 480
},
{
"epoch": 1.2146464646464645,
"grad_norm": 3.982832431793213,
"learning_rate": 1.1902356902356904e-05,
"loss": 0.1428,
"step": 481
},
{
"epoch": 1.2171717171717171,
"grad_norm": 2.3964552879333496,
"learning_rate": 1.1885521885521886e-05,
"loss": 0.1062,
"step": 482
},
{
"epoch": 1.2196969696969697,
"grad_norm": 11.200109481811523,
"learning_rate": 1.1868686868686869e-05,
"loss": 0.1621,
"step": 483
},
{
"epoch": 1.2222222222222223,
"grad_norm": 3.8272476196289062,
"learning_rate": 1.1851851851851852e-05,
"loss": 0.1814,
"step": 484
},
{
"epoch": 1.2247474747474747,
"grad_norm": 2.3707261085510254,
"learning_rate": 1.1835016835016838e-05,
"loss": 0.1253,
"step": 485
},
{
"epoch": 1.2272727272727273,
"grad_norm": 6.070870399475098,
"learning_rate": 1.181818181818182e-05,
"loss": 0.1005,
"step": 486
},
{
"epoch": 1.22979797979798,
"grad_norm": 2.2206804752349854,
"learning_rate": 1.1801346801346803e-05,
"loss": 0.0647,
"step": 487
},
{
"epoch": 1.2323232323232323,
"grad_norm": 1.8281488418579102,
"learning_rate": 1.1784511784511786e-05,
"loss": 0.0642,
"step": 488
},
{
"epoch": 1.2348484848484849,
"grad_norm": 2.690546751022339,
"learning_rate": 1.1767676767676768e-05,
"loss": 0.1824,
"step": 489
},
{
"epoch": 1.2373737373737375,
"grad_norm": 8.035049438476562,
"learning_rate": 1.1750841750841753e-05,
"loss": 0.2047,
"step": 490
},
{
"epoch": 1.2398989898989898,
"grad_norm": 2.475928783416748,
"learning_rate": 1.1734006734006735e-05,
"loss": 0.0882,
"step": 491
},
{
"epoch": 1.2424242424242424,
"grad_norm": 2.2812142372131348,
"learning_rate": 1.1717171717171718e-05,
"loss": 0.1556,
"step": 492
},
{
"epoch": 1.244949494949495,
"grad_norm": 1.1551276445388794,
"learning_rate": 1.17003367003367e-05,
"loss": 0.0905,
"step": 493
},
{
"epoch": 1.2474747474747474,
"grad_norm": 1.8045101165771484,
"learning_rate": 1.1683501683501684e-05,
"loss": 0.1399,
"step": 494
},
{
"epoch": 1.25,
"grad_norm": 2.3668212890625,
"learning_rate": 1.1666666666666668e-05,
"loss": 0.1231,
"step": 495
},
{
"epoch": 1.2525252525252526,
"grad_norm": 4.466938018798828,
"learning_rate": 1.164983164983165e-05,
"loss": 0.1478,
"step": 496
},
{
"epoch": 1.255050505050505,
"grad_norm": 6.409605026245117,
"learning_rate": 1.1632996632996633e-05,
"loss": 0.0892,
"step": 497
},
{
"epoch": 1.2575757575757576,
"grad_norm": 3.112314224243164,
"learning_rate": 1.1616161616161616e-05,
"loss": 0.2378,
"step": 498
},
{
"epoch": 1.2601010101010102,
"grad_norm": 2.3682100772857666,
"learning_rate": 1.1599326599326602e-05,
"loss": 0.0708,
"step": 499
},
{
"epoch": 1.2626262626262625,
"grad_norm": 1.4351693391799927,
"learning_rate": 1.1582491582491585e-05,
"loss": 0.1386,
"step": 500
},
{
"epoch": 1.2651515151515151,
"grad_norm": 1.9613323211669922,
"learning_rate": 1.1565656565656567e-05,
"loss": 0.0956,
"step": 501
},
{
"epoch": 1.2676767676767677,
"grad_norm": 3.0842106342315674,
"learning_rate": 1.154882154882155e-05,
"loss": 0.1891,
"step": 502
},
{
"epoch": 1.2702020202020203,
"grad_norm": 1.4221595525741577,
"learning_rate": 1.1531986531986533e-05,
"loss": 0.1238,
"step": 503
},
{
"epoch": 1.2727272727272727,
"grad_norm": 2.890872001647949,
"learning_rate": 1.1515151515151517e-05,
"loss": 0.1804,
"step": 504
},
{
"epoch": 1.2752525252525253,
"grad_norm": 2.4430460929870605,
"learning_rate": 1.14983164983165e-05,
"loss": 0.1566,
"step": 505
},
{
"epoch": 1.2777777777777777,
"grad_norm": 4.546942710876465,
"learning_rate": 1.1481481481481482e-05,
"loss": 0.088,
"step": 506
},
{
"epoch": 1.2803030303030303,
"grad_norm": 3.6536505222320557,
"learning_rate": 1.1464646464646465e-05,
"loss": 0.2054,
"step": 507
},
{
"epoch": 1.2828282828282829,
"grad_norm": 1.6071276664733887,
"learning_rate": 1.1447811447811448e-05,
"loss": 0.1044,
"step": 508
},
{
"epoch": 1.2853535353535355,
"grad_norm": 3.828359365463257,
"learning_rate": 1.1430976430976432e-05,
"loss": 0.1279,
"step": 509
},
{
"epoch": 1.2878787878787878,
"grad_norm": 3.433269500732422,
"learning_rate": 1.1414141414141415e-05,
"loss": 0.1197,
"step": 510
},
{
"epoch": 1.2904040404040404,
"grad_norm": 1.6743693351745605,
"learning_rate": 1.1397306397306398e-05,
"loss": 0.1057,
"step": 511
},
{
"epoch": 1.2929292929292928,
"grad_norm": 5.027686595916748,
"learning_rate": 1.138047138047138e-05,
"loss": 0.1509,
"step": 512
},
{
"epoch": 1.2954545454545454,
"grad_norm": 1.6235765218734741,
"learning_rate": 1.1363636363636366e-05,
"loss": 0.1147,
"step": 513
},
{
"epoch": 1.297979797979798,
"grad_norm": 2.351604700088501,
"learning_rate": 1.1346801346801349e-05,
"loss": 0.0844,
"step": 514
},
{
"epoch": 1.3005050505050506,
"grad_norm": 4.877485275268555,
"learning_rate": 1.1329966329966332e-05,
"loss": 0.1426,
"step": 515
},
{
"epoch": 1.303030303030303,
"grad_norm": 1.512285828590393,
"learning_rate": 1.1313131313131314e-05,
"loss": 0.1225,
"step": 516
},
{
"epoch": 1.3055555555555556,
"grad_norm": 2.247408628463745,
"learning_rate": 1.1296296296296297e-05,
"loss": 0.105,
"step": 517
},
{
"epoch": 1.308080808080808,
"grad_norm": 1.3952003717422485,
"learning_rate": 1.1279461279461281e-05,
"loss": 0.0738,
"step": 518
},
{
"epoch": 1.3106060606060606,
"grad_norm": 1.185707688331604,
"learning_rate": 1.1262626262626264e-05,
"loss": 0.0644,
"step": 519
},
{
"epoch": 1.3131313131313131,
"grad_norm": 1.2581713199615479,
"learning_rate": 1.1245791245791247e-05,
"loss": 0.1065,
"step": 520
},
{
"epoch": 1.3156565656565657,
"grad_norm": 5.530824661254883,
"learning_rate": 1.122895622895623e-05,
"loss": 0.1829,
"step": 521
},
{
"epoch": 1.3181818181818181,
"grad_norm": 2.5609781742095947,
"learning_rate": 1.1212121212121212e-05,
"loss": 0.2349,
"step": 522
},
{
"epoch": 1.3207070707070707,
"grad_norm": 2.8253445625305176,
"learning_rate": 1.1195286195286197e-05,
"loss": 0.1403,
"step": 523
},
{
"epoch": 1.3232323232323233,
"grad_norm": 4.705146312713623,
"learning_rate": 1.117845117845118e-05,
"loss": 0.2046,
"step": 524
},
{
"epoch": 1.3257575757575757,
"grad_norm": 4.86195182800293,
"learning_rate": 1.1161616161616162e-05,
"loss": 0.1632,
"step": 525
},
{
"epoch": 1.3282828282828283,
"grad_norm": 2.6909475326538086,
"learning_rate": 1.1144781144781145e-05,
"loss": 0.0647,
"step": 526
},
{
"epoch": 1.3308080808080809,
"grad_norm": 1.6233677864074707,
"learning_rate": 1.112794612794613e-05,
"loss": 0.1891,
"step": 527
},
{
"epoch": 1.3333333333333333,
"grad_norm": 1.8622492551803589,
"learning_rate": 1.1111111111111113e-05,
"loss": 0.1507,
"step": 528
},
{
"epoch": 1.3358585858585859,
"grad_norm": 3.173917770385742,
"learning_rate": 1.1094276094276096e-05,
"loss": 0.1618,
"step": 529
},
{
"epoch": 1.3383838383838385,
"grad_norm": 5.947041034698486,
"learning_rate": 1.1077441077441079e-05,
"loss": 0.1692,
"step": 530
},
{
"epoch": 1.3409090909090908,
"grad_norm": 2.8621153831481934,
"learning_rate": 1.1060606060606061e-05,
"loss": 0.218,
"step": 531
},
{
"epoch": 1.3434343434343434,
"grad_norm": 3.8391976356506348,
"learning_rate": 1.1043771043771046e-05,
"loss": 0.2624,
"step": 532
},
{
"epoch": 1.345959595959596,
"grad_norm": 4.155307769775391,
"learning_rate": 1.1026936026936028e-05,
"loss": 0.1923,
"step": 533
},
{
"epoch": 1.3484848484848486,
"grad_norm": 2.796172618865967,
"learning_rate": 1.1010101010101011e-05,
"loss": 0.1886,
"step": 534
},
{
"epoch": 1.351010101010101,
"grad_norm": 3.59019136428833,
"learning_rate": 1.0993265993265994e-05,
"loss": 0.1854,
"step": 535
},
{
"epoch": 1.3535353535353536,
"grad_norm": 2.077014684677124,
"learning_rate": 1.0976430976430978e-05,
"loss": 0.0919,
"step": 536
},
{
"epoch": 1.356060606060606,
"grad_norm": 2.869927167892456,
"learning_rate": 1.0959595959595961e-05,
"loss": 0.1828,
"step": 537
},
{
"epoch": 1.3585858585858586,
"grad_norm": 2.4379348754882812,
"learning_rate": 1.0942760942760944e-05,
"loss": 0.1257,
"step": 538
},
{
"epoch": 1.3611111111111112,
"grad_norm": 2.5572493076324463,
"learning_rate": 1.0925925925925926e-05,
"loss": 0.169,
"step": 539
},
{
"epoch": 1.3636363636363638,
"grad_norm": 7.126609802246094,
"learning_rate": 1.0909090909090909e-05,
"loss": 0.1178,
"step": 540
},
{
"epoch": 1.3661616161616161,
"grad_norm": 3.068500280380249,
"learning_rate": 1.0892255892255893e-05,
"loss": 0.1507,
"step": 541
},
{
"epoch": 1.3686868686868687,
"grad_norm": 3.512056350708008,
"learning_rate": 1.0875420875420876e-05,
"loss": 0.1816,
"step": 542
},
{
"epoch": 1.371212121212121,
"grad_norm": 4.917716979980469,
"learning_rate": 1.0858585858585859e-05,
"loss": 0.1301,
"step": 543
},
{
"epoch": 1.3737373737373737,
"grad_norm": 2.207784414291382,
"learning_rate": 1.0841750841750841e-05,
"loss": 0.1251,
"step": 544
},
{
"epoch": 1.3762626262626263,
"grad_norm": 4.091345310211182,
"learning_rate": 1.0824915824915824e-05,
"loss": 0.2324,
"step": 545
},
{
"epoch": 1.378787878787879,
"grad_norm": 3.5930373668670654,
"learning_rate": 1.080808080808081e-05,
"loss": 0.2327,
"step": 546
},
{
"epoch": 1.3813131313131313,
"grad_norm": 0.9397197365760803,
"learning_rate": 1.0791245791245793e-05,
"loss": 0.0909,
"step": 547
},
{
"epoch": 1.3838383838383839,
"grad_norm": 1.6392264366149902,
"learning_rate": 1.0774410774410775e-05,
"loss": 0.065,
"step": 548
},
{
"epoch": 1.3863636363636362,
"grad_norm": 3.9621989727020264,
"learning_rate": 1.0757575757575758e-05,
"loss": 0.0888,
"step": 549
},
{
"epoch": 1.3888888888888888,
"grad_norm": 1.8630791902542114,
"learning_rate": 1.0740740740740742e-05,
"loss": 0.0705,
"step": 550
},
{
"epoch": 1.3914141414141414,
"grad_norm": 2.29435133934021,
"learning_rate": 1.0723905723905725e-05,
"loss": 0.1626,
"step": 551
},
{
"epoch": 1.393939393939394,
"grad_norm": 3.3439769744873047,
"learning_rate": 1.0707070707070708e-05,
"loss": 0.1741,
"step": 552
},
{
"epoch": 1.3964646464646464,
"grad_norm": 3.894381523132324,
"learning_rate": 1.069023569023569e-05,
"loss": 0.2404,
"step": 553
},
{
"epoch": 1.398989898989899,
"grad_norm": 2.4891560077667236,
"learning_rate": 1.0673400673400673e-05,
"loss": 0.1854,
"step": 554
},
{
"epoch": 1.4015151515151514,
"grad_norm": 2.0606627464294434,
"learning_rate": 1.0656565656565658e-05,
"loss": 0.1896,
"step": 555
},
{
"epoch": 1.404040404040404,
"grad_norm": 1.3142637014389038,
"learning_rate": 1.063973063973064e-05,
"loss": 0.0976,
"step": 556
},
{
"epoch": 1.4065656565656566,
"grad_norm": 1.7551708221435547,
"learning_rate": 1.0622895622895623e-05,
"loss": 0.1013,
"step": 557
},
{
"epoch": 1.4090909090909092,
"grad_norm": 2.389742612838745,
"learning_rate": 1.0606060606060606e-05,
"loss": 0.0802,
"step": 558
},
{
"epoch": 1.4116161616161615,
"grad_norm": 5.079484462738037,
"learning_rate": 1.0589225589225588e-05,
"loss": 0.1066,
"step": 559
},
{
"epoch": 1.4141414141414141,
"grad_norm": 1.7105693817138672,
"learning_rate": 1.0572390572390574e-05,
"loss": 0.0917,
"step": 560
},
{
"epoch": 1.4166666666666667,
"grad_norm": 2.481248617172241,
"learning_rate": 1.0555555555555557e-05,
"loss": 0.0901,
"step": 561
},
{
"epoch": 1.4191919191919191,
"grad_norm": 4.0751495361328125,
"learning_rate": 1.053872053872054e-05,
"loss": 0.1493,
"step": 562
},
{
"epoch": 1.4217171717171717,
"grad_norm": 2.6854546070098877,
"learning_rate": 1.0521885521885522e-05,
"loss": 0.1751,
"step": 563
},
{
"epoch": 1.4242424242424243,
"grad_norm": 7.801976203918457,
"learning_rate": 1.0505050505050507e-05,
"loss": 0.09,
"step": 564
},
{
"epoch": 1.4267676767676767,
"grad_norm": 1.9461811780929565,
"learning_rate": 1.048821548821549e-05,
"loss": 0.0539,
"step": 565
},
{
"epoch": 1.4292929292929293,
"grad_norm": 1.0220575332641602,
"learning_rate": 1.0471380471380472e-05,
"loss": 0.0629,
"step": 566
},
{
"epoch": 1.4318181818181819,
"grad_norm": 3.8231167793273926,
"learning_rate": 1.0454545454545455e-05,
"loss": 0.0949,
"step": 567
},
{
"epoch": 1.4343434343434343,
"grad_norm": 4.782219886779785,
"learning_rate": 1.0437710437710438e-05,
"loss": 0.2014,
"step": 568
},
{
"epoch": 1.4368686868686869,
"grad_norm": 1.7311866283416748,
"learning_rate": 1.0420875420875422e-05,
"loss": 0.1586,
"step": 569
},
{
"epoch": 1.4393939393939394,
"grad_norm": 1.6415760517120361,
"learning_rate": 1.0404040404040405e-05,
"loss": 0.0832,
"step": 570
},
{
"epoch": 1.441919191919192,
"grad_norm": 3.1272056102752686,
"learning_rate": 1.0387205387205387e-05,
"loss": 0.1086,
"step": 571
},
{
"epoch": 1.4444444444444444,
"grad_norm": 1.3914761543273926,
"learning_rate": 1.037037037037037e-05,
"loss": 0.0353,
"step": 572
},
{
"epoch": 1.446969696969697,
"grad_norm": 2.641190528869629,
"learning_rate": 1.0353535353535353e-05,
"loss": 0.1669,
"step": 573
},
{
"epoch": 1.4494949494949494,
"grad_norm": 1.974168300628662,
"learning_rate": 1.0336700336700339e-05,
"loss": 0.1607,
"step": 574
},
{
"epoch": 1.452020202020202,
"grad_norm": 1.5384374856948853,
"learning_rate": 1.0319865319865321e-05,
"loss": 0.066,
"step": 575
},
{
"epoch": 1.4545454545454546,
"grad_norm": 2.555971145629883,
"learning_rate": 1.0303030303030304e-05,
"loss": 0.1178,
"step": 576
},
{
"epoch": 1.4570707070707072,
"grad_norm": 3.460545301437378,
"learning_rate": 1.0286195286195287e-05,
"loss": 0.163,
"step": 577
},
{
"epoch": 1.4595959595959596,
"grad_norm": 2.9746346473693848,
"learning_rate": 1.0269360269360271e-05,
"loss": 0.2179,
"step": 578
},
{
"epoch": 1.4621212121212122,
"grad_norm": 1.8450326919555664,
"learning_rate": 1.0252525252525254e-05,
"loss": 0.0707,
"step": 579
},
{
"epoch": 1.4646464646464645,
"grad_norm": 1.596994400024414,
"learning_rate": 1.0235690235690236e-05,
"loss": 0.0712,
"step": 580
},
{
"epoch": 1.4671717171717171,
"grad_norm": 2.0924813747406006,
"learning_rate": 1.021885521885522e-05,
"loss": 0.1328,
"step": 581
},
{
"epoch": 1.4696969696969697,
"grad_norm": 2.734872579574585,
"learning_rate": 1.0202020202020202e-05,
"loss": 0.2434,
"step": 582
},
{
"epoch": 1.4722222222222223,
"grad_norm": 2.7146146297454834,
"learning_rate": 1.0185185185185186e-05,
"loss": 0.1953,
"step": 583
},
{
"epoch": 1.4747474747474747,
"grad_norm": 2.9375946521759033,
"learning_rate": 1.0168350168350169e-05,
"loss": 0.1656,
"step": 584
},
{
"epoch": 1.4772727272727273,
"grad_norm": 2.132500648498535,
"learning_rate": 1.0151515151515152e-05,
"loss": 0.112,
"step": 585
},
{
"epoch": 1.4797979797979797,
"grad_norm": 2.179478883743286,
"learning_rate": 1.0134680134680134e-05,
"loss": 0.0973,
"step": 586
},
{
"epoch": 1.4823232323232323,
"grad_norm": 3.4565017223358154,
"learning_rate": 1.0117845117845117e-05,
"loss": 0.1256,
"step": 587
},
{
"epoch": 1.4848484848484849,
"grad_norm": 1.9032288789749146,
"learning_rate": 1.0101010101010103e-05,
"loss": 0.0915,
"step": 588
},
{
"epoch": 1.4873737373737375,
"grad_norm": 8.383233070373535,
"learning_rate": 1.0084175084175086e-05,
"loss": 0.1538,
"step": 589
},
{
"epoch": 1.4898989898989898,
"grad_norm": 4.910621166229248,
"learning_rate": 1.0067340067340068e-05,
"loss": 0.1799,
"step": 590
},
{
"epoch": 1.4924242424242424,
"grad_norm": 2.6224441528320312,
"learning_rate": 1.0050505050505051e-05,
"loss": 0.155,
"step": 591
},
{
"epoch": 1.494949494949495,
"grad_norm": 3.4021310806274414,
"learning_rate": 1.0033670033670035e-05,
"loss": 0.1196,
"step": 592
},
{
"epoch": 1.4974747474747474,
"grad_norm": 2.7120611667633057,
"learning_rate": 1.0016835016835018e-05,
"loss": 0.08,
"step": 593
},
{
"epoch": 1.5,
"grad_norm": 1.137710452079773,
"learning_rate": 1e-05,
"loss": 0.078,
"step": 594
},
{
"epoch": 1.5025252525252526,
"grad_norm": 2.9225590229034424,
"learning_rate": 9.983164983164983e-06,
"loss": 0.2341,
"step": 595
},
{
"epoch": 1.5050505050505052,
"grad_norm": 1.6335861682891846,
"learning_rate": 9.966329966329968e-06,
"loss": 0.1192,
"step": 596
},
{
"epoch": 1.5075757575757576,
"grad_norm": 2.710495948791504,
"learning_rate": 9.94949494949495e-06,
"loss": 0.1237,
"step": 597
},
{
"epoch": 1.51010101010101,
"grad_norm": 2.903191328048706,
"learning_rate": 9.932659932659933e-06,
"loss": 0.0784,
"step": 598
},
{
"epoch": 1.5126262626262625,
"grad_norm": 3.359354019165039,
"learning_rate": 9.915824915824916e-06,
"loss": 0.2288,
"step": 599
},
{
"epoch": 1.5151515151515151,
"grad_norm": 2.92893648147583,
"learning_rate": 9.8989898989899e-06,
"loss": 0.1936,
"step": 600
},
{
"epoch": 1.5176767676767677,
"grad_norm": 3.7757456302642822,
"learning_rate": 9.882154882154883e-06,
"loss": 0.2431,
"step": 601
},
{
"epoch": 1.5202020202020203,
"grad_norm": 2.7293543815612793,
"learning_rate": 9.865319865319866e-06,
"loss": 0.1475,
"step": 602
},
{
"epoch": 1.5227272727272727,
"grad_norm": 4.0022873878479,
"learning_rate": 9.84848484848485e-06,
"loss": 0.1055,
"step": 603
},
{
"epoch": 1.5252525252525253,
"grad_norm": 4.107253074645996,
"learning_rate": 9.831649831649833e-06,
"loss": 0.1306,
"step": 604
},
{
"epoch": 1.5277777777777777,
"grad_norm": 2.5653955936431885,
"learning_rate": 9.814814814814815e-06,
"loss": 0.1317,
"step": 605
},
{
"epoch": 1.5303030303030303,
"grad_norm": 2.9474546909332275,
"learning_rate": 9.797979797979798e-06,
"loss": 0.1486,
"step": 606
},
{
"epoch": 1.5328282828282829,
"grad_norm": 1.209354043006897,
"learning_rate": 9.781144781144782e-06,
"loss": 0.1019,
"step": 607
},
{
"epoch": 1.5353535353535355,
"grad_norm": 2.3573384284973145,
"learning_rate": 9.764309764309765e-06,
"loss": 0.0792,
"step": 608
},
{
"epoch": 1.5378787878787878,
"grad_norm": 2.1612727642059326,
"learning_rate": 9.747474747474748e-06,
"loss": 0.1402,
"step": 609
},
{
"epoch": 1.5404040404040404,
"grad_norm": 1.7895665168762207,
"learning_rate": 9.730639730639732e-06,
"loss": 0.118,
"step": 610
},
{
"epoch": 1.5429292929292928,
"grad_norm": 1.4610426425933838,
"learning_rate": 9.713804713804715e-06,
"loss": 0.1428,
"step": 611
},
{
"epoch": 1.5454545454545454,
"grad_norm": 2.2483487129211426,
"learning_rate": 9.696969696969698e-06,
"loss": 0.1732,
"step": 612
},
{
"epoch": 1.547979797979798,
"grad_norm": 5.811710834503174,
"learning_rate": 9.68013468013468e-06,
"loss": 0.1112,
"step": 613
},
{
"epoch": 1.5505050505050506,
"grad_norm": 6.1415815353393555,
"learning_rate": 9.663299663299665e-06,
"loss": 0.138,
"step": 614
},
{
"epoch": 1.553030303030303,
"grad_norm": 1.204952597618103,
"learning_rate": 9.646464646464647e-06,
"loss": 0.0998,
"step": 615
},
{
"epoch": 1.5555555555555556,
"grad_norm": 2.5513834953308105,
"learning_rate": 9.62962962962963e-06,
"loss": 0.0971,
"step": 616
},
{
"epoch": 1.558080808080808,
"grad_norm": 4.2005181312561035,
"learning_rate": 9.612794612794614e-06,
"loss": 0.1096,
"step": 617
},
{
"epoch": 1.5606060606060606,
"grad_norm": 2.5134921073913574,
"learning_rate": 9.595959595959597e-06,
"loss": 0.1817,
"step": 618
},
{
"epoch": 1.5631313131313131,
"grad_norm": 3.6018764972686768,
"learning_rate": 9.57912457912458e-06,
"loss": 0.0849,
"step": 619
},
{
"epoch": 1.5656565656565657,
"grad_norm": 1.6318095922470093,
"learning_rate": 9.562289562289562e-06,
"loss": 0.0661,
"step": 620
},
{
"epoch": 1.5681818181818183,
"grad_norm": 3.3563179969787598,
"learning_rate": 9.545454545454547e-06,
"loss": 0.1141,
"step": 621
},
{
"epoch": 1.5707070707070707,
"grad_norm": 2.074086904525757,
"learning_rate": 9.52861952861953e-06,
"loss": 0.1207,
"step": 622
},
{
"epoch": 1.573232323232323,
"grad_norm": 2.5464348793029785,
"learning_rate": 9.511784511784512e-06,
"loss": 0.1951,
"step": 623
},
{
"epoch": 1.5757575757575757,
"grad_norm": 5.284518718719482,
"learning_rate": 9.494949494949497e-06,
"loss": 0.1868,
"step": 624
},
{
"epoch": 1.5782828282828283,
"grad_norm": 2.5765862464904785,
"learning_rate": 9.47811447811448e-06,
"loss": 0.187,
"step": 625
},
{
"epoch": 1.5808080808080809,
"grad_norm": 4.491573333740234,
"learning_rate": 9.461279461279462e-06,
"loss": 0.1033,
"step": 626
},
{
"epoch": 1.5833333333333335,
"grad_norm": 4.794037818908691,
"learning_rate": 9.444444444444445e-06,
"loss": 0.1638,
"step": 627
},
{
"epoch": 1.5858585858585859,
"grad_norm": 1.3392722606658936,
"learning_rate": 9.427609427609429e-06,
"loss": 0.0673,
"step": 628
},
{
"epoch": 1.5883838383838382,
"grad_norm": 2.59481143951416,
"learning_rate": 9.410774410774412e-06,
"loss": 0.1506,
"step": 629
},
{
"epoch": 1.5909090909090908,
"grad_norm": 1.926398754119873,
"learning_rate": 9.393939393939396e-06,
"loss": 0.0817,
"step": 630
},
{
"epoch": 1.5934343434343434,
"grad_norm": 3.796034812927246,
"learning_rate": 9.377104377104379e-06,
"loss": 0.1526,
"step": 631
},
{
"epoch": 1.595959595959596,
"grad_norm": 3.06642484664917,
"learning_rate": 9.360269360269361e-06,
"loss": 0.158,
"step": 632
},
{
"epoch": 1.5984848484848486,
"grad_norm": 2.3332364559173584,
"learning_rate": 9.343434343434344e-06,
"loss": 0.1412,
"step": 633
},
{
"epoch": 1.601010101010101,
"grad_norm": 13.372260093688965,
"learning_rate": 9.326599326599327e-06,
"loss": 0.0737,
"step": 634
},
{
"epoch": 1.6035353535353534,
"grad_norm": 2.744684934616089,
"learning_rate": 9.309764309764311e-06,
"loss": 0.1845,
"step": 635
},
{
"epoch": 1.606060606060606,
"grad_norm": 4.262907981872559,
"learning_rate": 9.292929292929294e-06,
"loss": 0.2397,
"step": 636
},
{
"epoch": 1.6085858585858586,
"grad_norm": 2.6066222190856934,
"learning_rate": 9.276094276094278e-06,
"loss": 0.0889,
"step": 637
},
{
"epoch": 1.6111111111111112,
"grad_norm": 5.02886962890625,
"learning_rate": 9.25925925925926e-06,
"loss": 0.3094,
"step": 638
},
{
"epoch": 1.6136363636363638,
"grad_norm": 1.2655010223388672,
"learning_rate": 9.242424242424244e-06,
"loss": 0.1043,
"step": 639
},
{
"epoch": 1.6161616161616161,
"grad_norm": 2.1592676639556885,
"learning_rate": 9.225589225589226e-06,
"loss": 0.1541,
"step": 640
},
{
"epoch": 1.6186868686868687,
"grad_norm": 1.4674041271209717,
"learning_rate": 9.208754208754209e-06,
"loss": 0.0803,
"step": 641
},
{
"epoch": 1.621212121212121,
"grad_norm": 1.3324946165084839,
"learning_rate": 9.191919191919193e-06,
"loss": 0.1697,
"step": 642
},
{
"epoch": 1.6237373737373737,
"grad_norm": 4.259162902832031,
"learning_rate": 9.175084175084176e-06,
"loss": 0.1512,
"step": 643
},
{
"epoch": 1.6262626262626263,
"grad_norm": 1.390676498413086,
"learning_rate": 9.15824915824916e-06,
"loss": 0.0868,
"step": 644
},
{
"epoch": 1.628787878787879,
"grad_norm": 2.026618242263794,
"learning_rate": 9.141414141414143e-06,
"loss": 0.0679,
"step": 645
},
{
"epoch": 1.6313131313131313,
"grad_norm": 3.238002061843872,
"learning_rate": 9.124579124579126e-06,
"loss": 0.1706,
"step": 646
},
{
"epoch": 1.6338383838383839,
"grad_norm": 1.8931351900100708,
"learning_rate": 9.107744107744108e-06,
"loss": 0.0645,
"step": 647
},
{
"epoch": 1.6363636363636362,
"grad_norm": 1.5486174821853638,
"learning_rate": 9.090909090909091e-06,
"loss": 0.139,
"step": 648
},
{
"epoch": 1.6388888888888888,
"grad_norm": 1.988709807395935,
"learning_rate": 9.074074074074075e-06,
"loss": 0.2034,
"step": 649
},
{
"epoch": 1.6414141414141414,
"grad_norm": 2.529951572418213,
"learning_rate": 9.057239057239058e-06,
"loss": 0.0846,
"step": 650
},
{
"epoch": 1.643939393939394,
"grad_norm": 6.469368934631348,
"learning_rate": 9.040404040404042e-06,
"loss": 0.1614,
"step": 651
},
{
"epoch": 1.6464646464646466,
"grad_norm": 1.5296707153320312,
"learning_rate": 9.023569023569025e-06,
"loss": 0.074,
"step": 652
},
{
"epoch": 1.648989898989899,
"grad_norm": 3.4863650798797607,
"learning_rate": 9.006734006734008e-06,
"loss": 0.1207,
"step": 653
},
{
"epoch": 1.6515151515151514,
"grad_norm": 4.34932804107666,
"learning_rate": 8.98989898989899e-06,
"loss": 0.1209,
"step": 654
},
{
"epoch": 1.654040404040404,
"grad_norm": 2.05281400680542,
"learning_rate": 8.973063973063973e-06,
"loss": 0.085,
"step": 655
},
{
"epoch": 1.6565656565656566,
"grad_norm": 5.7974677085876465,
"learning_rate": 8.956228956228958e-06,
"loss": 0.1432,
"step": 656
},
{
"epoch": 1.6590909090909092,
"grad_norm": 13.796086311340332,
"learning_rate": 8.93939393939394e-06,
"loss": 0.1743,
"step": 657
},
{
"epoch": 1.6616161616161618,
"grad_norm": 1.2835731506347656,
"learning_rate": 8.922558922558923e-06,
"loss": 0.1032,
"step": 658
},
{
"epoch": 1.6641414141414141,
"grad_norm": 1.330572247505188,
"learning_rate": 8.905723905723906e-06,
"loss": 0.1194,
"step": 659
},
{
"epoch": 1.6666666666666665,
"grad_norm": 2.2639822959899902,
"learning_rate": 8.888888888888888e-06,
"loss": 0.173,
"step": 660
},
{
"epoch": 1.6691919191919191,
"grad_norm": 2.2905423641204834,
"learning_rate": 8.872053872053873e-06,
"loss": 0.1052,
"step": 661
},
{
"epoch": 1.6717171717171717,
"grad_norm": 6.86669397354126,
"learning_rate": 8.855218855218855e-06,
"loss": 0.1658,
"step": 662
},
{
"epoch": 1.6742424242424243,
"grad_norm": 1.9337157011032104,
"learning_rate": 8.83838383838384e-06,
"loss": 0.139,
"step": 663
},
{
"epoch": 1.676767676767677,
"grad_norm": 1.348889708518982,
"learning_rate": 8.821548821548822e-06,
"loss": 0.1243,
"step": 664
},
{
"epoch": 1.6792929292929293,
"grad_norm": 1.4817837476730347,
"learning_rate": 8.804713804713805e-06,
"loss": 0.0633,
"step": 665
},
{
"epoch": 1.6818181818181817,
"grad_norm": 3.970458507537842,
"learning_rate": 8.787878787878788e-06,
"loss": 0.1427,
"step": 666
},
{
"epoch": 1.6843434343434343,
"grad_norm": 6.352334976196289,
"learning_rate": 8.77104377104377e-06,
"loss": 0.1437,
"step": 667
},
{
"epoch": 1.6868686868686869,
"grad_norm": 0.6994425654411316,
"learning_rate": 8.754208754208755e-06,
"loss": 0.0398,
"step": 668
},
{
"epoch": 1.6893939393939394,
"grad_norm": 4.77330207824707,
"learning_rate": 8.737373737373738e-06,
"loss": 0.1319,
"step": 669
},
{
"epoch": 1.691919191919192,
"grad_norm": 3.855506420135498,
"learning_rate": 8.720538720538722e-06,
"loss": 0.1467,
"step": 670
},
{
"epoch": 1.6944444444444444,
"grad_norm": 4.957710266113281,
"learning_rate": 8.703703703703705e-06,
"loss": 0.1414,
"step": 671
},
{
"epoch": 1.696969696969697,
"grad_norm": 2.640568971633911,
"learning_rate": 8.686868686868687e-06,
"loss": 0.2187,
"step": 672
},
{
"epoch": 1.6994949494949494,
"grad_norm": 3.6980481147766113,
"learning_rate": 8.67003367003367e-06,
"loss": 0.1197,
"step": 673
},
{
"epoch": 1.702020202020202,
"grad_norm": 3.419555187225342,
"learning_rate": 8.653198653198653e-06,
"loss": 0.164,
"step": 674
},
{
"epoch": 1.7045454545454546,
"grad_norm": 3.6955320835113525,
"learning_rate": 8.636363636363637e-06,
"loss": 0.1821,
"step": 675
},
{
"epoch": 1.7070707070707072,
"grad_norm": 1.2104640007019043,
"learning_rate": 8.61952861952862e-06,
"loss": 0.0747,
"step": 676
},
{
"epoch": 1.7095959595959596,
"grad_norm": 3.7086238861083984,
"learning_rate": 8.602693602693604e-06,
"loss": 0.1402,
"step": 677
},
{
"epoch": 1.7121212121212122,
"grad_norm": 1.6543469429016113,
"learning_rate": 8.585858585858587e-06,
"loss": 0.0869,
"step": 678
},
{
"epoch": 1.7146464646464645,
"grad_norm": 4.50585412979126,
"learning_rate": 8.56902356902357e-06,
"loss": 0.0926,
"step": 679
},
{
"epoch": 1.7171717171717171,
"grad_norm": 2.2351365089416504,
"learning_rate": 8.552188552188552e-06,
"loss": 0.0886,
"step": 680
},
{
"epoch": 1.7196969696969697,
"grad_norm": 1.8379594087600708,
"learning_rate": 8.535353535353535e-06,
"loss": 0.0671,
"step": 681
},
{
"epoch": 1.7222222222222223,
"grad_norm": 2.2375223636627197,
"learning_rate": 8.518518518518519e-06,
"loss": 0.1455,
"step": 682
},
{
"epoch": 1.7247474747474747,
"grad_norm": 1.758262038230896,
"learning_rate": 8.501683501683502e-06,
"loss": 0.067,
"step": 683
},
{
"epoch": 1.7272727272727273,
"grad_norm": 1.4083460569381714,
"learning_rate": 8.484848484848486e-06,
"loss": 0.0492,
"step": 684
},
{
"epoch": 1.7297979797979797,
"grad_norm": 2.864366292953491,
"learning_rate": 8.468013468013469e-06,
"loss": 0.1483,
"step": 685
},
{
"epoch": 1.7323232323232323,
"grad_norm": 1.695508360862732,
"learning_rate": 8.451178451178452e-06,
"loss": 0.0559,
"step": 686
},
{
"epoch": 1.7348484848484849,
"grad_norm": 5.666776180267334,
"learning_rate": 8.434343434343434e-06,
"loss": 0.1655,
"step": 687
},
{
"epoch": 1.7373737373737375,
"grad_norm": 4.942101001739502,
"learning_rate": 8.417508417508419e-06,
"loss": 0.0525,
"step": 688
},
{
"epoch": 1.73989898989899,
"grad_norm": 2.151745557785034,
"learning_rate": 8.400673400673401e-06,
"loss": 0.137,
"step": 689
},
{
"epoch": 1.7424242424242424,
"grad_norm": 2.4058070182800293,
"learning_rate": 8.383838383838384e-06,
"loss": 0.0805,
"step": 690
},
{
"epoch": 1.7449494949494948,
"grad_norm": 4.35892915725708,
"learning_rate": 8.367003367003368e-06,
"loss": 0.0764,
"step": 691
},
{
"epoch": 1.7474747474747474,
"grad_norm": 1.3333408832550049,
"learning_rate": 8.350168350168351e-06,
"loss": 0.0576,
"step": 692
},
{
"epoch": 1.75,
"grad_norm": 4.402344703674316,
"learning_rate": 8.333333333333334e-06,
"loss": 0.1821,
"step": 693
},
{
"epoch": 1.7525252525252526,
"grad_norm": 4.358558654785156,
"learning_rate": 8.316498316498316e-06,
"loss": 0.1767,
"step": 694
},
{
"epoch": 1.7550505050505052,
"grad_norm": 2.602311372756958,
"learning_rate": 8.2996632996633e-06,
"loss": 0.1474,
"step": 695
},
{
"epoch": 1.7575757575757576,
"grad_norm": 3.5266802310943604,
"learning_rate": 8.282828282828283e-06,
"loss": 0.1917,
"step": 696
},
{
"epoch": 1.76010101010101,
"grad_norm": 5.978867053985596,
"learning_rate": 8.265993265993266e-06,
"loss": 0.1884,
"step": 697
},
{
"epoch": 1.7626262626262625,
"grad_norm": 2.8455355167388916,
"learning_rate": 8.24915824915825e-06,
"loss": 0.1302,
"step": 698
},
{
"epoch": 1.7651515151515151,
"grad_norm": 4.014955520629883,
"learning_rate": 8.232323232323233e-06,
"loss": 0.1731,
"step": 699
},
{
"epoch": 1.7676767676767677,
"grad_norm": 4.700746536254883,
"learning_rate": 8.215488215488216e-06,
"loss": 0.1765,
"step": 700
},
{
"epoch": 1.7702020202020203,
"grad_norm": 3.462686061859131,
"learning_rate": 8.198653198653199e-06,
"loss": 0.0926,
"step": 701
},
{
"epoch": 1.7727272727272727,
"grad_norm": 1.5547245740890503,
"learning_rate": 8.181818181818183e-06,
"loss": 0.0325,
"step": 702
},
{
"epoch": 1.7752525252525253,
"grad_norm": 2.274096727371216,
"learning_rate": 8.164983164983166e-06,
"loss": 0.0642,
"step": 703
},
{
"epoch": 1.7777777777777777,
"grad_norm": 2.7937772274017334,
"learning_rate": 8.148148148148148e-06,
"loss": 0.1084,
"step": 704
},
{
"epoch": 1.7803030303030303,
"grad_norm": 1.720742106437683,
"learning_rate": 8.131313131313133e-06,
"loss": 0.101,
"step": 705
},
{
"epoch": 1.7828282828282829,
"grad_norm": 4.517067909240723,
"learning_rate": 8.114478114478115e-06,
"loss": 0.1059,
"step": 706
},
{
"epoch": 1.7853535353535355,
"grad_norm": 2.7258083820343018,
"learning_rate": 8.097643097643098e-06,
"loss": 0.1329,
"step": 707
},
{
"epoch": 1.7878787878787878,
"grad_norm": 2.474179983139038,
"learning_rate": 8.08080808080808e-06,
"loss": 0.1007,
"step": 708
},
{
"epoch": 1.7904040404040404,
"grad_norm": 2.3355281352996826,
"learning_rate": 8.063973063973065e-06,
"loss": 0.1863,
"step": 709
},
{
"epoch": 1.7929292929292928,
"grad_norm": 3.959667444229126,
"learning_rate": 8.047138047138048e-06,
"loss": 0.0882,
"step": 710
},
{
"epoch": 1.7954545454545454,
"grad_norm": 5.953159809112549,
"learning_rate": 8.03030303030303e-06,
"loss": 0.1024,
"step": 711
},
{
"epoch": 1.797979797979798,
"grad_norm": 3.069732427597046,
"learning_rate": 8.013468013468015e-06,
"loss": 0.084,
"step": 712
},
{
"epoch": 1.8005050505050506,
"grad_norm": 3.06427001953125,
"learning_rate": 7.996632996632998e-06,
"loss": 0.2176,
"step": 713
},
{
"epoch": 1.803030303030303,
"grad_norm": 5.320972442626953,
"learning_rate": 7.97979797979798e-06,
"loss": 0.1877,
"step": 714
},
{
"epoch": 1.8055555555555556,
"grad_norm": 3.8155035972595215,
"learning_rate": 7.962962962962963e-06,
"loss": 0.14,
"step": 715
},
{
"epoch": 1.808080808080808,
"grad_norm": 2.791696310043335,
"learning_rate": 7.946127946127947e-06,
"loss": 0.0694,
"step": 716
},
{
"epoch": 1.8106060606060606,
"grad_norm": 1.7592320442199707,
"learning_rate": 7.92929292929293e-06,
"loss": 0.0426,
"step": 717
},
{
"epoch": 1.8131313131313131,
"grad_norm": 8.306157112121582,
"learning_rate": 7.912457912457913e-06,
"loss": 0.1455,
"step": 718
},
{
"epoch": 1.8156565656565657,
"grad_norm": 3.3673255443573,
"learning_rate": 7.895622895622897e-06,
"loss": 0.1412,
"step": 719
},
{
"epoch": 1.8181818181818183,
"grad_norm": 3.755908966064453,
"learning_rate": 7.87878787878788e-06,
"loss": 0.1096,
"step": 720
},
{
"epoch": 1.8207070707070707,
"grad_norm": 1.6641695499420166,
"learning_rate": 7.861952861952862e-06,
"loss": 0.1231,
"step": 721
},
{
"epoch": 1.823232323232323,
"grad_norm": 3.577352285385132,
"learning_rate": 7.845117845117845e-06,
"loss": 0.07,
"step": 722
},
{
"epoch": 1.8257575757575757,
"grad_norm": 3.3195016384124756,
"learning_rate": 7.82828282828283e-06,
"loss": 0.2131,
"step": 723
},
{
"epoch": 1.8282828282828283,
"grad_norm": 2.113675594329834,
"learning_rate": 7.811447811447812e-06,
"loss": 0.077,
"step": 724
},
{
"epoch": 1.8308080808080809,
"grad_norm": 2.248725414276123,
"learning_rate": 7.794612794612795e-06,
"loss": 0.1106,
"step": 725
},
{
"epoch": 1.8333333333333335,
"grad_norm": 3.8289642333984375,
"learning_rate": 7.77777777777778e-06,
"loss": 0.0919,
"step": 726
},
{
"epoch": 1.8358585858585859,
"grad_norm": 2.4651291370391846,
"learning_rate": 7.760942760942762e-06,
"loss": 0.0724,
"step": 727
},
{
"epoch": 1.8383838383838382,
"grad_norm": 4.7950358390808105,
"learning_rate": 7.744107744107745e-06,
"loss": 0.1148,
"step": 728
},
{
"epoch": 1.8409090909090908,
"grad_norm": 8.350399017333984,
"learning_rate": 7.727272727272727e-06,
"loss": 0.1526,
"step": 729
},
{
"epoch": 1.8434343434343434,
"grad_norm": 2.8314502239227295,
"learning_rate": 7.710437710437712e-06,
"loss": 0.1417,
"step": 730
},
{
"epoch": 1.845959595959596,
"grad_norm": 3.023043155670166,
"learning_rate": 7.693602693602694e-06,
"loss": 0.1971,
"step": 731
},
{
"epoch": 1.8484848484848486,
"grad_norm": 1.6119197607040405,
"learning_rate": 7.676767676767677e-06,
"loss": 0.0754,
"step": 732
},
{
"epoch": 1.851010101010101,
"grad_norm": 5.730337142944336,
"learning_rate": 7.659932659932661e-06,
"loss": 0.0786,
"step": 733
},
{
"epoch": 1.8535353535353534,
"grad_norm": 0.6242827773094177,
"learning_rate": 7.643097643097644e-06,
"loss": 0.0237,
"step": 734
},
{
"epoch": 1.856060606060606,
"grad_norm": 3.5328094959259033,
"learning_rate": 7.6262626262626275e-06,
"loss": 0.1308,
"step": 735
},
{
"epoch": 1.8585858585858586,
"grad_norm": 2.5661208629608154,
"learning_rate": 7.60942760942761e-06,
"loss": 0.1202,
"step": 736
},
{
"epoch": 1.8611111111111112,
"grad_norm": 1.5449377298355103,
"learning_rate": 7.592592592592594e-06,
"loss": 0.0886,
"step": 737
},
{
"epoch": 1.8636363636363638,
"grad_norm": 4.09519100189209,
"learning_rate": 7.5757575757575764e-06,
"loss": 0.1398,
"step": 738
},
{
"epoch": 1.8661616161616161,
"grad_norm": 3.5463318824768066,
"learning_rate": 7.558922558922559e-06,
"loss": 0.1023,
"step": 739
},
{
"epoch": 1.8686868686868687,
"grad_norm": 2.5558698177337646,
"learning_rate": 7.542087542087543e-06,
"loss": 0.1335,
"step": 740
},
{
"epoch": 1.871212121212121,
"grad_norm": 1.5937213897705078,
"learning_rate": 7.525252525252525e-06,
"loss": 0.0928,
"step": 741
},
{
"epoch": 1.8737373737373737,
"grad_norm": 2.4672536849975586,
"learning_rate": 7.508417508417509e-06,
"loss": 0.2052,
"step": 742
},
{
"epoch": 1.8762626262626263,
"grad_norm": 1.365451693534851,
"learning_rate": 7.491582491582492e-06,
"loss": 0.1414,
"step": 743
},
{
"epoch": 1.878787878787879,
"grad_norm": 0.8678475618362427,
"learning_rate": 7.474747474747476e-06,
"loss": 0.0786,
"step": 744
},
{
"epoch": 1.8813131313131313,
"grad_norm": 3.8532655239105225,
"learning_rate": 7.457912457912459e-06,
"loss": 0.2117,
"step": 745
},
{
"epoch": 1.8838383838383839,
"grad_norm": 5.75984525680542,
"learning_rate": 7.441077441077442e-06,
"loss": 0.1238,
"step": 746
},
{
"epoch": 1.8863636363636362,
"grad_norm": 1.1473771333694458,
"learning_rate": 7.424242424242425e-06,
"loss": 0.0895,
"step": 747
},
{
"epoch": 1.8888888888888888,
"grad_norm": 1.526085376739502,
"learning_rate": 7.4074074074074075e-06,
"loss": 0.1088,
"step": 748
},
{
"epoch": 1.8914141414141414,
"grad_norm": 4.124934673309326,
"learning_rate": 7.390572390572391e-06,
"loss": 0.0826,
"step": 749
},
{
"epoch": 1.893939393939394,
"grad_norm": 6.274197101593018,
"learning_rate": 7.373737373737374e-06,
"loss": 0.1513,
"step": 750
},
{
"epoch": 1.8964646464646466,
"grad_norm": 1.4224315881729126,
"learning_rate": 7.356902356902358e-06,
"loss": 0.1091,
"step": 751
},
{
"epoch": 1.898989898989899,
"grad_norm": 4.506265640258789,
"learning_rate": 7.340067340067341e-06,
"loss": 0.1017,
"step": 752
},
{
"epoch": 1.9015151515151514,
"grad_norm": 1.0609605312347412,
"learning_rate": 7.323232323232324e-06,
"loss": 0.0597,
"step": 753
},
{
"epoch": 1.904040404040404,
"grad_norm": 3.9881186485290527,
"learning_rate": 7.306397306397307e-06,
"loss": 0.1244,
"step": 754
},
{
"epoch": 1.9065656565656566,
"grad_norm": 1.8625434637069702,
"learning_rate": 7.28956228956229e-06,
"loss": 0.147,
"step": 755
},
{
"epoch": 1.9090909090909092,
"grad_norm": 8.011527061462402,
"learning_rate": 7.272727272727273e-06,
"loss": 0.0823,
"step": 756
},
{
"epoch": 1.9116161616161618,
"grad_norm": 2.0574049949645996,
"learning_rate": 7.255892255892256e-06,
"loss": 0.0667,
"step": 757
},
{
"epoch": 1.9141414141414141,
"grad_norm": 1.5154629945755005,
"learning_rate": 7.23905723905724e-06,
"loss": 0.1717,
"step": 758
},
{
"epoch": 1.9166666666666665,
"grad_norm": 2.105567455291748,
"learning_rate": 7.222222222222223e-06,
"loss": 0.1676,
"step": 759
},
{
"epoch": 1.9191919191919191,
"grad_norm": 1.6874696016311646,
"learning_rate": 7.2053872053872064e-06,
"loss": 0.1089,
"step": 760
},
{
"epoch": 1.9217171717171717,
"grad_norm": 2.980811357498169,
"learning_rate": 7.188552188552189e-06,
"loss": 0.1806,
"step": 761
},
{
"epoch": 1.9242424242424243,
"grad_norm": 2.0981791019439697,
"learning_rate": 7.171717171717172e-06,
"loss": 0.0859,
"step": 762
},
{
"epoch": 1.926767676767677,
"grad_norm": 1.835482120513916,
"learning_rate": 7.154882154882155e-06,
"loss": 0.1364,
"step": 763
},
{
"epoch": 1.9292929292929293,
"grad_norm": 4.000125885009766,
"learning_rate": 7.138047138047138e-06,
"loss": 0.1354,
"step": 764
},
{
"epoch": 1.9318181818181817,
"grad_norm": 4.924983978271484,
"learning_rate": 7.121212121212122e-06,
"loss": 0.1154,
"step": 765
},
{
"epoch": 1.9343434343434343,
"grad_norm": 1.5840011835098267,
"learning_rate": 7.104377104377105e-06,
"loss": 0.1016,
"step": 766
},
{
"epoch": 1.9368686868686869,
"grad_norm": 1.5436311960220337,
"learning_rate": 7.087542087542089e-06,
"loss": 0.1168,
"step": 767
},
{
"epoch": 1.9393939393939394,
"grad_norm": 2.4922754764556885,
"learning_rate": 7.070707070707071e-06,
"loss": 0.1187,
"step": 768
},
{
"epoch": 1.941919191919192,
"grad_norm": 3.206899881362915,
"learning_rate": 7.053872053872054e-06,
"loss": 0.1184,
"step": 769
},
{
"epoch": 1.9444444444444444,
"grad_norm": 4.3798828125,
"learning_rate": 7.0370370370370375e-06,
"loss": 0.1997,
"step": 770
},
{
"epoch": 1.946969696969697,
"grad_norm": 1.3223721981048584,
"learning_rate": 7.02020202020202e-06,
"loss": 0.073,
"step": 771
},
{
"epoch": 1.9494949494949494,
"grad_norm": 2.0767436027526855,
"learning_rate": 7.0033670033670045e-06,
"loss": 0.1251,
"step": 772
},
{
"epoch": 1.952020202020202,
"grad_norm": 1.8936235904693604,
"learning_rate": 6.986531986531987e-06,
"loss": 0.0956,
"step": 773
},
{
"epoch": 1.9545454545454546,
"grad_norm": 6.86482048034668,
"learning_rate": 6.969696969696971e-06,
"loss": 0.1269,
"step": 774
},
{
"epoch": 1.9570707070707072,
"grad_norm": 2.885071039199829,
"learning_rate": 6.9528619528619534e-06,
"loss": 0.0974,
"step": 775
},
{
"epoch": 1.9595959595959596,
"grad_norm": 4.58144474029541,
"learning_rate": 6.936026936026936e-06,
"loss": 0.3284,
"step": 776
},
{
"epoch": 1.9621212121212122,
"grad_norm": 4.064563274383545,
"learning_rate": 6.91919191919192e-06,
"loss": 0.1659,
"step": 777
},
{
"epoch": 1.9646464646464645,
"grad_norm": 1.5637133121490479,
"learning_rate": 6.902356902356902e-06,
"loss": 0.1369,
"step": 778
},
{
"epoch": 1.9671717171717171,
"grad_norm": 2.932281494140625,
"learning_rate": 6.885521885521887e-06,
"loss": 0.0865,
"step": 779
},
{
"epoch": 1.9696969696969697,
"grad_norm": 1.1261810064315796,
"learning_rate": 6.868686868686869e-06,
"loss": 0.1245,
"step": 780
},
{
"epoch": 1.9722222222222223,
"grad_norm": 3.991880178451538,
"learning_rate": 6.851851851851853e-06,
"loss": 0.188,
"step": 781
},
{
"epoch": 1.9747474747474747,
"grad_norm": 1.7972675561904907,
"learning_rate": 6.835016835016836e-06,
"loss": 0.1832,
"step": 782
},
{
"epoch": 1.9772727272727273,
"grad_norm": 2.0975348949432373,
"learning_rate": 6.818181818181818e-06,
"loss": 0.0416,
"step": 783
},
{
"epoch": 1.9797979797979797,
"grad_norm": 2.6938462257385254,
"learning_rate": 6.801346801346802e-06,
"loss": 0.1471,
"step": 784
},
{
"epoch": 1.9823232323232323,
"grad_norm": 2.680722951889038,
"learning_rate": 6.7845117845117845e-06,
"loss": 0.1255,
"step": 785
},
{
"epoch": 1.9848484848484849,
"grad_norm": 4.923444747924805,
"learning_rate": 6.767676767676769e-06,
"loss": 0.1087,
"step": 786
},
{
"epoch": 1.9873737373737375,
"grad_norm": 3.3977975845336914,
"learning_rate": 6.7508417508417515e-06,
"loss": 0.1198,
"step": 787
},
{
"epoch": 1.98989898989899,
"grad_norm": 2.9619626998901367,
"learning_rate": 6.734006734006735e-06,
"loss": 0.104,
"step": 788
},
{
"epoch": 1.9924242424242424,
"grad_norm": 1.3148123025894165,
"learning_rate": 6.717171717171718e-06,
"loss": 0.0854,
"step": 789
},
{
"epoch": 1.9949494949494948,
"grad_norm": 1.7584114074707031,
"learning_rate": 6.7003367003367004e-06,
"loss": 0.136,
"step": 790
},
{
"epoch": 1.9974747474747474,
"grad_norm": 8.245304107666016,
"learning_rate": 6.683501683501684e-06,
"loss": 0.1525,
"step": 791
},
{
"epoch": 2.0,
"grad_norm": 1.205091118812561,
"learning_rate": 6.666666666666667e-06,
"loss": 0.0617,
"step": 792
},
{
"epoch": 2.0,
"eval_accuracy": 0.759090909090909,
"eval_f1": 0.8949265317438315,
"eval_loss": 0.13888753950595856,
"eval_runtime": 43.47,
"eval_samples_per_second": 20.244,
"eval_steps_per_second": 0.437,
"step": 792
},
{
"epoch": 2.0025252525252526,
"grad_norm": 2.01649808883667,
"learning_rate": 6.649831649831651e-06,
"loss": 0.1073,
"step": 793
},
{
"epoch": 2.005050505050505,
"grad_norm": 6.579789161682129,
"learning_rate": 6.632996632996634e-06,
"loss": 0.1704,
"step": 794
},
{
"epoch": 2.007575757575758,
"grad_norm": 2.323598623275757,
"learning_rate": 6.616161616161617e-06,
"loss": 0.1983,
"step": 795
},
{
"epoch": 2.01010101010101,
"grad_norm": 2.126936674118042,
"learning_rate": 6.5993265993266e-06,
"loss": 0.1026,
"step": 796
},
{
"epoch": 2.0126262626262625,
"grad_norm": 1.035873293876648,
"learning_rate": 6.582491582491583e-06,
"loss": 0.0488,
"step": 797
},
{
"epoch": 2.015151515151515,
"grad_norm": 2.2837603092193604,
"learning_rate": 6.565656565656566e-06,
"loss": 0.1894,
"step": 798
},
{
"epoch": 2.0176767676767677,
"grad_norm": 7.866192817687988,
"learning_rate": 6.548821548821549e-06,
"loss": 0.2146,
"step": 799
},
{
"epoch": 2.0202020202020203,
"grad_norm": 4.450189590454102,
"learning_rate": 6.531986531986533e-06,
"loss": 0.0731,
"step": 800
},
{
"epoch": 2.022727272727273,
"grad_norm": 2.2905592918395996,
"learning_rate": 6.515151515151516e-06,
"loss": 0.0736,
"step": 801
},
{
"epoch": 2.025252525252525,
"grad_norm": 1.7175313234329224,
"learning_rate": 6.498316498316499e-06,
"loss": 0.1525,
"step": 802
},
{
"epoch": 2.0277777777777777,
"grad_norm": 3.22578763961792,
"learning_rate": 6.481481481481482e-06,
"loss": 0.1093,
"step": 803
},
{
"epoch": 2.0303030303030303,
"grad_norm": 1.8242607116699219,
"learning_rate": 6.464646464646466e-06,
"loss": 0.1138,
"step": 804
},
{
"epoch": 2.032828282828283,
"grad_norm": 2.7062501907348633,
"learning_rate": 6.447811447811448e-06,
"loss": 0.0932,
"step": 805
},
{
"epoch": 2.0353535353535355,
"grad_norm": 1.2171615362167358,
"learning_rate": 6.430976430976431e-06,
"loss": 0.0692,
"step": 806
},
{
"epoch": 2.037878787878788,
"grad_norm": 5.950473308563232,
"learning_rate": 6.4141414141414145e-06,
"loss": 0.264,
"step": 807
},
{
"epoch": 2.04040404040404,
"grad_norm": 4.191005706787109,
"learning_rate": 6.397306397306397e-06,
"loss": 0.0666,
"step": 808
},
{
"epoch": 2.042929292929293,
"grad_norm": 3.99367618560791,
"learning_rate": 6.3804713804713816e-06,
"loss": 0.1528,
"step": 809
},
{
"epoch": 2.0454545454545454,
"grad_norm": 0.7054336667060852,
"learning_rate": 6.363636363636364e-06,
"loss": 0.0477,
"step": 810
},
{
"epoch": 2.047979797979798,
"grad_norm": 3.71244478225708,
"learning_rate": 6.346801346801348e-06,
"loss": 0.1287,
"step": 811
},
{
"epoch": 2.0505050505050506,
"grad_norm": 3.171588897705078,
"learning_rate": 6.3299663299663304e-06,
"loss": 0.087,
"step": 812
},
{
"epoch": 2.053030303030303,
"grad_norm": 1.4060291051864624,
"learning_rate": 6.313131313131313e-06,
"loss": 0.1075,
"step": 813
},
{
"epoch": 2.0555555555555554,
"grad_norm": 2.073291540145874,
"learning_rate": 6.296296296296297e-06,
"loss": 0.0653,
"step": 814
},
{
"epoch": 2.058080808080808,
"grad_norm": 6.517178058624268,
"learning_rate": 6.279461279461279e-06,
"loss": 0.1234,
"step": 815
},
{
"epoch": 2.0606060606060606,
"grad_norm": 11.045914649963379,
"learning_rate": 6.262626262626264e-06,
"loss": 0.1273,
"step": 816
},
{
"epoch": 2.063131313131313,
"grad_norm": 1.7747228145599365,
"learning_rate": 6.245791245791246e-06,
"loss": 0.1618,
"step": 817
},
{
"epoch": 2.0656565656565657,
"grad_norm": 1.5213820934295654,
"learning_rate": 6.22895622895623e-06,
"loss": 0.106,
"step": 818
},
{
"epoch": 2.0681818181818183,
"grad_norm": 1.4155036211013794,
"learning_rate": 6.212121212121213e-06,
"loss": 0.0759,
"step": 819
},
{
"epoch": 2.0707070707070705,
"grad_norm": 1.0913715362548828,
"learning_rate": 6.195286195286195e-06,
"loss": 0.0908,
"step": 820
},
{
"epoch": 2.073232323232323,
"grad_norm": 4.059940814971924,
"learning_rate": 6.178451178451179e-06,
"loss": 0.1544,
"step": 821
},
{
"epoch": 2.0757575757575757,
"grad_norm": 1.2122453451156616,
"learning_rate": 6.1616161616161615e-06,
"loss": 0.0959,
"step": 822
},
{
"epoch": 2.0782828282828283,
"grad_norm": 2.069533109664917,
"learning_rate": 6.144781144781146e-06,
"loss": 0.0488,
"step": 823
},
{
"epoch": 2.080808080808081,
"grad_norm": 1.685937523841858,
"learning_rate": 6.1279461279461286e-06,
"loss": 0.1315,
"step": 824
},
{
"epoch": 2.0833333333333335,
"grad_norm": 3.1984479427337646,
"learning_rate": 6.111111111111112e-06,
"loss": 0.117,
"step": 825
},
{
"epoch": 2.0858585858585856,
"grad_norm": 3.422079086303711,
"learning_rate": 6.094276094276095e-06,
"loss": 0.1104,
"step": 826
},
{
"epoch": 2.0883838383838382,
"grad_norm": 1.3577680587768555,
"learning_rate": 6.0774410774410774e-06,
"loss": 0.0583,
"step": 827
},
{
"epoch": 2.090909090909091,
"grad_norm": 2.0477261543273926,
"learning_rate": 6.060606060606061e-06,
"loss": 0.2046,
"step": 828
},
{
"epoch": 2.0934343434343434,
"grad_norm": 2.3478550910949707,
"learning_rate": 6.043771043771044e-06,
"loss": 0.1482,
"step": 829
},
{
"epoch": 2.095959595959596,
"grad_norm": 1.0065677165985107,
"learning_rate": 6.026936026936028e-06,
"loss": 0.0322,
"step": 830
},
{
"epoch": 2.0984848484848486,
"grad_norm": 2.0075066089630127,
"learning_rate": 6.010101010101011e-06,
"loss": 0.1149,
"step": 831
},
{
"epoch": 2.101010101010101,
"grad_norm": 2.6007728576660156,
"learning_rate": 5.993265993265994e-06,
"loss": 0.1527,
"step": 832
},
{
"epoch": 2.1035353535353534,
"grad_norm": 2.199341058731079,
"learning_rate": 5.976430976430977e-06,
"loss": 0.0776,
"step": 833
},
{
"epoch": 2.106060606060606,
"grad_norm": 2.4440650939941406,
"learning_rate": 5.95959595959596e-06,
"loss": 0.0902,
"step": 834
},
{
"epoch": 2.1085858585858586,
"grad_norm": 1.7312313318252563,
"learning_rate": 5.942760942760943e-06,
"loss": 0.0723,
"step": 835
},
{
"epoch": 2.111111111111111,
"grad_norm": 2.232499122619629,
"learning_rate": 5.925925925925926e-06,
"loss": 0.1162,
"step": 836
},
{
"epoch": 2.1136363636363638,
"grad_norm": 2.4596776962280273,
"learning_rate": 5.90909090909091e-06,
"loss": 0.1808,
"step": 837
},
{
"epoch": 2.1161616161616164,
"grad_norm": 4.917704105377197,
"learning_rate": 5.892255892255893e-06,
"loss": 0.1169,
"step": 838
},
{
"epoch": 2.1186868686868685,
"grad_norm": 3.716489553451538,
"learning_rate": 5.875420875420876e-06,
"loss": 0.0809,
"step": 839
},
{
"epoch": 2.121212121212121,
"grad_norm": 4.413392066955566,
"learning_rate": 5.858585858585859e-06,
"loss": 0.1834,
"step": 840
},
{
"epoch": 2.1237373737373737,
"grad_norm": 1.872174859046936,
"learning_rate": 5.841750841750842e-06,
"loss": 0.1462,
"step": 841
},
{
"epoch": 2.1262626262626263,
"grad_norm": 3.7974910736083984,
"learning_rate": 5.824915824915825e-06,
"loss": 0.0939,
"step": 842
},
{
"epoch": 2.128787878787879,
"grad_norm": 1.4759098291397095,
"learning_rate": 5.808080808080808e-06,
"loss": 0.0713,
"step": 843
},
{
"epoch": 2.1313131313131315,
"grad_norm": 2.160318613052368,
"learning_rate": 5.791245791245792e-06,
"loss": 0.2184,
"step": 844
},
{
"epoch": 2.1338383838383836,
"grad_norm": 2.485347270965576,
"learning_rate": 5.774410774410775e-06,
"loss": 0.1732,
"step": 845
},
{
"epoch": 2.1363636363636362,
"grad_norm": 0.8993260264396667,
"learning_rate": 5.7575757575757586e-06,
"loss": 0.0909,
"step": 846
},
{
"epoch": 2.138888888888889,
"grad_norm": 1.436485767364502,
"learning_rate": 5.740740740740741e-06,
"loss": 0.0394,
"step": 847
},
{
"epoch": 2.1414141414141414,
"grad_norm": 0.9625018835067749,
"learning_rate": 5.723905723905724e-06,
"loss": 0.0382,
"step": 848
},
{
"epoch": 2.143939393939394,
"grad_norm": 1.4799765348434448,
"learning_rate": 5.7070707070707075e-06,
"loss": 0.094,
"step": 849
},
{
"epoch": 2.1464646464646466,
"grad_norm": 3.625958204269409,
"learning_rate": 5.69023569023569e-06,
"loss": 0.1394,
"step": 850
},
{
"epoch": 2.148989898989899,
"grad_norm": 1.3515892028808594,
"learning_rate": 5.6734006734006745e-06,
"loss": 0.1068,
"step": 851
},
{
"epoch": 2.1515151515151514,
"grad_norm": 1.9746239185333252,
"learning_rate": 5.656565656565657e-06,
"loss": 0.1404,
"step": 852
},
{
"epoch": 2.154040404040404,
"grad_norm": 3.5076723098754883,
"learning_rate": 5.639730639730641e-06,
"loss": 0.1236,
"step": 853
},
{
"epoch": 2.1565656565656566,
"grad_norm": 1.3625231981277466,
"learning_rate": 5.622895622895623e-06,
"loss": 0.0698,
"step": 854
},
{
"epoch": 2.159090909090909,
"grad_norm": 2.441847324371338,
"learning_rate": 5.606060606060606e-06,
"loss": 0.1029,
"step": 855
},
{
"epoch": 2.1616161616161618,
"grad_norm": 3.1259806156158447,
"learning_rate": 5.58922558922559e-06,
"loss": 0.105,
"step": 856
},
{
"epoch": 2.1641414141414144,
"grad_norm": 5.127650260925293,
"learning_rate": 5.572390572390572e-06,
"loss": 0.1202,
"step": 857
},
{
"epoch": 2.1666666666666665,
"grad_norm": 1.3531067371368408,
"learning_rate": 5.555555555555557e-06,
"loss": 0.0812,
"step": 858
},
{
"epoch": 2.169191919191919,
"grad_norm": 5.6110920906066895,
"learning_rate": 5.538720538720539e-06,
"loss": 0.0898,
"step": 859
},
{
"epoch": 2.1717171717171717,
"grad_norm": 2.4415769577026367,
"learning_rate": 5.521885521885523e-06,
"loss": 0.231,
"step": 860
},
{
"epoch": 2.1742424242424243,
"grad_norm": 3.1470277309417725,
"learning_rate": 5.5050505050505056e-06,
"loss": 0.0609,
"step": 861
},
{
"epoch": 2.176767676767677,
"grad_norm": 2.625209093093872,
"learning_rate": 5.488215488215489e-06,
"loss": 0.1126,
"step": 862
},
{
"epoch": 2.179292929292929,
"grad_norm": 9.551560401916504,
"learning_rate": 5.471380471380472e-06,
"loss": 0.0788,
"step": 863
},
{
"epoch": 2.1818181818181817,
"grad_norm": 2.088391065597534,
"learning_rate": 5.4545454545454545e-06,
"loss": 0.1863,
"step": 864
},
{
"epoch": 2.1843434343434343,
"grad_norm": 2.9452109336853027,
"learning_rate": 5.437710437710438e-06,
"loss": 0.1449,
"step": 865
},
{
"epoch": 2.186868686868687,
"grad_norm": 2.6503803730010986,
"learning_rate": 5.420875420875421e-06,
"loss": 0.1128,
"step": 866
},
{
"epoch": 2.1893939393939394,
"grad_norm": 6.2185587882995605,
"learning_rate": 5.404040404040405e-06,
"loss": 0.2125,
"step": 867
},
{
"epoch": 2.191919191919192,
"grad_norm": 1.5772247314453125,
"learning_rate": 5.387205387205388e-06,
"loss": 0.117,
"step": 868
},
{
"epoch": 2.1944444444444446,
"grad_norm": 4.648830413818359,
"learning_rate": 5.370370370370371e-06,
"loss": 0.1646,
"step": 869
},
{
"epoch": 2.196969696969697,
"grad_norm": 2.4655864238739014,
"learning_rate": 5.353535353535354e-06,
"loss": 0.0718,
"step": 870
},
{
"epoch": 2.1994949494949494,
"grad_norm": 1.3793933391571045,
"learning_rate": 5.336700336700337e-06,
"loss": 0.1087,
"step": 871
},
{
"epoch": 2.202020202020202,
"grad_norm": 2.5595717430114746,
"learning_rate": 5.31986531986532e-06,
"loss": 0.1177,
"step": 872
},
{
"epoch": 2.2045454545454546,
"grad_norm": 4.922736167907715,
"learning_rate": 5.303030303030303e-06,
"loss": 0.0976,
"step": 873
},
{
"epoch": 2.207070707070707,
"grad_norm": 2.5227010250091553,
"learning_rate": 5.286195286195287e-06,
"loss": 0.1744,
"step": 874
},
{
"epoch": 2.20959595959596,
"grad_norm": 1.9036935567855835,
"learning_rate": 5.26936026936027e-06,
"loss": 0.1184,
"step": 875
},
{
"epoch": 2.212121212121212,
"grad_norm": 1.5138955116271973,
"learning_rate": 5.252525252525253e-06,
"loss": 0.1052,
"step": 876
},
{
"epoch": 2.2146464646464645,
"grad_norm": 2.0152668952941895,
"learning_rate": 5.235690235690236e-06,
"loss": 0.0952,
"step": 877
},
{
"epoch": 2.217171717171717,
"grad_norm": 13.834627151489258,
"learning_rate": 5.218855218855219e-06,
"loss": 0.0788,
"step": 878
},
{
"epoch": 2.2196969696969697,
"grad_norm": 2.163512945175171,
"learning_rate": 5.202020202020202e-06,
"loss": 0.1584,
"step": 879
},
{
"epoch": 2.2222222222222223,
"grad_norm": 1.2292289733886719,
"learning_rate": 5.185185185185185e-06,
"loss": 0.1036,
"step": 880
},
{
"epoch": 2.224747474747475,
"grad_norm": 2.1541199684143066,
"learning_rate": 5.168350168350169e-06,
"loss": 0.0994,
"step": 881
},
{
"epoch": 2.227272727272727,
"grad_norm": 2.9435672760009766,
"learning_rate": 5.151515151515152e-06,
"loss": 0.1071,
"step": 882
},
{
"epoch": 2.2297979797979797,
"grad_norm": 4.930500507354736,
"learning_rate": 5.1346801346801356e-06,
"loss": 0.2478,
"step": 883
},
{
"epoch": 2.2323232323232323,
"grad_norm": 13.543425559997559,
"learning_rate": 5.117845117845118e-06,
"loss": 0.0954,
"step": 884
},
{
"epoch": 2.234848484848485,
"grad_norm": 1.8627355098724365,
"learning_rate": 5.101010101010101e-06,
"loss": 0.2159,
"step": 885
},
{
"epoch": 2.2373737373737375,
"grad_norm": 1.9947534799575806,
"learning_rate": 5.0841750841750845e-06,
"loss": 0.0787,
"step": 886
},
{
"epoch": 2.23989898989899,
"grad_norm": 5.217324733734131,
"learning_rate": 5.067340067340067e-06,
"loss": 0.1191,
"step": 887
},
{
"epoch": 2.242424242424242,
"grad_norm": 1.540475845336914,
"learning_rate": 5.0505050505050515e-06,
"loss": 0.0773,
"step": 888
},
{
"epoch": 2.244949494949495,
"grad_norm": 4.879143714904785,
"learning_rate": 5.033670033670034e-06,
"loss": 0.1236,
"step": 889
},
{
"epoch": 2.2474747474747474,
"grad_norm": 4.0901641845703125,
"learning_rate": 5.016835016835018e-06,
"loss": 0.0619,
"step": 890
},
{
"epoch": 2.25,
"grad_norm": 1.8532190322875977,
"learning_rate": 5e-06,
"loss": 0.0767,
"step": 891
},
{
"epoch": 2.2525252525252526,
"grad_norm": 3.4842894077301025,
"learning_rate": 4.983164983164984e-06,
"loss": 0.0984,
"step": 892
},
{
"epoch": 2.255050505050505,
"grad_norm": 1.4197821617126465,
"learning_rate": 4.966329966329967e-06,
"loss": 0.0441,
"step": 893
},
{
"epoch": 2.257575757575758,
"grad_norm": 1.3725179433822632,
"learning_rate": 4.94949494949495e-06,
"loss": 0.0636,
"step": 894
},
{
"epoch": 2.26010101010101,
"grad_norm": 3.0550286769866943,
"learning_rate": 4.932659932659933e-06,
"loss": 0.1222,
"step": 895
},
{
"epoch": 2.2626262626262625,
"grad_norm": 1.3511768579483032,
"learning_rate": 4.915824915824916e-06,
"loss": 0.102,
"step": 896
},
{
"epoch": 2.265151515151515,
"grad_norm": 2.8341774940490723,
"learning_rate": 4.898989898989899e-06,
"loss": 0.1176,
"step": 897
},
{
"epoch": 2.2676767676767677,
"grad_norm": 5.220274925231934,
"learning_rate": 4.8821548821548826e-06,
"loss": 0.1828,
"step": 898
},
{
"epoch": 2.2702020202020203,
"grad_norm": 2.0751826763153076,
"learning_rate": 4.865319865319866e-06,
"loss": 0.0472,
"step": 899
},
{
"epoch": 2.2727272727272725,
"grad_norm": 1.0210275650024414,
"learning_rate": 4.848484848484849e-06,
"loss": 0.1155,
"step": 900
},
{
"epoch": 2.275252525252525,
"grad_norm": 2.244605541229248,
"learning_rate": 4.831649831649832e-06,
"loss": 0.1298,
"step": 901
},
{
"epoch": 2.2777777777777777,
"grad_norm": 1.2191749811172485,
"learning_rate": 4.814814814814815e-06,
"loss": 0.0553,
"step": 902
},
{
"epoch": 2.2803030303030303,
"grad_norm": 2.009685516357422,
"learning_rate": 4.7979797979797985e-06,
"loss": 0.2061,
"step": 903
},
{
"epoch": 2.282828282828283,
"grad_norm": 2.537893056869507,
"learning_rate": 4.781144781144781e-06,
"loss": 0.1638,
"step": 904
},
{
"epoch": 2.2853535353535355,
"grad_norm": 1.8385186195373535,
"learning_rate": 4.764309764309765e-06,
"loss": 0.1457,
"step": 905
},
{
"epoch": 2.287878787878788,
"grad_norm": 3.0959956645965576,
"learning_rate": 4.747474747474748e-06,
"loss": 0.0624,
"step": 906
},
{
"epoch": 2.29040404040404,
"grad_norm": 1.0412582159042358,
"learning_rate": 4.730639730639731e-06,
"loss": 0.0605,
"step": 907
},
{
"epoch": 2.292929292929293,
"grad_norm": 1.1493240594863892,
"learning_rate": 4.7138047138047145e-06,
"loss": 0.0818,
"step": 908
},
{
"epoch": 2.2954545454545454,
"grad_norm": 1.573701024055481,
"learning_rate": 4.696969696969698e-06,
"loss": 0.1485,
"step": 909
},
{
"epoch": 2.297979797979798,
"grad_norm": 3.5485622882843018,
"learning_rate": 4.680134680134681e-06,
"loss": 0.0746,
"step": 910
},
{
"epoch": 2.3005050505050506,
"grad_norm": 2.589240550994873,
"learning_rate": 4.663299663299663e-06,
"loss": 0.0669,
"step": 911
},
{
"epoch": 2.303030303030303,
"grad_norm": 3.300288677215576,
"learning_rate": 4.646464646464647e-06,
"loss": 0.1589,
"step": 912
},
{
"epoch": 2.3055555555555554,
"grad_norm": 2.2439637184143066,
"learning_rate": 4.62962962962963e-06,
"loss": 0.0927,
"step": 913
},
{
"epoch": 2.308080808080808,
"grad_norm": 3.438167095184326,
"learning_rate": 4.612794612794613e-06,
"loss": 0.1405,
"step": 914
},
{
"epoch": 2.3106060606060606,
"grad_norm": 1.1554774045944214,
"learning_rate": 4.595959595959597e-06,
"loss": 0.1113,
"step": 915
},
{
"epoch": 2.313131313131313,
"grad_norm": 2.269124984741211,
"learning_rate": 4.57912457912458e-06,
"loss": 0.1054,
"step": 916
},
{
"epoch": 2.3156565656565657,
"grad_norm": 3.707484722137451,
"learning_rate": 4.562289562289563e-06,
"loss": 0.1573,
"step": 917
},
{
"epoch": 2.3181818181818183,
"grad_norm": 3.806281089782715,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.1247,
"step": 918
},
{
"epoch": 2.320707070707071,
"grad_norm": 5.063516616821289,
"learning_rate": 4.528619528619529e-06,
"loss": 0.077,
"step": 919
},
{
"epoch": 2.323232323232323,
"grad_norm": 1.84391450881958,
"learning_rate": 4.5117845117845126e-06,
"loss": 0.13,
"step": 920
},
{
"epoch": 2.3257575757575757,
"grad_norm": 2.5902676582336426,
"learning_rate": 4.494949494949495e-06,
"loss": 0.1043,
"step": 921
},
{
"epoch": 2.3282828282828283,
"grad_norm": 1.1772695779800415,
"learning_rate": 4.478114478114479e-06,
"loss": 0.0875,
"step": 922
},
{
"epoch": 2.330808080808081,
"grad_norm": 1.865903377532959,
"learning_rate": 4.4612794612794615e-06,
"loss": 0.1552,
"step": 923
},
{
"epoch": 2.3333333333333335,
"grad_norm": 1.9699102640151978,
"learning_rate": 4.444444444444444e-06,
"loss": 0.0433,
"step": 924
},
{
"epoch": 2.3358585858585856,
"grad_norm": 3.4536280632019043,
"learning_rate": 4.427609427609428e-06,
"loss": 0.1309,
"step": 925
},
{
"epoch": 2.3383838383838382,
"grad_norm": 9.139911651611328,
"learning_rate": 4.410774410774411e-06,
"loss": 0.1629,
"step": 926
},
{
"epoch": 2.340909090909091,
"grad_norm": 2.665511131286621,
"learning_rate": 4.393939393939394e-06,
"loss": 0.1347,
"step": 927
},
{
"epoch": 2.3434343434343434,
"grad_norm": 1.851479172706604,
"learning_rate": 4.377104377104377e-06,
"loss": 0.0453,
"step": 928
},
{
"epoch": 2.345959595959596,
"grad_norm": 4.813875675201416,
"learning_rate": 4.360269360269361e-06,
"loss": 0.1395,
"step": 929
},
{
"epoch": 2.3484848484848486,
"grad_norm": 1.4313777685165405,
"learning_rate": 4.343434343434344e-06,
"loss": 0.1065,
"step": 930
},
{
"epoch": 2.351010101010101,
"grad_norm": 3.5636346340179443,
"learning_rate": 4.326599326599326e-06,
"loss": 0.2765,
"step": 931
},
{
"epoch": 2.3535353535353534,
"grad_norm": 2.2551841735839844,
"learning_rate": 4.30976430976431e-06,
"loss": 0.0752,
"step": 932
},
{
"epoch": 2.356060606060606,
"grad_norm": 2.989997625350952,
"learning_rate": 4.292929292929293e-06,
"loss": 0.0626,
"step": 933
},
{
"epoch": 2.3585858585858586,
"grad_norm": 2.648948907852173,
"learning_rate": 4.276094276094276e-06,
"loss": 0.1131,
"step": 934
},
{
"epoch": 2.361111111111111,
"grad_norm": 4.4058685302734375,
"learning_rate": 4.2592592592592596e-06,
"loss": 0.1696,
"step": 935
},
{
"epoch": 2.3636363636363638,
"grad_norm": 2.665522575378418,
"learning_rate": 4.242424242424243e-06,
"loss": 0.1484,
"step": 936
},
{
"epoch": 2.3661616161616164,
"grad_norm": 0.8671731352806091,
"learning_rate": 4.225589225589226e-06,
"loss": 0.0346,
"step": 937
},
{
"epoch": 2.3686868686868685,
"grad_norm": 5.202394962310791,
"learning_rate": 4.208754208754209e-06,
"loss": 0.1108,
"step": 938
},
{
"epoch": 2.371212121212121,
"grad_norm": 1.2443658113479614,
"learning_rate": 4.191919191919192e-06,
"loss": 0.0727,
"step": 939
},
{
"epoch": 2.3737373737373737,
"grad_norm": 2.493161678314209,
"learning_rate": 4.1750841750841755e-06,
"loss": 0.1396,
"step": 940
},
{
"epoch": 2.3762626262626263,
"grad_norm": 1.5535367727279663,
"learning_rate": 4.158249158249158e-06,
"loss": 0.1128,
"step": 941
},
{
"epoch": 2.378787878787879,
"grad_norm": 1.4870634078979492,
"learning_rate": 4.141414141414142e-06,
"loss": 0.1251,
"step": 942
},
{
"epoch": 2.3813131313131315,
"grad_norm": 1.0928040742874146,
"learning_rate": 4.124579124579125e-06,
"loss": 0.1148,
"step": 943
},
{
"epoch": 2.3838383838383836,
"grad_norm": 1.3592982292175293,
"learning_rate": 4.107744107744108e-06,
"loss": 0.1567,
"step": 944
},
{
"epoch": 2.3863636363636362,
"grad_norm": 3.2275450229644775,
"learning_rate": 4.0909090909090915e-06,
"loss": 0.1898,
"step": 945
},
{
"epoch": 2.388888888888889,
"grad_norm": 5.524433135986328,
"learning_rate": 4.074074074074074e-06,
"loss": 0.1526,
"step": 946
},
{
"epoch": 2.3914141414141414,
"grad_norm": 2.3239119052886963,
"learning_rate": 4.057239057239058e-06,
"loss": 0.1879,
"step": 947
},
{
"epoch": 2.393939393939394,
"grad_norm": 2.8176567554473877,
"learning_rate": 4.04040404040404e-06,
"loss": 0.0453,
"step": 948
},
{
"epoch": 2.3964646464646466,
"grad_norm": 4.552126884460449,
"learning_rate": 4.023569023569024e-06,
"loss": 0.1098,
"step": 949
},
{
"epoch": 2.398989898989899,
"grad_norm": 3.1059579849243164,
"learning_rate": 4.0067340067340074e-06,
"loss": 0.1238,
"step": 950
},
{
"epoch": 2.4015151515151514,
"grad_norm": 2.0037975311279297,
"learning_rate": 3.98989898989899e-06,
"loss": 0.1101,
"step": 951
},
{
"epoch": 2.404040404040404,
"grad_norm": 1.432120442390442,
"learning_rate": 3.973063973063974e-06,
"loss": 0.1475,
"step": 952
},
{
"epoch": 2.4065656565656566,
"grad_norm": 4.496235370635986,
"learning_rate": 3.956228956228956e-06,
"loss": 0.1285,
"step": 953
},
{
"epoch": 2.409090909090909,
"grad_norm": 2.675267457962036,
"learning_rate": 3.93939393939394e-06,
"loss": 0.1076,
"step": 954
},
{
"epoch": 2.4116161616161618,
"grad_norm": 1.4617221355438232,
"learning_rate": 3.9225589225589225e-06,
"loss": 0.0742,
"step": 955
},
{
"epoch": 2.4141414141414144,
"grad_norm": 2.676470994949341,
"learning_rate": 3.905723905723906e-06,
"loss": 0.1042,
"step": 956
},
{
"epoch": 2.4166666666666665,
"grad_norm": 3.1182193756103516,
"learning_rate": 3.88888888888889e-06,
"loss": 0.0782,
"step": 957
},
{
"epoch": 2.419191919191919,
"grad_norm": 1.4750274419784546,
"learning_rate": 3.872053872053872e-06,
"loss": 0.0824,
"step": 958
},
{
"epoch": 2.4217171717171717,
"grad_norm": 5.715966701507568,
"learning_rate": 3.855218855218856e-06,
"loss": 0.1555,
"step": 959
},
{
"epoch": 2.4242424242424243,
"grad_norm": 1.0495116710662842,
"learning_rate": 3.8383838383838385e-06,
"loss": 0.1218,
"step": 960
},
{
"epoch": 2.426767676767677,
"grad_norm": 3.0049309730529785,
"learning_rate": 3.821548821548822e-06,
"loss": 0.1283,
"step": 961
},
{
"epoch": 2.429292929292929,
"grad_norm": 1.6869391202926636,
"learning_rate": 3.804713804713805e-06,
"loss": 0.0797,
"step": 962
},
{
"epoch": 2.4318181818181817,
"grad_norm": 2.2413532733917236,
"learning_rate": 3.7878787878787882e-06,
"loss": 0.1478,
"step": 963
},
{
"epoch": 2.4343434343434343,
"grad_norm": 2.301522731781006,
"learning_rate": 3.7710437710437713e-06,
"loss": 0.1209,
"step": 964
},
{
"epoch": 2.436868686868687,
"grad_norm": 3.226301431655884,
"learning_rate": 3.7542087542087544e-06,
"loss": 0.1697,
"step": 965
},
{
"epoch": 2.4393939393939394,
"grad_norm": 2.822960376739502,
"learning_rate": 3.737373737373738e-06,
"loss": 0.0748,
"step": 966
},
{
"epoch": 2.441919191919192,
"grad_norm": 8.013906478881836,
"learning_rate": 3.720538720538721e-06,
"loss": 0.1067,
"step": 967
},
{
"epoch": 2.4444444444444446,
"grad_norm": 1.2187291383743286,
"learning_rate": 3.7037037037037037e-06,
"loss": 0.1054,
"step": 968
},
{
"epoch": 2.446969696969697,
"grad_norm": 1.9397814273834229,
"learning_rate": 3.686868686868687e-06,
"loss": 0.0697,
"step": 969
},
{
"epoch": 2.4494949494949494,
"grad_norm": 2.722252130508423,
"learning_rate": 3.6700336700336704e-06,
"loss": 0.1208,
"step": 970
},
{
"epoch": 2.452020202020202,
"grad_norm": 1.2536653280258179,
"learning_rate": 3.6531986531986535e-06,
"loss": 0.0446,
"step": 971
},
{
"epoch": 2.4545454545454546,
"grad_norm": 2.2456796169281006,
"learning_rate": 3.6363636363636366e-06,
"loss": 0.1854,
"step": 972
},
{
"epoch": 2.457070707070707,
"grad_norm": 3.275261163711548,
"learning_rate": 3.61952861952862e-06,
"loss": 0.1852,
"step": 973
},
{
"epoch": 2.45959595959596,
"grad_norm": 1.8232449293136597,
"learning_rate": 3.6026936026936032e-06,
"loss": 0.1223,
"step": 974
},
{
"epoch": 2.462121212121212,
"grad_norm": 1.9537675380706787,
"learning_rate": 3.585858585858586e-06,
"loss": 0.0819,
"step": 975
},
{
"epoch": 2.4646464646464645,
"grad_norm": 2.161625862121582,
"learning_rate": 3.569023569023569e-06,
"loss": 0.2049,
"step": 976
},
{
"epoch": 2.467171717171717,
"grad_norm": 2.769174575805664,
"learning_rate": 3.5521885521885525e-06,
"loss": 0.0915,
"step": 977
},
{
"epoch": 2.4696969696969697,
"grad_norm": 3.9444172382354736,
"learning_rate": 3.5353535353535356e-06,
"loss": 0.114,
"step": 978
},
{
"epoch": 2.4722222222222223,
"grad_norm": 1.980569839477539,
"learning_rate": 3.5185185185185187e-06,
"loss": 0.0776,
"step": 979
},
{
"epoch": 2.474747474747475,
"grad_norm": 2.1277084350585938,
"learning_rate": 3.5016835016835023e-06,
"loss": 0.1238,
"step": 980
},
{
"epoch": 2.4772727272727275,
"grad_norm": 2.6043457984924316,
"learning_rate": 3.4848484848484854e-06,
"loss": 0.1507,
"step": 981
},
{
"epoch": 2.4797979797979797,
"grad_norm": 1.3472402095794678,
"learning_rate": 3.468013468013468e-06,
"loss": 0.1353,
"step": 982
},
{
"epoch": 2.4823232323232323,
"grad_norm": 4.820988655090332,
"learning_rate": 3.451178451178451e-06,
"loss": 0.1156,
"step": 983
},
{
"epoch": 2.484848484848485,
"grad_norm": 3.138719320297241,
"learning_rate": 3.4343434343434347e-06,
"loss": 0.1345,
"step": 984
},
{
"epoch": 2.4873737373737375,
"grad_norm": 1.767815113067627,
"learning_rate": 3.417508417508418e-06,
"loss": 0.0567,
"step": 985
},
{
"epoch": 2.48989898989899,
"grad_norm": 1.7450860738754272,
"learning_rate": 3.400673400673401e-06,
"loss": 0.1188,
"step": 986
},
{
"epoch": 2.492424242424242,
"grad_norm": 1.7766708135604858,
"learning_rate": 3.3838383838383844e-06,
"loss": 0.1241,
"step": 987
},
{
"epoch": 2.494949494949495,
"grad_norm": 4.628079414367676,
"learning_rate": 3.3670033670033675e-06,
"loss": 0.1263,
"step": 988
},
{
"epoch": 2.4974747474747474,
"grad_norm": 1.5541713237762451,
"learning_rate": 3.3501683501683502e-06,
"loss": 0.0608,
"step": 989
},
{
"epoch": 2.5,
"grad_norm": 4.456207752227783,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.1484,
"step": 990
},
{
"epoch": 2.5025252525252526,
"grad_norm": 3.9640469551086426,
"learning_rate": 3.316498316498317e-06,
"loss": 0.1574,
"step": 991
},
{
"epoch": 2.505050505050505,
"grad_norm": 1.5159541368484497,
"learning_rate": 3.2996632996633e-06,
"loss": 0.0687,
"step": 992
},
{
"epoch": 2.507575757575758,
"grad_norm": 2.402961254119873,
"learning_rate": 3.282828282828283e-06,
"loss": 0.1176,
"step": 993
},
{
"epoch": 2.51010101010101,
"grad_norm": 1.6217000484466553,
"learning_rate": 3.2659932659932666e-06,
"loss": 0.1544,
"step": 994
},
{
"epoch": 2.5126262626262625,
"grad_norm": 3.1921989917755127,
"learning_rate": 3.2491582491582497e-06,
"loss": 0.1447,
"step": 995
},
{
"epoch": 2.515151515151515,
"grad_norm": 1.179274082183838,
"learning_rate": 3.232323232323233e-06,
"loss": 0.0994,
"step": 996
},
{
"epoch": 2.5176767676767677,
"grad_norm": 3.9791829586029053,
"learning_rate": 3.2154882154882155e-06,
"loss": 0.1909,
"step": 997
},
{
"epoch": 2.5202020202020203,
"grad_norm": 2.757751941680908,
"learning_rate": 3.1986531986531986e-06,
"loss": 0.105,
"step": 998
},
{
"epoch": 2.5227272727272725,
"grad_norm": 0.8614385724067688,
"learning_rate": 3.181818181818182e-06,
"loss": 0.0791,
"step": 999
},
{
"epoch": 2.525252525252525,
"grad_norm": 0.6211748123168945,
"learning_rate": 3.1649831649831652e-06,
"loss": 0.0379,
"step": 1000
},
{
"epoch": 2.5277777777777777,
"grad_norm": 2.238368272781372,
"learning_rate": 3.1481481481481483e-06,
"loss": 0.1195,
"step": 1001
},
{
"epoch": 2.5303030303030303,
"grad_norm": 2.4499704837799072,
"learning_rate": 3.131313131313132e-06,
"loss": 0.1324,
"step": 1002
},
{
"epoch": 2.532828282828283,
"grad_norm": 3.4274697303771973,
"learning_rate": 3.114478114478115e-06,
"loss": 0.1922,
"step": 1003
},
{
"epoch": 2.5353535353535355,
"grad_norm": 2.302090883255005,
"learning_rate": 3.0976430976430976e-06,
"loss": 0.1323,
"step": 1004
},
{
"epoch": 2.537878787878788,
"grad_norm": 3.9652259349823,
"learning_rate": 3.0808080808080807e-06,
"loss": 0.1251,
"step": 1005
},
{
"epoch": 2.5404040404040407,
"grad_norm": 6.590030670166016,
"learning_rate": 3.0639730639730643e-06,
"loss": 0.0688,
"step": 1006
},
{
"epoch": 2.542929292929293,
"grad_norm": 0.5998873114585876,
"learning_rate": 3.0471380471380474e-06,
"loss": 0.0546,
"step": 1007
},
{
"epoch": 2.5454545454545454,
"grad_norm": 4.4240899085998535,
"learning_rate": 3.0303030303030305e-06,
"loss": 0.1345,
"step": 1008
},
{
"epoch": 2.547979797979798,
"grad_norm": 2.6441352367401123,
"learning_rate": 3.013468013468014e-06,
"loss": 0.0666,
"step": 1009
},
{
"epoch": 2.5505050505050506,
"grad_norm": 1.1558561325073242,
"learning_rate": 2.996632996632997e-06,
"loss": 0.0784,
"step": 1010
},
{
"epoch": 2.5530303030303028,
"grad_norm": 1.2861305475234985,
"learning_rate": 2.97979797979798e-06,
"loss": 0.0839,
"step": 1011
},
{
"epoch": 2.5555555555555554,
"grad_norm": 2.3291330337524414,
"learning_rate": 2.962962962962963e-06,
"loss": 0.0824,
"step": 1012
},
{
"epoch": 2.558080808080808,
"grad_norm": 1.6665867567062378,
"learning_rate": 2.9461279461279464e-06,
"loss": 0.1121,
"step": 1013
},
{
"epoch": 2.5606060606060606,
"grad_norm": 1.4039171934127808,
"learning_rate": 2.9292929292929295e-06,
"loss": 0.0941,
"step": 1014
},
{
"epoch": 2.563131313131313,
"grad_norm": 1.706173062324524,
"learning_rate": 2.9124579124579126e-06,
"loss": 0.1561,
"step": 1015
},
{
"epoch": 2.5656565656565657,
"grad_norm": 1.4657055139541626,
"learning_rate": 2.895622895622896e-06,
"loss": 0.0968,
"step": 1016
},
{
"epoch": 2.5681818181818183,
"grad_norm": 2.3425521850585938,
"learning_rate": 2.8787878787878793e-06,
"loss": 0.0576,
"step": 1017
},
{
"epoch": 2.570707070707071,
"grad_norm": 1.266230821609497,
"learning_rate": 2.861952861952862e-06,
"loss": 0.0754,
"step": 1018
},
{
"epoch": 2.573232323232323,
"grad_norm": 2.496561288833618,
"learning_rate": 2.845117845117845e-06,
"loss": 0.0982,
"step": 1019
},
{
"epoch": 2.5757575757575757,
"grad_norm": 2.888542890548706,
"learning_rate": 2.8282828282828286e-06,
"loss": 0.0865,
"step": 1020
},
{
"epoch": 2.5782828282828283,
"grad_norm": 1.9701051712036133,
"learning_rate": 2.8114478114478117e-06,
"loss": 0.0496,
"step": 1021
},
{
"epoch": 2.580808080808081,
"grad_norm": 5.326476573944092,
"learning_rate": 2.794612794612795e-06,
"loss": 0.1212,
"step": 1022
},
{
"epoch": 2.5833333333333335,
"grad_norm": 3.695080041885376,
"learning_rate": 2.7777777777777783e-06,
"loss": 0.0967,
"step": 1023
},
{
"epoch": 2.5858585858585856,
"grad_norm": 2.2361230850219727,
"learning_rate": 2.7609427609427614e-06,
"loss": 0.0793,
"step": 1024
},
{
"epoch": 2.5883838383838382,
"grad_norm": 1.3065497875213623,
"learning_rate": 2.7441077441077445e-06,
"loss": 0.0676,
"step": 1025
},
{
"epoch": 2.590909090909091,
"grad_norm": 2.1756739616394043,
"learning_rate": 2.7272727272727272e-06,
"loss": 0.1675,
"step": 1026
},
{
"epoch": 2.5934343434343434,
"grad_norm": 2.2035090923309326,
"learning_rate": 2.7104377104377103e-06,
"loss": 0.1765,
"step": 1027
},
{
"epoch": 2.595959595959596,
"grad_norm": 1.7042522430419922,
"learning_rate": 2.693602693602694e-06,
"loss": 0.1223,
"step": 1028
},
{
"epoch": 2.5984848484848486,
"grad_norm": 1.2529280185699463,
"learning_rate": 2.676767676767677e-06,
"loss": 0.0723,
"step": 1029
},
{
"epoch": 2.601010101010101,
"grad_norm": 1.5967926979064941,
"learning_rate": 2.65993265993266e-06,
"loss": 0.1243,
"step": 1030
},
{
"epoch": 2.6035353535353534,
"grad_norm": 1.8551892042160034,
"learning_rate": 2.6430976430976436e-06,
"loss": 0.0677,
"step": 1031
},
{
"epoch": 2.606060606060606,
"grad_norm": 0.9810446500778198,
"learning_rate": 2.6262626262626267e-06,
"loss": 0.0399,
"step": 1032
},
{
"epoch": 2.6085858585858586,
"grad_norm": 4.027339935302734,
"learning_rate": 2.6094276094276094e-06,
"loss": 0.1253,
"step": 1033
},
{
"epoch": 2.611111111111111,
"grad_norm": 1.6822688579559326,
"learning_rate": 2.5925925925925925e-06,
"loss": 0.1235,
"step": 1034
},
{
"epoch": 2.6136363636363638,
"grad_norm": 2.5733704566955566,
"learning_rate": 2.575757575757576e-06,
"loss": 0.094,
"step": 1035
},
{
"epoch": 2.616161616161616,
"grad_norm": 2.587446689605713,
"learning_rate": 2.558922558922559e-06,
"loss": 0.0614,
"step": 1036
},
{
"epoch": 2.6186868686868685,
"grad_norm": 3.116171360015869,
"learning_rate": 2.5420875420875422e-06,
"loss": 0.063,
"step": 1037
},
{
"epoch": 2.621212121212121,
"grad_norm": 4.079165458679199,
"learning_rate": 2.5252525252525258e-06,
"loss": 0.1302,
"step": 1038
},
{
"epoch": 2.6237373737373737,
"grad_norm": 3.22881817817688,
"learning_rate": 2.508417508417509e-06,
"loss": 0.1311,
"step": 1039
},
{
"epoch": 2.6262626262626263,
"grad_norm": 2.3561739921569824,
"learning_rate": 2.491582491582492e-06,
"loss": 0.1138,
"step": 1040
},
{
"epoch": 2.628787878787879,
"grad_norm": 1.6347684860229492,
"learning_rate": 2.474747474747475e-06,
"loss": 0.1246,
"step": 1041
},
{
"epoch": 2.6313131313131315,
"grad_norm": 2.9931626319885254,
"learning_rate": 2.457912457912458e-06,
"loss": 0.1445,
"step": 1042
},
{
"epoch": 2.633838383838384,
"grad_norm": 0.5848364233970642,
"learning_rate": 2.4410774410774413e-06,
"loss": 0.0661,
"step": 1043
},
{
"epoch": 2.6363636363636362,
"grad_norm": 4.181141376495361,
"learning_rate": 2.4242424242424244e-06,
"loss": 0.1234,
"step": 1044
},
{
"epoch": 2.638888888888889,
"grad_norm": 5.948246002197266,
"learning_rate": 2.4074074074074075e-06,
"loss": 0.134,
"step": 1045
},
{
"epoch": 2.6414141414141414,
"grad_norm": 1.8077932596206665,
"learning_rate": 2.3905723905723906e-06,
"loss": 0.1052,
"step": 1046
},
{
"epoch": 2.643939393939394,
"grad_norm": 4.848948955535889,
"learning_rate": 2.373737373737374e-06,
"loss": 0.1963,
"step": 1047
},
{
"epoch": 2.6464646464646466,
"grad_norm": 2.3405141830444336,
"learning_rate": 2.3569023569023572e-06,
"loss": 0.24,
"step": 1048
},
{
"epoch": 2.648989898989899,
"grad_norm": 3.162492036819458,
"learning_rate": 2.3400673400673403e-06,
"loss": 0.0911,
"step": 1049
},
{
"epoch": 2.6515151515151514,
"grad_norm": 4.6703619956970215,
"learning_rate": 2.3232323232323234e-06,
"loss": 0.0713,
"step": 1050
},
{
"epoch": 2.654040404040404,
"grad_norm": 1.252194881439209,
"learning_rate": 2.3063973063973065e-06,
"loss": 0.0678,
"step": 1051
},
{
"epoch": 2.6565656565656566,
"grad_norm": 1.4940955638885498,
"learning_rate": 2.28956228956229e-06,
"loss": 0.0321,
"step": 1052
},
{
"epoch": 2.659090909090909,
"grad_norm": 2.759089469909668,
"learning_rate": 2.2727272727272728e-06,
"loss": 0.0759,
"step": 1053
},
{
"epoch": 2.6616161616161618,
"grad_norm": 4.008279800415039,
"learning_rate": 2.2558922558922563e-06,
"loss": 0.1421,
"step": 1054
},
{
"epoch": 2.6641414141414144,
"grad_norm": 2.280316114425659,
"learning_rate": 2.2390572390572394e-06,
"loss": 0.0971,
"step": 1055
},
{
"epoch": 2.6666666666666665,
"grad_norm": 1.5876095294952393,
"learning_rate": 2.222222222222222e-06,
"loss": 0.0945,
"step": 1056
},
{
"epoch": 2.669191919191919,
"grad_norm": 2.7003700733184814,
"learning_rate": 2.2053872053872056e-06,
"loss": 0.1862,
"step": 1057
},
{
"epoch": 2.6717171717171717,
"grad_norm": 2.837354898452759,
"learning_rate": 2.1885521885521887e-06,
"loss": 0.0816,
"step": 1058
},
{
"epoch": 2.6742424242424243,
"grad_norm": 1.9325331449508667,
"learning_rate": 2.171717171717172e-06,
"loss": 0.09,
"step": 1059
},
{
"epoch": 2.676767676767677,
"grad_norm": 1.9655112028121948,
"learning_rate": 2.154882154882155e-06,
"loss": 0.189,
"step": 1060
},
{
"epoch": 2.679292929292929,
"grad_norm": 0.8985033631324768,
"learning_rate": 2.138047138047138e-06,
"loss": 0.0415,
"step": 1061
},
{
"epoch": 2.6818181818181817,
"grad_norm": 2.287306785583496,
"learning_rate": 2.1212121212121216e-06,
"loss": 0.2154,
"step": 1062
},
{
"epoch": 2.6843434343434343,
"grad_norm": 2.1749632358551025,
"learning_rate": 2.1043771043771047e-06,
"loss": 0.1085,
"step": 1063
},
{
"epoch": 2.686868686868687,
"grad_norm": 3.1133999824523926,
"learning_rate": 2.0875420875420878e-06,
"loss": 0.109,
"step": 1064
},
{
"epoch": 2.6893939393939394,
"grad_norm": 1.5289435386657715,
"learning_rate": 2.070707070707071e-06,
"loss": 0.0488,
"step": 1065
},
{
"epoch": 2.691919191919192,
"grad_norm": 2.7709944248199463,
"learning_rate": 2.053872053872054e-06,
"loss": 0.1032,
"step": 1066
},
{
"epoch": 2.6944444444444446,
"grad_norm": 3.149768114089966,
"learning_rate": 2.037037037037037e-06,
"loss": 0.0486,
"step": 1067
},
{
"epoch": 2.6969696969696972,
"grad_norm": 3.0890722274780273,
"learning_rate": 2.02020202020202e-06,
"loss": 0.1869,
"step": 1068
},
{
"epoch": 2.6994949494949494,
"grad_norm": 4.697057247161865,
"learning_rate": 2.0033670033670037e-06,
"loss": 0.2966,
"step": 1069
},
{
"epoch": 2.702020202020202,
"grad_norm": 3.644277334213257,
"learning_rate": 1.986531986531987e-06,
"loss": 0.0869,
"step": 1070
},
{
"epoch": 2.7045454545454546,
"grad_norm": 1.996146559715271,
"learning_rate": 1.96969696969697e-06,
"loss": 0.1297,
"step": 1071
},
{
"epoch": 2.707070707070707,
"grad_norm": 1.3258694410324097,
"learning_rate": 1.952861952861953e-06,
"loss": 0.0937,
"step": 1072
},
{
"epoch": 2.7095959595959593,
"grad_norm": 2.5805246829986572,
"learning_rate": 1.936026936026936e-06,
"loss": 0.107,
"step": 1073
},
{
"epoch": 2.712121212121212,
"grad_norm": 1.8007394075393677,
"learning_rate": 1.9191919191919192e-06,
"loss": 0.0987,
"step": 1074
},
{
"epoch": 2.7146464646464645,
"grad_norm": 2.052168369293213,
"learning_rate": 1.9023569023569026e-06,
"loss": 0.0753,
"step": 1075
},
{
"epoch": 2.717171717171717,
"grad_norm": 1.795806646347046,
"learning_rate": 1.8855218855218857e-06,
"loss": 0.0898,
"step": 1076
},
{
"epoch": 2.7196969696969697,
"grad_norm": 2.1112513542175293,
"learning_rate": 1.868686868686869e-06,
"loss": 0.0948,
"step": 1077
},
{
"epoch": 2.7222222222222223,
"grad_norm": 1.7274150848388672,
"learning_rate": 1.8518518518518519e-06,
"loss": 0.0699,
"step": 1078
},
{
"epoch": 2.724747474747475,
"grad_norm": 3.7306082248687744,
"learning_rate": 1.8350168350168352e-06,
"loss": 0.138,
"step": 1079
},
{
"epoch": 2.7272727272727275,
"grad_norm": 1.8672465085983276,
"learning_rate": 1.8181818181818183e-06,
"loss": 0.0541,
"step": 1080
},
{
"epoch": 2.7297979797979797,
"grad_norm": 2.303978443145752,
"learning_rate": 1.8013468013468016e-06,
"loss": 0.1123,
"step": 1081
},
{
"epoch": 2.7323232323232323,
"grad_norm": 1.74871027469635,
"learning_rate": 1.7845117845117845e-06,
"loss": 0.1266,
"step": 1082
},
{
"epoch": 2.734848484848485,
"grad_norm": 3.29699969291687,
"learning_rate": 1.7676767676767678e-06,
"loss": 0.1918,
"step": 1083
},
{
"epoch": 2.7373737373737375,
"grad_norm": 2.935121774673462,
"learning_rate": 1.7508417508417511e-06,
"loss": 0.115,
"step": 1084
},
{
"epoch": 2.73989898989899,
"grad_norm": 4.8938140869140625,
"learning_rate": 1.734006734006734e-06,
"loss": 0.1566,
"step": 1085
},
{
"epoch": 2.742424242424242,
"grad_norm": 3.4594430923461914,
"learning_rate": 1.7171717171717173e-06,
"loss": 0.0905,
"step": 1086
},
{
"epoch": 2.744949494949495,
"grad_norm": 2.121217966079712,
"learning_rate": 1.7003367003367005e-06,
"loss": 0.1123,
"step": 1087
},
{
"epoch": 2.7474747474747474,
"grad_norm": 2.414285182952881,
"learning_rate": 1.6835016835016838e-06,
"loss": 0.1652,
"step": 1088
},
{
"epoch": 2.75,
"grad_norm": 3.6288323402404785,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.0463,
"step": 1089
},
{
"epoch": 2.7525252525252526,
"grad_norm": 1.9368160963058472,
"learning_rate": 1.64983164983165e-06,
"loss": 0.0567,
"step": 1090
},
{
"epoch": 2.755050505050505,
"grad_norm": 1.847935438156128,
"learning_rate": 1.6329966329966333e-06,
"loss": 0.0958,
"step": 1091
},
{
"epoch": 2.757575757575758,
"grad_norm": 1.821707010269165,
"learning_rate": 1.6161616161616164e-06,
"loss": 0.0851,
"step": 1092
},
{
"epoch": 2.76010101010101,
"grad_norm": 3.361027240753174,
"learning_rate": 1.5993265993265993e-06,
"loss": 0.0712,
"step": 1093
},
{
"epoch": 2.7626262626262625,
"grad_norm": 1.8871111869812012,
"learning_rate": 1.5824915824915826e-06,
"loss": 0.0637,
"step": 1094
},
{
"epoch": 2.765151515151515,
"grad_norm": 3.3805835247039795,
"learning_rate": 1.565656565656566e-06,
"loss": 0.166,
"step": 1095
},
{
"epoch": 2.7676767676767677,
"grad_norm": 1.451699137687683,
"learning_rate": 1.5488215488215488e-06,
"loss": 0.1075,
"step": 1096
},
{
"epoch": 2.7702020202020203,
"grad_norm": 1.6252110004425049,
"learning_rate": 1.5319865319865321e-06,
"loss": 0.0511,
"step": 1097
},
{
"epoch": 2.7727272727272725,
"grad_norm": 1.8269497156143188,
"learning_rate": 1.5151515151515152e-06,
"loss": 0.0603,
"step": 1098
},
{
"epoch": 2.775252525252525,
"grad_norm": 1.9480081796646118,
"learning_rate": 1.4983164983164986e-06,
"loss": 0.1297,
"step": 1099
},
{
"epoch": 2.7777777777777777,
"grad_norm": 1.0791457891464233,
"learning_rate": 1.4814814814814815e-06,
"loss": 0.127,
"step": 1100
},
{
"epoch": 2.7803030303030303,
"grad_norm": 1.6918015480041504,
"learning_rate": 1.4646464646464648e-06,
"loss": 0.1229,
"step": 1101
},
{
"epoch": 2.782828282828283,
"grad_norm": 1.6666957139968872,
"learning_rate": 1.447811447811448e-06,
"loss": 0.1041,
"step": 1102
},
{
"epoch": 2.7853535353535355,
"grad_norm": 1.4526945352554321,
"learning_rate": 1.430976430976431e-06,
"loss": 0.1327,
"step": 1103
},
{
"epoch": 2.787878787878788,
"grad_norm": 4.764105319976807,
"learning_rate": 1.4141414141414143e-06,
"loss": 0.1007,
"step": 1104
},
{
"epoch": 2.7904040404040407,
"grad_norm": 1.458585262298584,
"learning_rate": 1.3973063973063974e-06,
"loss": 0.0867,
"step": 1105
},
{
"epoch": 2.792929292929293,
"grad_norm": 1.1463141441345215,
"learning_rate": 1.3804713804713807e-06,
"loss": 0.0722,
"step": 1106
},
{
"epoch": 2.7954545454545454,
"grad_norm": 2.6391751766204834,
"learning_rate": 1.3636363636363636e-06,
"loss": 0.0808,
"step": 1107
},
{
"epoch": 2.797979797979798,
"grad_norm": 2.5230796337127686,
"learning_rate": 1.346801346801347e-06,
"loss": 0.1696,
"step": 1108
},
{
"epoch": 2.8005050505050506,
"grad_norm": 2.990051507949829,
"learning_rate": 1.32996632996633e-06,
"loss": 0.1824,
"step": 1109
},
{
"epoch": 2.8030303030303028,
"grad_norm": 5.150264739990234,
"learning_rate": 1.3131313131313134e-06,
"loss": 0.08,
"step": 1110
},
{
"epoch": 2.8055555555555554,
"grad_norm": 2.4451775550842285,
"learning_rate": 1.2962962962962962e-06,
"loss": 0.0639,
"step": 1111
},
{
"epoch": 2.808080808080808,
"grad_norm": 8.441463470458984,
"learning_rate": 1.2794612794612796e-06,
"loss": 0.071,
"step": 1112
},
{
"epoch": 2.8106060606060606,
"grad_norm": 7.7809882164001465,
"learning_rate": 1.2626262626262629e-06,
"loss": 0.0897,
"step": 1113
},
{
"epoch": 2.813131313131313,
"grad_norm": 8.197009086608887,
"learning_rate": 1.245791245791246e-06,
"loss": 0.1037,
"step": 1114
},
{
"epoch": 2.8156565656565657,
"grad_norm": 2.672224283218384,
"learning_rate": 1.228956228956229e-06,
"loss": 0.0338,
"step": 1115
},
{
"epoch": 2.8181818181818183,
"grad_norm": 2.55483078956604,
"learning_rate": 1.2121212121212122e-06,
"loss": 0.0677,
"step": 1116
},
{
"epoch": 2.820707070707071,
"grad_norm": 7.761810779571533,
"learning_rate": 1.1952861952861953e-06,
"loss": 0.0817,
"step": 1117
},
{
"epoch": 2.823232323232323,
"grad_norm": 2.313318967819214,
"learning_rate": 1.1784511784511786e-06,
"loss": 0.1253,
"step": 1118
},
{
"epoch": 2.8257575757575757,
"grad_norm": 0.8076485991477966,
"learning_rate": 1.1616161616161617e-06,
"loss": 0.0363,
"step": 1119
},
{
"epoch": 2.8282828282828283,
"grad_norm": 2.6288771629333496,
"learning_rate": 1.144781144781145e-06,
"loss": 0.1451,
"step": 1120
},
{
"epoch": 2.830808080808081,
"grad_norm": 1.7148422002792358,
"learning_rate": 1.1279461279461281e-06,
"loss": 0.1067,
"step": 1121
},
{
"epoch": 2.8333333333333335,
"grad_norm": 1.2999204397201538,
"learning_rate": 1.111111111111111e-06,
"loss": 0.0544,
"step": 1122
},
{
"epoch": 2.8358585858585856,
"grad_norm": 2.9060170650482178,
"learning_rate": 1.0942760942760944e-06,
"loss": 0.1528,
"step": 1123
},
{
"epoch": 2.8383838383838382,
"grad_norm": 2.594888210296631,
"learning_rate": 1.0774410774410775e-06,
"loss": 0.104,
"step": 1124
},
{
"epoch": 2.840909090909091,
"grad_norm": 7.884887218475342,
"learning_rate": 1.0606060606060608e-06,
"loss": 0.1301,
"step": 1125
},
{
"epoch": 2.8434343434343434,
"grad_norm": 1.9427886009216309,
"learning_rate": 1.0437710437710439e-06,
"loss": 0.2201,
"step": 1126
},
{
"epoch": 2.845959595959596,
"grad_norm": 6.63613748550415,
"learning_rate": 1.026936026936027e-06,
"loss": 0.1505,
"step": 1127
},
{
"epoch": 2.8484848484848486,
"grad_norm": 2.172806739807129,
"learning_rate": 1.01010101010101e-06,
"loss": 0.1074,
"step": 1128
},
{
"epoch": 2.851010101010101,
"grad_norm": 2.2825562953948975,
"learning_rate": 9.932659932659934e-07,
"loss": 0.1252,
"step": 1129
},
{
"epoch": 2.8535353535353534,
"grad_norm": 1.1408872604370117,
"learning_rate": 9.764309764309765e-07,
"loss": 0.1207,
"step": 1130
},
{
"epoch": 2.856060606060606,
"grad_norm": 2.4947509765625,
"learning_rate": 9.595959595959596e-07,
"loss": 0.0963,
"step": 1131
},
{
"epoch": 2.8585858585858586,
"grad_norm": 7.295626640319824,
"learning_rate": 9.427609427609428e-07,
"loss": 0.1011,
"step": 1132
},
{
"epoch": 2.861111111111111,
"grad_norm": 9.468647956848145,
"learning_rate": 9.259259259259259e-07,
"loss": 0.0915,
"step": 1133
},
{
"epoch": 2.8636363636363638,
"grad_norm": 1.7602087259292603,
"learning_rate": 9.090909090909091e-07,
"loss": 0.0556,
"step": 1134
},
{
"epoch": 2.866161616161616,
"grad_norm": 1.6855865716934204,
"learning_rate": 8.922558922558923e-07,
"loss": 0.0916,
"step": 1135
},
{
"epoch": 2.8686868686868685,
"grad_norm": 3.8684542179107666,
"learning_rate": 8.754208754208756e-07,
"loss": 0.0927,
"step": 1136
},
{
"epoch": 2.871212121212121,
"grad_norm": 1.5681943893432617,
"learning_rate": 8.585858585858587e-07,
"loss": 0.0907,
"step": 1137
},
{
"epoch": 2.8737373737373737,
"grad_norm": 2.357790470123291,
"learning_rate": 8.417508417508419e-07,
"loss": 0.0963,
"step": 1138
},
{
"epoch": 2.8762626262626263,
"grad_norm": 2.0638039112091064,
"learning_rate": 8.24915824915825e-07,
"loss": 0.1217,
"step": 1139
},
{
"epoch": 2.878787878787879,
"grad_norm": 7.039210319519043,
"learning_rate": 8.080808080808082e-07,
"loss": 0.1581,
"step": 1140
},
{
"epoch": 2.8813131313131315,
"grad_norm": 2.2965760231018066,
"learning_rate": 7.912457912457913e-07,
"loss": 0.1482,
"step": 1141
},
{
"epoch": 2.883838383838384,
"grad_norm": 1.5618226528167725,
"learning_rate": 7.744107744107744e-07,
"loss": 0.1567,
"step": 1142
},
{
"epoch": 2.8863636363636362,
"grad_norm": 1.2720274925231934,
"learning_rate": 7.575757575757576e-07,
"loss": 0.1048,
"step": 1143
},
{
"epoch": 2.888888888888889,
"grad_norm": 1.6947522163391113,
"learning_rate": 7.407407407407407e-07,
"loss": 0.0891,
"step": 1144
},
{
"epoch": 2.8914141414141414,
"grad_norm": 3.2767159938812256,
"learning_rate": 7.23905723905724e-07,
"loss": 0.2284,
"step": 1145
},
{
"epoch": 2.893939393939394,
"grad_norm": 12.075784683227539,
"learning_rate": 7.070707070707071e-07,
"loss": 0.1004,
"step": 1146
},
{
"epoch": 2.8964646464646466,
"grad_norm": 1.556806206703186,
"learning_rate": 6.902356902356904e-07,
"loss": 0.1137,
"step": 1147
},
{
"epoch": 2.898989898989899,
"grad_norm": 3.214446783065796,
"learning_rate": 6.734006734006735e-07,
"loss": 0.1453,
"step": 1148
},
{
"epoch": 2.9015151515151514,
"grad_norm": 2.274674892425537,
"learning_rate": 6.565656565656567e-07,
"loss": 0.1567,
"step": 1149
},
{
"epoch": 2.904040404040404,
"grad_norm": 4.8869781494140625,
"learning_rate": 6.397306397306398e-07,
"loss": 0.2099,
"step": 1150
},
{
"epoch": 2.9065656565656566,
"grad_norm": 4.9651923179626465,
"learning_rate": 6.22895622895623e-07,
"loss": 0.1808,
"step": 1151
},
{
"epoch": 2.909090909090909,
"grad_norm": 4.156426906585693,
"learning_rate": 6.060606060606061e-07,
"loss": 0.0797,
"step": 1152
},
{
"epoch": 2.9116161616161618,
"grad_norm": 2.8879013061523438,
"learning_rate": 5.892255892255893e-07,
"loss": 0.1232,
"step": 1153
},
{
"epoch": 2.9141414141414144,
"grad_norm": 1.8005517721176147,
"learning_rate": 5.723905723905725e-07,
"loss": 0.112,
"step": 1154
},
{
"epoch": 2.9166666666666665,
"grad_norm": 3.8166842460632324,
"learning_rate": 5.555555555555555e-07,
"loss": 0.0776,
"step": 1155
},
{
"epoch": 2.919191919191919,
"grad_norm": 4.17734432220459,
"learning_rate": 5.387205387205387e-07,
"loss": 0.2496,
"step": 1156
},
{
"epoch": 2.9217171717171717,
"grad_norm": 2.027888536453247,
"learning_rate": 5.218855218855219e-07,
"loss": 0.1184,
"step": 1157
},
{
"epoch": 2.9242424242424243,
"grad_norm": 0.865708589553833,
"learning_rate": 5.05050505050505e-07,
"loss": 0.0604,
"step": 1158
},
{
"epoch": 2.926767676767677,
"grad_norm": 1.5890415906906128,
"learning_rate": 4.882154882154883e-07,
"loss": 0.1305,
"step": 1159
},
{
"epoch": 2.929292929292929,
"grad_norm": 1.054485559463501,
"learning_rate": 4.713804713804714e-07,
"loss": 0.077,
"step": 1160
},
{
"epoch": 2.9318181818181817,
"grad_norm": 1.1664531230926514,
"learning_rate": 4.5454545454545457e-07,
"loss": 0.065,
"step": 1161
},
{
"epoch": 2.9343434343434343,
"grad_norm": 1.196090579032898,
"learning_rate": 4.377104377104378e-07,
"loss": 0.0854,
"step": 1162
},
{
"epoch": 2.936868686868687,
"grad_norm": 1.983268141746521,
"learning_rate": 4.2087542087542094e-07,
"loss": 0.1021,
"step": 1163
},
{
"epoch": 2.9393939393939394,
"grad_norm": 5.308765888214111,
"learning_rate": 4.040404040404041e-07,
"loss": 0.1535,
"step": 1164
},
{
"epoch": 2.941919191919192,
"grad_norm": 3.1391713619232178,
"learning_rate": 3.872053872053872e-07,
"loss": 0.1295,
"step": 1165
},
{
"epoch": 2.9444444444444446,
"grad_norm": 1.9112738370895386,
"learning_rate": 3.7037037037037036e-07,
"loss": 0.1338,
"step": 1166
},
{
"epoch": 2.9469696969696972,
"grad_norm": 1.7345768213272095,
"learning_rate": 3.535353535353536e-07,
"loss": 0.1048,
"step": 1167
},
{
"epoch": 2.9494949494949494,
"grad_norm": 1.8400707244873047,
"learning_rate": 3.3670033670033673e-07,
"loss": 0.1345,
"step": 1168
},
{
"epoch": 2.952020202020202,
"grad_norm": 5.5112152099609375,
"learning_rate": 3.198653198653199e-07,
"loss": 0.0901,
"step": 1169
},
{
"epoch": 2.9545454545454546,
"grad_norm": 1.7662899494171143,
"learning_rate": 3.0303030303030305e-07,
"loss": 0.1025,
"step": 1170
},
{
"epoch": 2.957070707070707,
"grad_norm": 5.364653587341309,
"learning_rate": 2.8619528619528626e-07,
"loss": 0.098,
"step": 1171
},
{
"epoch": 2.9595959595959593,
"grad_norm": 1.2001750469207764,
"learning_rate": 2.6936026936026936e-07,
"loss": 0.049,
"step": 1172
},
{
"epoch": 2.962121212121212,
"grad_norm": 2.842573642730713,
"learning_rate": 2.525252525252525e-07,
"loss": 0.0885,
"step": 1173
},
{
"epoch": 2.9646464646464645,
"grad_norm": 1.9140822887420654,
"learning_rate": 2.356902356902357e-07,
"loss": 0.1336,
"step": 1174
},
{
"epoch": 2.967171717171717,
"grad_norm": 1.2715041637420654,
"learning_rate": 2.188552188552189e-07,
"loss": 0.044,
"step": 1175
},
{
"epoch": 2.9696969696969697,
"grad_norm": 1.805606722831726,
"learning_rate": 2.0202020202020205e-07,
"loss": 0.1139,
"step": 1176
},
{
"epoch": 2.9722222222222223,
"grad_norm": 0.7524275183677673,
"learning_rate": 1.8518518518518518e-07,
"loss": 0.038,
"step": 1177
},
{
"epoch": 2.974747474747475,
"grad_norm": 1.4970057010650635,
"learning_rate": 1.6835016835016837e-07,
"loss": 0.1246,
"step": 1178
},
{
"epoch": 2.9772727272727275,
"grad_norm": 2.653041124343872,
"learning_rate": 1.5151515151515152e-07,
"loss": 0.1941,
"step": 1179
},
{
"epoch": 2.9797979797979797,
"grad_norm": 2.8758771419525146,
"learning_rate": 1.3468013468013468e-07,
"loss": 0.1387,
"step": 1180
},
{
"epoch": 2.9823232323232323,
"grad_norm": 4.085249423980713,
"learning_rate": 1.1784511784511785e-07,
"loss": 0.0822,
"step": 1181
},
{
"epoch": 2.984848484848485,
"grad_norm": 2.2607507705688477,
"learning_rate": 1.0101010101010103e-07,
"loss": 0.1064,
"step": 1182
},
{
"epoch": 2.9873737373737375,
"grad_norm": 2.853379726409912,
"learning_rate": 8.417508417508418e-08,
"loss": 0.0615,
"step": 1183
},
{
"epoch": 2.98989898989899,
"grad_norm": 3.8462393283843994,
"learning_rate": 6.734006734006734e-08,
"loss": 0.1311,
"step": 1184
},
{
"epoch": 2.992424242424242,
"grad_norm": 4.459750652313232,
"learning_rate": 5.050505050505051e-08,
"loss": 0.2523,
"step": 1185
},
{
"epoch": 2.994949494949495,
"grad_norm": 2.9024791717529297,
"learning_rate": 3.367003367003367e-08,
"loss": 0.0775,
"step": 1186
},
{
"epoch": 2.9974747474747474,
"grad_norm": 2.9558804035186768,
"learning_rate": 1.6835016835016835e-08,
"loss": 0.1257,
"step": 1187
},
{
"epoch": 3.0,
"grad_norm": 2.027782678604126,
"learning_rate": 0.0,
"loss": 0.1071,
"step": 1188
},
{
"epoch": 3.0,
"eval_accuracy": 0.775,
"eval_f1": 0.9063876651982378,
"eval_loss": 0.13235561549663544,
"eval_runtime": 45.6825,
"eval_samples_per_second": 19.263,
"eval_steps_per_second": 0.416,
"step": 1188
}
],
"logging_steps": 1,
"max_steps": 1188,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 251588081479680.0,
"train_batch_size": 20,
"trial_name": null,
"trial_params": null
}